Diffstat (limited to 'thirdparty')
705 files changed, 145928 insertions, 28545 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 3803e87fea..03a2ddf5e4 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -8,13 +8,12 @@ readability.
 ## basis_universal
 
 - Upstream: https://github.com/BinomialLLC/basis_universal
-- Version: git (895ee8ee7e04f22267f8d16d46de04d5a01d63ac, 2020)
+- Version: git (ba1c3e40f1d434ebaf9a167b44e9b11d2bf0f765, 2021)
 - License: Apache 2.0
 
 Files extracted from upstream source:
 
-- `.cpp` and `.h` files in root folder except for `basisu_tool.cpp` (contains `main` and can cause link error)
-- `.cpp`, `.h` and `.inc` files in `transcoder/`, keeping folder structure
+- `encoder/` and `transcoder/` folders
 - `LICENSE`
@@ -62,6 +61,26 @@ Files extracted from upstream source:
 
 Extracted from .zip provided. Extracted license and header only.
 
+## embree
+
+- Upstream: https://github.com/embree/embree
+- Version: 3.13.0 (7c53133eb21424f7f0ae1e25bf357e358feaf6ab, 2021)
+- License: Apache 2.0
+
+Files extracted from upstream:
+
+- All cpp files listed in `modules/raycast/godot_update_embree.py`
+- All header files in the directories listed in `modules/raycast/godot_update_embree.py`
+
+The `modules/raycast/godot_update_embree.py` script can be used to pull the
+relevant files from the latest Embree release and apply some automatic changes.
+
+Some changes have been made in order to remove exceptions and fix minor build errors.
+They are marked with `// -- GODOT start --` and `// -- GODOT end --`
+comments. Apply the patches in the `patches/` folder when syncing with newer upstream
+commits.
+
+
 ## enet
 
 - Upstream: http://enet.bespin.org
@@ -86,20 +105,20 @@ It is still possible to build against a system wide ENet but doing so
 will limit its functionality to IPv4 only.
 
-## etc2comp
+## etcpak
 
-- Upstream: https://github.com/google/etc2comp
-- Version: git (9cd0f9cae0f32338943699bb418107db61bb66f2, 2017)
-- License: Apache 2.0
+- Upstream: https://github.com/wolfpld/etcpak
+- Version: git (f27daea656ff77671580f838a889e33049430ebd, 2021)
+- License: BSD-3-Clause
 
 Files extracted from upstream source:
 
-- all .cpp and .h files in EtcLib/
-- README.md, LICENSE, AUTHORS
-
-Important: Some files have Godot-made changes.
-They are marked with `// -- GODOT start --` and `// -- GODOT end --`
-comments.
+- Only the files relevant for compression (i.e. `Process*.cpp` and their deps):
+  ```
+  Dither.{cpp,hpp} ForceInline.hpp Math.hpp ProcessCommon.hpp ProcessRGB.{cpp,hpp}
+  ProcessDxtc.{cpp,hpp} Tables.{cpp,hpp} Vector.hpp
+  ```
+- `AUTHORS.txt` and `LICENSE.txt`
 
 
 ## fonts
@@ -118,6 +137,10 @@ comments.
   * Upstream: https://android.googlesource.com/platform/frameworks/base/+/master/data/fonts/
   * Version: ? (pre-2014 commit when DroidSansJapanese.ttf was obsoleted)
   * License: Apache 2.0
+- `OpenSans_SemiBold.ttf`:
+  * Upstream: https://fonts.google.com/specimen/Open+Sans
+  * Version: 1.10 (downloaded from Google Fonts in February 2021)
+  * License: Apache 2.0
 - `Tamsyn*.png`:
   * Upstream: http://www.fial.com/~scott/tamsyn-font/
   * Version: 1.11 (2015)
@@ -174,7 +197,7 @@ Files extracted from upstream source:
 ## harfbuzz
 
 - Upstream: https://github.com/harfbuzz/harfbuzz
-- Version: 2.7.4 (7236c7e29cef1c2d76c7a284c5081ff4d3aa1127, 2020)
+- Version: 2.8.0 (03538e872a0610a65fad692b33d3646f387cf578, 2021)
 - License: MIT
 
 Files extracted from upstream source:
@@ -186,17 +209,17 @@ Files extracted from upstream source:
 ## icu4c
 
 - Upstream: https://github.com/unicode-org/icu
-- Version: 68.2 (84e1f26ea77152936e70d53178a816dbfbf69989, 2020)
+- Version: 69.1 (0e7b4428866f3133b4abba2d932ee3faa708db1d, 2021)
 - License: Unicode
 
 Files extracted from upstream source:
 
 - the `common` folder
-- `APIChangeReport.md`, `LICENSE`
+- `LICENSE`
 
 Files generated from upstream source:
 
-- the `icudt68l.dat` built with the provided `godot_data.json` config file (see
+- the `icudt69l.dat` built with the provided `godot_data.json` config file (see
   https://github.com/unicode-org/icu/blob/master/docs/userguide/icu_data/buildtool.md
   for instructions)
@@ -324,7 +347,7 @@ changes are marked with `// -- GODOT --` comments.
 ## mbedtls
 
 - Upstream: https://tls.mbed.org/
-- Version: 2.16.9 (3fac0bae4a50113989b3d015cd2d948f51a6d9ac, 2020)
+- Version: 2.16.10 (d61fa61bef06b64132e3490543c81b8ee40fbee3, 2021)
 - License: Apache 2.0
 
 File extracted from upstream release tarball:
@@ -344,7 +367,7 @@ File extracted from upstream release tarball:
 ## meshoptimizer
 
 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: git (e3f53f66e7a35b9b8764bee478589d79e34fa698, 2021)
+- Version: git (f5d83e879c48f8664783a69b4f50711d27549b66, 2021)
 - License: MIT
 
 Files extracted from upstream repository:
@@ -352,21 +375,29 @@ Files extracted from upstream repository:
 
 - All files in `src/`.
 - `LICENSE.md`.
 
+An [experimental upstream feature](https://github.com/zeux/meshoptimizer/tree/simplify-attr)
+has been backported; see the patch in the `patches` directory.
+
 ## miniupnpc
 
-- Upstream: https://github.com/miniupnp/miniupnp/tree/master/miniupnpc
-- Version: git (44366328661826603982d1e0d7ebb4062c5f2bfc, 2020)
+- Upstream: https://github.com/miniupnp/miniupnp
+- Version: 2.2.2 (81029a860baf1f727903e5b85307903b3f40cbc8, 2021)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
 
 - All `*.c` and `*.h` files from `miniupnpc` to `thirdparty/miniupnpc/miniupnpc`
-- Remove `test*`, `minihttptestserver.c` and `wingenminiupnpcstrings.c`
+- Remove the following test or sample files:
+  `listdevices.c minihttptestserver.c miniupnpcmodule.c upnpc.c upnperrors.* test* wingenminiupnpcstrings.c`
+- `LICENSE`
 
-The patch `windows_fix.diff` is applied to `minissdpc.c` to fix an upstream issue.
-The only modified file is miniupnpcstrings.h, which was created for Godot
-(it is usually autogenerated by cmake).
+The only modified file is `miniupnpcstrings.h`, which was created for Godot
+(it is usually autogenerated by cmake). Bump the version number for miniupnpc in that
+file when upgrading.
+
+Note: The following upstream patch has been applied; remove this notice on the next update.
+https://github.com/miniupnp/miniupnp/commit/3a08dd4b89af2e9effa22a136bac86f2f306fd79
 
 ## minizip
@@ -438,6 +469,10 @@ Collection of single-file libraries used in Godot components.
   * Version: git (2f625846a775501fb69456567409a8b12f10ea25, 2012)
   * License: BSD-3-Clause
   * Modifications: use `const char*` instead of `char*` for input string
+- `smolv.h`
+  * Upstream: https://github.com/aras-p/smol-v
+  * Version: git (4b52c165c13763051a18e80ffbc2ee436314ceb2, 2020)
+  * License: Public Domain or MIT
 - `stb_rect_pack.h`
   * Upstream: https://github.com/nothings/stb
   * Version: 1.00 (2bb4a0accd4003c1db4c24533981e01b1adfd656, 2019)
@@ -700,3 +735,4 @@ Files extracted from upstream source:
 
 - lib/{common/,compress/,decompress/,zstd.h}
 - LICENSE
+
diff --git a/thirdparty/basis_universal/basisu_comp.cpp b/thirdparty/basis_universal/basisu_comp.cpp
deleted file mode 100644
index 1e4679311c..0000000000
--- a/thirdparty/basis_universal/basisu_comp.cpp
+++ /dev/null
@@ -1,1206 +0,0 @@
-// basisu_comp.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "basisu_comp.h"
-#include "basisu_enc.h"
-#include <unordered_set>
-
-#define BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN 0
-#define DEBUG_CROP_TEXTURE_TO_64x64 (0)
-#define DEBUG_RESIZE_TEXTURE (0)
-#define DEBUG_EXTRACT_SINGLE_BLOCK (0)
-
-namespace basisu
-{
-	basis_compressor::basis_compressor() :
-		m_total_blocks(0),
-		m_auto_global_sel_pal(false),
-		m_basis_file_size(0),
-		m_basis_bits_per_texel(0),
-		m_any_source_image_has_alpha(false)
-	{
-		debug_printf("basis_compressor::basis_compressor\n");
-	}
-
-	bool basis_compressor::init(const basis_compressor_params &params)
-	{
-		debug_printf("basis_compressor::init\n");
-
-		m_params = params;
-
-		if (m_params.m_debug)
-		{
-			debug_printf("basis_compressor::init:\n");
-
-#define PRINT_BOOL_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<int>(m_params.v), m_params.v.was_changed());
-#define PRINT_INT_VALUE(v) debug_printf("%s: %i %u\n", BASISU_STRINGIZE2(v), static_cast<int>(m_params.v), m_params.v.was_changed());
-#define PRINT_UINT_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<uint32_t>(m_params.v), m_params.v.was_changed());
-#define PRINT_FLOAT_VALUE(v) debug_printf("%s: %f %u\n", BASISU_STRINGIZE2(v), static_cast<float>(m_params.v), m_params.v.was_changed());
-
-			debug_printf("Has global selector codebook: %i\n", m_params.m_pSel_codebook != nullptr);
-
-			debug_printf("Source images: %u, source filenames: %u, source alpha filenames: %i\n",
-				(uint32_t)m_params.m_source_images.size(), (uint32_t)m_params.m_source_filenames.size(), (uint32_t)m_params.m_source_alpha_filenames.size());
-
-			PRINT_BOOL_VALUE(m_y_flip);
-			PRINT_BOOL_VALUE(m_debug);
-			PRINT_BOOL_VALUE(m_debug_images);
-			PRINT_BOOL_VALUE(m_global_sel_pal);
-			PRINT_BOOL_VALUE(m_auto_global_sel_pal);
-			PRINT_BOOL_VALUE(m_compression_level);
-			PRINT_BOOL_VALUE(m_no_hybrid_sel_cb);
-			PRINT_BOOL_VALUE(m_perceptual);
-			PRINT_BOOL_VALUE(m_no_endpoint_rdo);
-			PRINT_BOOL_VALUE(m_no_selector_rdo);
-			PRINT_BOOL_VALUE(m_read_source_images);
-			PRINT_BOOL_VALUE(m_write_output_basis_files);
-			PRINT_BOOL_VALUE(m_compute_stats);
- PRINT_BOOL_VALUE(m_check_for_alpha) - PRINT_BOOL_VALUE(m_force_alpha) - PRINT_BOOL_VALUE(m_seperate_rg_to_color_alpha); - PRINT_BOOL_VALUE(m_multithreading); - PRINT_BOOL_VALUE(m_disable_hierarchical_endpoint_codebooks); - - PRINT_FLOAT_VALUE(m_hybrid_sel_cb_quality_thresh); - - PRINT_INT_VALUE(m_global_pal_bits); - PRINT_INT_VALUE(m_global_mod_bits); - - PRINT_FLOAT_VALUE(m_endpoint_rdo_thresh); - PRINT_FLOAT_VALUE(m_selector_rdo_thresh); - - PRINT_BOOL_VALUE(m_mip_gen); - PRINT_BOOL_VALUE(m_mip_renormalize); - PRINT_BOOL_VALUE(m_mip_wrapping); - PRINT_BOOL_VALUE(m_mip_srgb); - PRINT_FLOAT_VALUE(m_mip_premultiplied); - PRINT_FLOAT_VALUE(m_mip_scale); - PRINT_INT_VALUE(m_mip_smallest_dimension); - debug_printf("m_mip_filter: %s\n", m_params.m_mip_filter.c_str()); - - debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_max_endpoint_clusters); - debug_printf("m_max_selector_clusters: %u\n", m_params.m_max_selector_clusters); - debug_printf("m_quality_level: %i\n", m_params.m_quality_level); - - debug_printf("m_tex_type: %u\n", m_params.m_tex_type); - debug_printf("m_userdata0: 0x%X, m_userdata1: 0x%X\n", m_params.m_userdata0, m_params.m_userdata1); - debug_printf("m_us_per_frame: %i (%f fps)\n", m_params.m_us_per_frame, m_params.m_us_per_frame ? 1.0f / (m_params.m_us_per_frame / 1000000.0f) : 0); - -#undef PRINT_BOOL_VALUE -#undef PRINT_INT_VALUE -#undef PRINT_UINT_VALUE -#undef PRINT_FLOAT_VALUE - } - - if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size())) - { - assert(0); - return false; - } - - return true; - } - - basis_compressor::error_code basis_compressor::process() - { - debug_printf("basis_compressor::process\n"); - - if (!read_source_images()) - return cECFailedReadingSourceImages; - - if (!validate_texture_type_constraints()) - return cECFailedValidating; - - if (!process_frontend()) - return cECFailedFrontEnd; - - if (!extract_frontend_texture_data()) - return cECFailedFontendExtract; - - if (!process_backend()) - return cECFailedBackend; - - if (!create_basis_file_and_transcode()) - return cECFailedCreateBasisFile; - - if (!write_output_files_and_compute_stats()) - return cECFailedWritingOutput; - - return cECSuccess; - } - - bool basis_compressor::generate_mipmaps(const image &img, std::vector<image> &mips, bool has_alpha) - { - debug_printf("basis_compressor::generate_mipmaps\n"); - - uint32_t total_levels = 1; - uint32_t w = img.get_width(), h = img.get_height(); - while (maximum<uint32_t>(w, h) > (uint32_t)m_params.m_mip_smallest_dimension) - { - w = maximum(w >> 1U, 1U); - h = maximum(h >> 1U, 1U); - total_levels++; - } - -#if BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN - // Requires stb_image_resize - stbir_filter filter = STBIR_FILTER_DEFAULT; - if (m_params.m_mip_filter == "box") - filter = STBIR_FILTER_BOX; - else if (m_params.m_mip_filter == "triangle") - filter = STBIR_FILTER_TRIANGLE; - else if (m_params.m_mip_filter == "cubic") - filter = STBIR_FILTER_CUBICBSPLINE; - else if (m_params.m_mip_filter == "catmull") - filter = STBIR_FILTER_CATMULLROM; - else if (m_params.m_mip_filter == "mitchell") - filter = STBIR_FILTER_MITCHELL; - - for (uint32_t level = 1; level < total_levels; level++) - { - const uint32_t level_width = maximum<uint32_t>(1, img.get_width() >> level); - const uint32_t level_height = maximum<uint32_t>(1, img.get_height() >> level); - - image &level_img = *enlarge_vector(mips, 1); - level_img.resize(level_width, level_height); - - int result = stbir_resize_uint8_generic( - (const uint8_t *)img.get_ptr(), img.get_width(), 
img.get_height(), img.get_pitch() * sizeof(color_rgba), - (uint8_t *)level_img.get_ptr(), level_img.get_width(), level_img.get_height(), level_img.get_pitch() * sizeof(color_rgba), - has_alpha ? 4 : 3, has_alpha ? 3 : STBIR_ALPHA_CHANNEL_NONE, m_params.m_mip_premultiplied ? STBIR_FLAG_ALPHA_PREMULTIPLIED : 0, - m_params.m_mip_wrapping ? STBIR_EDGE_WRAP : STBIR_EDGE_CLAMP, filter, m_params.m_mip_srgb ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, - nullptr); - - if (result == 0) - { - error_printf("basis_compressor::generate_mipmaps: stbir_resize_uint8_generic() failed!\n"); - return false; - } - - if (m_params.m_mip_renormalize) - level_img.renormalize_normal_map(); - } -#else - for (uint32_t level = 1; level < total_levels; level++) - { - const uint32_t level_width = maximum<uint32_t>(1, img.get_width() >> level); - const uint32_t level_height = maximum<uint32_t>(1, img.get_height() >> level); - - image &level_img = *enlarge_vector(mips, 1); - level_img.resize(level_width, level_height); - - bool status = image_resample(img, level_img, m_params.m_mip_srgb, m_params.m_mip_filter.c_str(), m_params.m_mip_scale, m_params.m_mip_wrapping, 0, has_alpha ? 4 : 3); - if (!status) - { - error_printf("basis_compressor::generate_mipmaps: image_resample() failed!\n"); - return false; - } - - if (m_params.m_mip_renormalize) - level_img.renormalize_normal_map(); - } -#endif - - return true; - } - - bool basis_compressor::read_source_images() - { - debug_printf("basis_compressor::read_source_images\n"); - - const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : (uint32_t)m_params.m_source_images.size(); - if (!total_source_files) - return false; - - m_stats.resize(0); - m_slice_descs.resize(0); - m_slice_images.resize(0); - - m_total_blocks = 0; - uint32_t total_macroblocks = 0; - - m_any_source_image_has_alpha = false; - - std::vector<image> source_images; - std::vector<std::string> source_filenames; - - // First load all source images, and determine if any have an alpha channel. - for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) - { - const char *pSource_filename = ""; - - image file_image; - - if (m_params.m_read_source_images) - { - pSource_filename = m_params.m_source_filenames[source_file_index].c_str(); - - // Load the source image - if (!load_png(pSource_filename, file_image)) - { - error_printf("Failed reading source image: %s\n", pSource_filename); - return false; - } - - printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height()); - - // Optionally load another image and put a grayscale version of it into the alpha channel. 
- if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size())) - { - const char *pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str(); - - image alpha_data; - - if (!load_png(pSource_alpha_image, alpha_data)) - { - error_printf("Failed reading source image: %s\n", pSource_alpha_image); - return false; - } - - printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height()); - - alpha_data.crop(file_image.get_width(), file_image.get_height()); - - for (uint32_t y = 0; y < file_image.get_height(); y++) - for (uint32_t x = 0; x < file_image.get_width(); x++) - file_image(x, y).a = (uint8_t)alpha_data(x, y).get_709_luma(); - } - } - else - { - file_image = m_params.m_source_images[source_file_index]; - } - - if (m_params.m_seperate_rg_to_color_alpha) - { - // Used for XY normal maps in RG - puts X in color, Y in alpha - for (uint32_t y = 0; y < file_image.get_height(); y++) - for (uint32_t x = 0; x < file_image.get_width(); x++) - { - const color_rgba &c = file_image(x, y); - file_image(x, y).set_noclamp_rgba(c.r, c.r, c.r, c.g); - } - } - - bool has_alpha = false; - if ((m_params.m_force_alpha) || (m_params.m_seperate_rg_to_color_alpha)) - has_alpha = true; - else if (!m_params.m_check_for_alpha) - file_image.set_alpha(255); - else if (file_image.has_alpha()) - has_alpha = true; - - if (has_alpha) - m_any_source_image_has_alpha = true; - - debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", source_file_index, pSource_filename, file_image.get_width(), file_image.get_height(), has_alpha); - - if (m_params.m_y_flip) - file_image.flip_y(); - -#if DEBUG_EXTRACT_SINGLE_BLOCK - image block_image(4, 4); - const uint32_t block_x = 0; - const uint32_t block_y = 0; - block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0); - file_image = block_image; -#endif - -#if DEBUG_CROP_TEXTURE_TO_64x64 - file_image.resize(64, 64); -#endif -#if DEBUG_RESIZE_TEXTURE - image temp_img((file_image.get_width() + 1) / 2, (file_image.get_height() + 1) / 2); - image_resample(file_image, temp_img, m_params.m_perceptual, "kaiser"); - temp_img.swap(file_image); -#endif - - if ((!file_image.get_width()) || (!file_image.get_height())) - { - error_printf("basis_compressor::read_source_images: Source image has a zero width and/or height!\n"); - return false; - } - - if ((file_image.get_width() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (file_image.get_height() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION)) - { - error_printf("basis_compressor::read_source_images: Source image is too large!\n"); - return false; - } - - source_images.push_back(file_image); - source_filenames.push_back(pSource_filename); - } - - debug_printf("Any source image has alpha: %u\n", m_any_source_image_has_alpha); - - for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) - { - image &file_image = source_images[source_file_index]; - const std::string &source_filename = source_filenames[source_file_index]; - - std::vector<image> slices; - - slices.reserve(32); - slices.push_back(file_image); - - if (m_params.m_mip_gen) - { - if (!generate_mipmaps(file_image, slices, m_any_source_image_has_alpha)) - return false; - } - - uint_vec mip_indices(slices.size()); - for (uint32_t i = 0; i < slices.size(); i++) - mip_indices[i] = i; - - if (m_any_source_image_has_alpha) - { - // If source has alpha, then even mips will have RGB, and odd 
mips will have alpha in RGB. - std::vector<image> alpha_slices; - uint_vec new_mip_indices; - - alpha_slices.reserve(slices.size() * 2); - - for (uint32_t i = 0; i < slices.size(); i++) - { - image lvl_rgb(slices[i]); - image lvl_a(lvl_rgb); - - for (uint32_t y = 0; y < lvl_a.get_height(); y++) - { - for (uint32_t x = 0; x < lvl_a.get_width(); x++) - { - uint8_t a = lvl_a(x, y).a; - lvl_a(x, y).set_noclamp_rgba(a, a, a, 255); - } - } - - lvl_rgb.set_alpha(255); - - alpha_slices.push_back(lvl_rgb); - new_mip_indices.push_back(i); - - alpha_slices.push_back(lvl_a); - new_mip_indices.push_back(i); - } - - slices.swap(alpha_slices); - mip_indices.swap(new_mip_indices); - } - - assert(slices.size() == mip_indices.size()); - - for (uint32_t slice_index = 0; slice_index < slices.size(); slice_index++) - { - const bool is_alpha_slice = m_any_source_image_has_alpha && ((slice_index & 1) != 0); - - image &slice_image = slices[slice_index]; - const uint32_t orig_width = slice_image.get_width(); - const uint32_t orig_height = slice_image.get_height(); - - // Enlarge the source image to 4x4 block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks. - slice_image.crop_dup_borders(slice_image.get_block_width(4) * 4, slice_image.get_block_height(4) * 4); - - if (m_params.m_debug_images) - { - save_png(string_format("basis_debug_source_image_%u_%u.png", source_file_index, slice_index).c_str(), slice_image); - } - - enlarge_vector(m_stats, 1); - enlarge_vector(m_slice_images, 1); - enlarge_vector(m_slice_descs, 1); - - const uint32_t dest_image_index = (uint32_t)m_stats.size() - 1; - - m_stats[dest_image_index].m_filename = source_filename.c_str(); - m_stats[dest_image_index].m_width = orig_width; - m_stats[dest_image_index].m_height = orig_height; - - m_slice_images[dest_image_index] = slice_image; - - debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), orig_width, orig_height, slice_image.get_width(), slice_image.get_height()); - - basisu_backend_slice_desc &slice_desc = m_slice_descs[dest_image_index]; - - slice_desc.m_first_block_index = m_total_blocks; - - slice_desc.m_orig_width = orig_width; - slice_desc.m_orig_height = orig_height; - - slice_desc.m_width = slice_image.get_width(); - slice_desc.m_height = slice_image.get_height(); - - slice_desc.m_num_blocks_x = slice_image.get_block_width(4); - slice_desc.m_num_blocks_y = slice_image.get_block_height(4); - - slice_desc.m_num_macroblocks_x = (slice_desc.m_num_blocks_x + 1) >> 1; - slice_desc.m_num_macroblocks_y = (slice_desc.m_num_blocks_y + 1) >> 1; - - slice_desc.m_source_file_index = source_file_index; - - slice_desc.m_mip_index = mip_indices[slice_index]; - - slice_desc.m_alpha = is_alpha_slice; - slice_desc.m_iframe = false; - if (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) - { - slice_desc.m_iframe = (source_file_index == 0); - } - - m_total_blocks += slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; - total_macroblocks += slice_desc.m_num_macroblocks_x * slice_desc.m_num_macroblocks_y; - - } // slice_index - - } // source_file_index - - debug_printf("Total blocks: %u, Total macroblocks: %u\n", m_total_blocks, total_macroblocks); - - // Make sure we don't have too many slices - if (m_slice_descs.size() > BASISU_MAX_SLICES) - { - error_printf("Too many slices!\n"); - return false; - } - - // Basic sanity check on the slices - for (uint32_t 
i = 1; i < m_slice_descs.size(); i++) - { - const basisu_backend_slice_desc &prev_slice_desc = m_slice_descs[i - 1]; - const basisu_backend_slice_desc &slice_desc = m_slice_descs[i]; - - // Make sure images are in order - int image_delta = (int)slice_desc.m_source_file_index - (int)prev_slice_desc.m_source_file_index; - if (image_delta > 1) - return false; - - // Make sure mipmap levels are in order - if (!image_delta) - { - int level_delta = (int)slice_desc.m_mip_index - (int)prev_slice_desc.m_mip_index; - if (level_delta > 1) - return false; - } - } - - printf("Total basis file slices: %u\n", (uint32_t)m_slice_descs.size()); - - for (uint32_t i = 0; i < m_slice_descs.size(); i++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[i]; - - printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u, iframe: %u\n", - i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe); - - if (m_any_source_image_has_alpha) - { - // Alpha slices must be at odd slice indices - if (slice_desc.m_alpha) - { - if ((i & 1) == 0) - return false; - - const basisu_backend_slice_desc &prev_slice_desc = m_slice_descs[i - 1]; - - // Make sure previous slice has this image's color data - if (prev_slice_desc.m_source_file_index != slice_desc.m_source_file_index) - return false; - if (prev_slice_desc.m_alpha) - return false; - if (prev_slice_desc.m_mip_index != slice_desc.m_mip_index) - return false; - if (prev_slice_desc.m_num_blocks_x != slice_desc.m_num_blocks_x) - return false; - if (prev_slice_desc.m_num_blocks_y != slice_desc.m_num_blocks_y) - return false; - } - else if (i & 1) - return false; - } - else if (slice_desc.m_alpha) - { - return false; - } - - if ((slice_desc.m_orig_width > slice_desc.m_width) || (slice_desc.m_orig_height > slice_desc.m_height)) - return false; - if ((slice_desc.m_source_file_index == 0) && (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)) - { - if (!slice_desc.m_iframe) - return false; - } - } - - return true; - } - - // Do some basic validation for 2D arrays, cubemaps, video, and volumes. - bool basis_compressor::validate_texture_type_constraints() - { - debug_printf("basis_compressor::validate_texture_type_constraints\n"); - - // In 2D mode anything goes (each image may have a different resolution and # of mipmap levels). - if (m_params.m_tex_type == basist::cBASISTexType2D) - return true; - - uint32_t total_basis_images = 0; - - for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; - - total_basis_images = maximum<uint32_t>(total_basis_images, slice_desc.m_source_file_index + 1); - } - - if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) - { - // For cubemaps, validate that the total # of Basis images is a multiple of 6. - if ((total_basis_images % 6) != 0) - { - error_printf("basis_compressor::validate_texture_type_constraints: For cubemaps the total number of input images is not a multiple of 6!\n"); - return false; - } - } - - // Now validate that all the mip0's have the same dimensions, and that each image has the same # of mipmap levels. 
- uint_vec image_mipmap_levels(total_basis_images); - - int width = -1, height = -1; - for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; - - image_mipmap_levels[slice_desc.m_source_file_index] = maximum(image_mipmap_levels[slice_desc.m_source_file_index], slice_desc.m_mip_index + 1); - - if (slice_desc.m_mip_index != 0) - continue; - - if (width < 0) - { - width = slice_desc.m_orig_width; - height = slice_desc.m_orig_height; - } - else if ((width != (int)slice_desc.m_orig_width) || (height != (int)slice_desc.m_orig_height)) - { - error_printf("basis_compressor::validate_texture_type_constraints: The source image resolutions are not all equal!\n"); - return false; - } - } - - for (size_t i = 1; i < image_mipmap_levels.size(); i++) - { - if (image_mipmap_levels[0] != image_mipmap_levels[i]) - { - error_printf("basis_compressor::validate_texture_type_constraints: Each image must have the same number of mipmap levels!\n"); - return false; - } - } - - return true; - } - - bool basis_compressor::process_frontend() - { - debug_printf("basis_compressor::process_frontend\n"); - - m_source_blocks.resize(m_total_blocks); - - for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; - - const uint32_t num_blocks_x = slice_desc.m_num_blocks_x; - const uint32_t num_blocks_y = slice_desc.m_num_blocks_y; - - const image &source_image = m_slice_images[slice_index]; - - for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) - for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) - source_image.extract_block_clamped(m_source_blocks[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(), block_x * 4, block_y * 4, 4, 4); - } - -#if 0 - // TODO - basis_etc1_pack_params pack_params; - pack_params.m_quality = cETCQualityMedium; - pack_params.m_perceptual = m_params.m_perceptual; - pack_params.m_use_color4 = false; - - pack_etc1_block_context pack_context; - - std::unordered_set<uint64_t> endpoint_hash; - std::unordered_set<uint32_t> selector_hash; - - for (uint32_t i = 0; i < m_source_blocks.size(); i++) - { - etc_block blk; - pack_etc1_block(blk, m_source_blocks[i].get_ptr(), pack_params, pack_context); - - const color_rgba c0(blk.get_block_color(0, false)); - endpoint_hash.insert((c0.r | (c0.g << 5) | (c0.b << 10)) | (blk.get_inten_table(0) << 16)); - - const color_rgba c1(blk.get_block_color(1, false)); - endpoint_hash.insert((c1.r | (c1.g << 5) | (c1.b << 10)) | (blk.get_inten_table(1) << 16)); - - selector_hash.insert(blk.get_raw_selector_bits()); - } - - const uint32_t total_unique_endpoints = (uint32_t)endpoint_hash.size(); - const uint32_t total_unique_selectors = (uint32_t)selector_hash.size(); - - if (m_params.m_debug) - { - debug_printf("Unique endpoints: %u, unique selectors: %u\n", total_unique_endpoints, total_unique_selectors); - } -#endif - - const double total_texels = m_total_blocks * 16.0f; - - int endpoint_clusters = m_params.m_max_endpoint_clusters; - int selector_clusters = m_params.m_max_selector_clusters; - - if (endpoint_clusters > basisu_frontend::cMaxEndpointClusters) - { - error_printf("Too many endpoint clusters! (%u but max is %u)\n", endpoint_clusters, basisu_frontend::cMaxEndpointClusters); - return false; - } - if (selector_clusters > basisu_frontend::cMaxSelectorClusters) - { - error_printf("Too many selector clusters! 
(%u but max is %u)\n", selector_clusters, basisu_frontend::cMaxSelectorClusters); - return false; - } - - if (m_params.m_quality_level != -1) - { - const float quality = saturate(m_params.m_quality_level / 255.0f); - - const float bits_per_endpoint_cluster = 14.0f; - const float max_desired_endpoint_cluster_bits_per_texel = 1.0f; // .15f - int max_endpoints = static_cast<int>((max_desired_endpoint_cluster_bits_per_texel * total_texels) / bits_per_endpoint_cluster); - - const float mid = 128.0f / 255.0f; - - float color_endpoint_quality = quality; - - const float endpoint_split_point = 0.5f; - if (color_endpoint_quality <= mid) - { - color_endpoint_quality = lerp(0.0f, endpoint_split_point, powf(color_endpoint_quality / mid, .65f)); - - max_endpoints = clamp<int>(max_endpoints, 256, 3072); - max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks); - - if (max_endpoints < 64) - max_endpoints = 64; - endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(32, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters); - } - else - { - color_endpoint_quality = powf((color_endpoint_quality - mid) / (1.0f - mid), 1.6f); - - max_endpoints = clamp<int>(max_endpoints, 256, 8192); - max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks); - - if (max_endpoints < 3072) - max_endpoints = 3072; - endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(3072, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters); - } - - float bits_per_selector_cluster = m_params.m_global_sel_pal ? 21.0f : 14.0f; - - const float max_desired_selector_cluster_bits_per_texel = 1.0f; // .15f - int max_selectors = static_cast<int>((max_desired_selector_cluster_bits_per_texel * total_texels) / bits_per_selector_cluster); - max_selectors = clamp<int>(max_selectors, 256, basisu_frontend::cMaxSelectorClusters); - max_selectors = minimum<uint32_t>(max_selectors, m_total_blocks); - - float color_selector_quality = quality; - //color_selector_quality = powf(color_selector_quality, 1.65f); - color_selector_quality = powf(color_selector_quality, 2.62f); - - if (max_selectors < 96) - max_selectors = 96; - selector_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(96, static_cast<float>(max_selectors), color_selector_quality)), 8, basisu_frontend::cMaxSelectorClusters); - - debug_printf("Max endpoints: %u, max selectors: %u\n", endpoint_clusters, selector_clusters); - - if (m_params.m_quality_level >= 223) - { - if (!m_params.m_selector_rdo_thresh.was_changed()) - { - if (!m_params.m_endpoint_rdo_thresh.was_changed()) - m_params.m_endpoint_rdo_thresh *= .25f; - - if (!m_params.m_selector_rdo_thresh.was_changed()) - m_params.m_selector_rdo_thresh *= .25f; - } - } - else if (m_params.m_quality_level >= 192) - { - if (!m_params.m_endpoint_rdo_thresh.was_changed()) - m_params.m_endpoint_rdo_thresh *= .5f; - - if (!m_params.m_selector_rdo_thresh.was_changed()) - m_params.m_selector_rdo_thresh *= .5f; - } - else if (m_params.m_quality_level >= 160) - { - if (!m_params.m_endpoint_rdo_thresh.was_changed()) - m_params.m_endpoint_rdo_thresh *= .75f; - - if (!m_params.m_selector_rdo_thresh.was_changed()) - m_params.m_selector_rdo_thresh *= .75f; - } - else if (m_params.m_quality_level >= 129) - { - float l = (quality - 129 / 255.0f) / ((160 - 129) / 255.0f); - - if (!m_params.m_endpoint_rdo_thresh.was_changed()) - m_params.m_endpoint_rdo_thresh *= lerp<float>(1.0f, .75f, l); - - if 
(!m_params.m_selector_rdo_thresh.was_changed()) - m_params.m_selector_rdo_thresh *= lerp<float>(1.0f, .75f, l); - } - } - - m_auto_global_sel_pal = false; - if (!m_params.m_global_sel_pal && m_params.m_auto_global_sel_pal) - { - const float bits_per_selector_cluster = 31.0f; - double selector_codebook_bpp_est = (bits_per_selector_cluster * selector_clusters) / total_texels; - debug_printf("selector_codebook_bpp_est: %f\n", selector_codebook_bpp_est); - const float force_global_sel_pal_bpp_threshold = .15f; - if ((total_texels <= 128.0f*128.0f) && (selector_codebook_bpp_est > force_global_sel_pal_bpp_threshold)) - { - m_auto_global_sel_pal = true; - debug_printf("Auto global selector palette enabled\n"); - } - } - - basisu_frontend::params p; - p.m_num_source_blocks = m_total_blocks; - p.m_pSource_blocks = &m_source_blocks[0]; - p.m_max_endpoint_clusters = endpoint_clusters; - p.m_max_selector_clusters = selector_clusters; - p.m_perceptual = m_params.m_perceptual; - p.m_debug_stats = m_params.m_debug; - p.m_debug_images = m_params.m_debug_images; - p.m_compression_level = m_params.m_compression_level; - p.m_tex_type = m_params.m_tex_type; - p.m_multithreaded = m_params.m_multithreading; - p.m_disable_hierarchical_endpoint_codebooks = m_params.m_disable_hierarchical_endpoint_codebooks; - p.m_pJob_pool = m_params.m_pJob_pool; - - if ((m_params.m_global_sel_pal) || (m_auto_global_sel_pal)) - { - p.m_pGlobal_sel_codebook = m_params.m_pSel_codebook; - p.m_num_global_sel_codebook_pal_bits = m_params.m_global_pal_bits; - p.m_num_global_sel_codebook_mod_bits = m_params.m_global_mod_bits; - p.m_use_hybrid_selector_codebooks = !m_params.m_no_hybrid_sel_cb; - p.m_hybrid_codebook_quality_thresh = m_params.m_hybrid_sel_cb_quality_thresh; - } - - if (!m_frontend.init(p)) - { - error_printf("basisu_frontend::init() failed!\n"); - return false; - } - - m_frontend.compress(); - - if (m_params.m_debug_images) - { - for (uint32_t i = 0; i < m_slice_descs.size(); i++) - { - char filename[1024]; -#ifdef _WIN32 - sprintf_s(filename, sizeof(filename), "rdo_frontend_output_output_blocks_%u.png", i); -#else - snprintf(filename, sizeof(filename), "rdo_frontend_output_output_blocks_%u.png", i); -#endif - m_frontend.dump_debug_image(filename, m_slice_descs[i].m_first_block_index, m_slice_descs[i].m_num_blocks_x, m_slice_descs[i].m_num_blocks_y, true); - -#ifdef _WIN32 - sprintf_s(filename, sizeof(filename), "rdo_frontend_output_api_%u.png", i); -#else - snprintf(filename, sizeof(filename), "rdo_frontend_output_api_%u.png", i); -#endif - m_frontend.dump_debug_image(filename, m_slice_descs[i].m_first_block_index, m_slice_descs[i].m_num_blocks_x, m_slice_descs[i].m_num_blocks_y, false); - } - } - - return true; - } - - bool basis_compressor::extract_frontend_texture_data() - { - debug_printf("basis_compressor::extract_frontend_texture_data\n"); - - m_frontend_output_textures.resize(m_slice_descs.size()); - m_best_etc1s_images.resize(m_slice_descs.size()); - m_best_etc1s_images_unpacked.resize(m_slice_descs.size()); - - for (uint32_t i = 0; i < m_slice_descs.size(); i++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[i]; - - const uint32_t num_blocks_x = slice_desc.m_num_blocks_x; - const uint32_t num_blocks_y = slice_desc.m_num_blocks_y; - - const uint32_t width = num_blocks_x * 4; - const uint32_t height = num_blocks_y * 4; - - m_frontend_output_textures[i].init(texture_format::cETC1, width, height); - - for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) - for (uint32_t block_x = 0; block_x 
< num_blocks_x; block_x++) - memcpy(m_frontend_output_textures[i].get_block_ptr(block_x, block_y, 0), &m_frontend.get_output_block(slice_desc.m_first_block_index + block_x + block_y * num_blocks_x), sizeof(etc_block)); - -#if 0 - if (m_params.m_debug_images) - { - char filename[1024]; - sprintf_s(filename, sizeof(filename), "rdo_etc_frontend_%u_", i); - write_etc1_vis_images(m_frontend_output_textures[i], filename); - } -#endif - - m_best_etc1s_images[i].init(texture_format::cETC1, width, height); - for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) - for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) - memcpy(m_best_etc1s_images[i].get_block_ptr(block_x, block_y, 0), &m_frontend.get_etc1s_block(slice_desc.m_first_block_index + block_x + block_y * num_blocks_x), sizeof(etc_block)); - - m_best_etc1s_images[i].unpack(m_best_etc1s_images_unpacked[i]); - } - - return true; - } - - bool basis_compressor::process_backend() - { - debug_printf("basis_compressor::process_backend\n"); - - basisu_backend_params backend_params; - backend_params.m_debug = m_params.m_debug; - backend_params.m_debug_images = m_params.m_debug_images; - backend_params.m_etc1s = true; - backend_params.m_compression_level = m_params.m_compression_level; - - if (!m_params.m_no_endpoint_rdo) - backend_params.m_endpoint_rdo_quality_thresh = m_params.m_endpoint_rdo_thresh; - - if (!m_params.m_no_selector_rdo) - backend_params.m_selector_rdo_quality_thresh = m_params.m_selector_rdo_thresh; - - backend_params.m_use_global_sel_codebook = (m_frontend.get_params().m_pGlobal_sel_codebook != NULL); - backend_params.m_global_sel_codebook_pal_bits = m_frontend.get_params().m_num_global_sel_codebook_pal_bits; - backend_params.m_global_sel_codebook_mod_bits = m_frontend.get_params().m_num_global_sel_codebook_mod_bits; - backend_params.m_use_hybrid_sel_codebooks = m_frontend.get_params().m_use_hybrid_selector_codebooks; - - m_backend.init(&m_frontend, backend_params, m_slice_descs, m_params.m_pSel_codebook); - uint32_t total_packed_bytes = m_backend.encode(); - - if (!total_packed_bytes) - { - error_printf("basis_compressor::encode() failed!\n"); - return false; - } - - debug_printf("Total packed bytes (estimated): %u\n", total_packed_bytes); - - return true; - } - - bool basis_compressor::create_basis_file_and_transcode() - { - debug_printf("basis_compressor::create_basis_file_and_transcode\n"); - - const basisu_backend_output &encoded_output = m_backend.get_output(); - - if (!m_basis_file.init(encoded_output, m_params.m_tex_type, m_params.m_userdata0, m_params.m_userdata1, m_params.m_y_flip, m_params.m_us_per_frame)) - { - error_printf("basis_compressor::write_output_files_and_compute_stats: basisu_backend:init() failed!\n"); - return false; - } - - const uint8_vec &comp_data = m_basis_file.get_compressed_data(); - - m_output_basis_file = comp_data; - - // Verify the compressed data by transcoding it to ETC1/BC1 and validating the CRC's. 
- basist::basisu_transcoder decoder(m_params.m_pSel_codebook); - if (!decoder.validate_file_checksums(&comp_data[0], (uint32_t)comp_data.size(), true)) - { - error_printf("decoder.validate_file_checksums() failed!\n"); - return false; - } - - m_decoded_output_textures.resize(m_slice_descs.size()); - m_decoded_output_textures_unpacked.resize(m_slice_descs.size()); - - m_decoded_output_textures_bc1.resize(m_slice_descs.size()); - m_decoded_output_textures_unpacked_bc1.resize(m_slice_descs.size()); - - interval_timer tm; - tm.start(); - - if (!decoder.start_transcoding(&comp_data[0], (uint32_t)comp_data.size())) - { - error_printf("decoder.start_transcoding() failed!\n"); - return false; - } - - debug_printf("basisu_comppressor::start_transcoding() took %3.3fms\n", tm.get_elapsed_ms()); - - uint32_t total_orig_pixels = 0; - uint32_t total_texels = 0; - - double total_time_etc1 = 0; - - for (uint32_t i = 0; i < m_slice_descs.size(); i++) - { - gpu_image decoded_texture; - decoded_texture.init(texture_format::cETC1, m_slice_descs[i].m_width, m_slice_descs[i].m_height); - - tm.start(); - - if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, - reinterpret_cast<etc_block *>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cETC1, 8)) - { - error_printf("Transcoding failed to ETC1 on slice %u!\n", i); - return false; - } - - total_time_etc1 += tm.get_elapsed_secs(); - - uint32_t image_crc16 = basist::crc16(decoded_texture.get_ptr(), decoded_texture.get_size_in_bytes(), 0); - if (image_crc16 != m_backend.get_output().m_slice_image_crcs[i]) - { - error_printf("Decoded image data CRC check failed on slice %u!\n", i); - return false; - } - debug_printf("Decoded image data CRC check succeeded on slice %i\n", i); - - m_decoded_output_textures[i] = decoded_texture; - - total_orig_pixels += m_slice_descs[i].m_orig_width * m_slice_descs[i].m_orig_height; - total_texels += m_slice_descs[i].m_width * m_slice_descs[i].m_height; - } - - tm.start(); - - basist::basisu_transcoder_init(); - - debug_printf("basist::basisu_transcoder_init: Took %f ms\n", tm.get_elapsed_ms()); - - double total_time_bc1 = 0; - - for (uint32_t i = 0; i < m_slice_descs.size(); i++) - { - gpu_image decoded_texture; - decoded_texture.init(texture_format::cBC1, m_slice_descs[i].m_width, m_slice_descs[i].m_height); - - tm.start(); - - if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, - reinterpret_cast<etc_block *>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC1, 8)) - { - error_printf("Transcoding failed to BC1 on slice %u!\n", i); - return false; - } - - total_time_bc1 += tm.get_elapsed_secs(); - - m_decoded_output_textures_bc1[i] = decoded_texture; - } - - for (uint32_t i = 0; i < m_slice_descs.size(); i++) - { - m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]); - m_decoded_output_textures_bc1[i].unpack(m_decoded_output_textures_unpacked_bc1[i]); - } - - debug_printf("Transcoded to ETC1 in %3.3fms, %f texels/sec\n", total_time_etc1 * 1000.0f, total_orig_pixels / total_time_etc1); - - debug_printf("Transcoded to BC1 in %3.3fms, %f texels/sec\n", total_time_bc1 * 1000.0f, total_orig_pixels / total_time_bc1); - - debug_printf("Total .basis output file size: %u, %3.3f bits/texel\n", comp_data.size(), comp_data.size() * 8.0f / total_orig_pixels); - - m_output_blocks.resize(0); - - uint32_t total_orig_texels = 0; - for (uint32_t 
slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; - - total_orig_texels += slice_desc.m_orig_width * slice_desc.m_orig_height; - - const uint32_t total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; - - assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks); - - memcpy(enlarge_vector(m_output_blocks, total_blocks), m_decoded_output_textures[slice_index].get_ptr(), sizeof(etc_block) * total_blocks); - } - - m_basis_file_size = (uint32_t)comp_data.size(); - m_basis_bits_per_texel = (comp_data.size() * 8.0f) / total_orig_texels; - - return true; - } - - bool basis_compressor::write_output_files_and_compute_stats() - { - debug_printf("basis_compressor::write_output_files_and_compute_stats\n"); - - if (m_params.m_write_output_basis_files) - { - const uint8_vec &comp_data = m_basis_file.get_compressed_data(); - - const std::string& basis_filename = m_params.m_out_filename; - - if (!write_vec_to_file(basis_filename.c_str(), comp_data)) - { - error_printf("Failed writing output data to file \"%s\"\n", basis_filename.c_str()); - return false; - } - - printf("Wrote output .basis file \"%s\"\n", basis_filename.c_str()); - } - - m_stats.resize(m_slice_descs.size()); - - uint32_t total_orig_texels = 0; - - for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) - { - const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; - - total_orig_texels += slice_desc.m_orig_width * slice_desc.m_orig_height; - - if (m_params.m_compute_stats) - { - printf("Slice: %u\n", slice_index); - - image_stats &s = m_stats[slice_index]; - - // TODO: We used to output SSIM (during heavy encoder development), but this slowed down compression too much. We'll be adding it back. 
- - image_metrics em; - - // ---- .basis ETC1S stats - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0); - em.print(".basis ETC1S 709 Luma: "); - - s.m_basis_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr); - s.m_basis_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim); - - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true); - em.print(".basis ETC1S 601 Luma: "); - - s.m_basis_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr); - - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3); - em.print(".basis ETC1S RGB Avg: "); - - s.m_basis_etc1s_rgb_avg_psnr = em.m_psnr; - - if (m_slice_descs.size() == 1) - { - debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_etc1s_luma_709_psnr / ((m_backend.get_output().get_output_size_estimate() * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); - } - - // ---- .basis BC1 stats - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc1[slice_index], 0, 0); - em.print(".basis BC1 709 Luma: "); - - s.m_basis_bc1_luma_709_psnr = static_cast<float>(em.m_psnr); - s.m_basis_bc1_luma_709_ssim = static_cast<float>(em.m_ssim); - - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc1[slice_index], 0, 0, true, true); - em.print(".basis BC1 601 Luma: "); - - s.m_basis_bc1_luma_601_psnr = static_cast<float>(em.m_psnr); - - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc1[slice_index], 0, 3); - em.print(".basis BC1 RGB Avg: "); - - s.m_basis_bc1_rgb_avg_psnr = static_cast<float>(em.m_psnr); - - // ---- Nearly best possible ETC1S stats - em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0); - em.print("Unquantized ETC1S 709 Luma: "); - - s.m_best_luma_709_psnr = static_cast<float>(em.m_psnr); - s.m_best_luma_709_ssim = static_cast<float>(em.m_ssim); - - em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true); - em.print("Unquantized ETC1S 601 Luma: "); - - s.m_best_luma_601_psnr = static_cast<float>(em.m_psnr); - - em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3); - em.print("Unquantized ETC1S RGB Avg: "); - - s.m_best_rgb_avg_psnr = static_cast<float>(em.m_psnr); - } - - if (m_frontend.get_params().m_debug_images) - { - std::string out_basename; - if (m_params.m_out_filename.size()) - string_get_filename(m_params.m_out_filename.c_str(), out_basename); - else if (m_params.m_source_filenames.size()) - string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename); - - string_remove_extension(out_basename); - out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index); - - // Write "best" ETC1S debug images - { - gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]); - best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); - write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image); - - image best_etc1s_unpacked; - best_etc1s_gpu_image.unpack(best_etc1s_unpacked); - save_png(out_basename + "_best_etc1s.png", best_etc1s_unpacked); - } - - // Write decoded ETC1S debug images - { - gpu_image decoded_etc1s(m_decoded_output_textures[slice_index]); - decoded_etc1s.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); - 
write_compressed_texture_file((out_basename + "_decoded_etc1s.ktx").c_str(), decoded_etc1s); - - image temp(m_decoded_output_textures_unpacked[slice_index]); - temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); - save_png(out_basename + "_decoded_etc1s.png", temp); - } - - // Write decoded BC1 debug images - { - gpu_image decoded_bc1(m_decoded_output_textures_bc1[slice_index]); - decoded_bc1.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); - write_compressed_texture_file((out_basename + "_decoded_bc1.ktx").c_str(), decoded_bc1); - - image temp(m_decoded_output_textures_unpacked_bc1[slice_index]); - temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); - save_png(out_basename + "_decoded_bc1.png", temp); - } - } - } - - return true; - } - -} // namespace basisu diff --git a/thirdparty/basis_universal/basisu_etc.cpp b/thirdparty/basis_universal/basisu_etc.cpp deleted file mode 100644 index 244f1d2e6b..0000000000 --- a/thirdparty/basis_universal/basisu_etc.cpp +++ /dev/null @@ -1,1074 +0,0 @@ -// basis_etc.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "basisu_etc.h" - -#define BASISU_DEBUG_ETC_ENCODER 0 -#define BASISU_DEBUG_ETC_ENCODER_DEEPER 0 - -namespace basisu -{ - const uint32_t BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE = 165; - - static const struct { uint8_t m_v[4]; } g_cluster_fit_order_tab[BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE] = - { - { { 0, 0, 0, 8 } },{ { 0, 5, 2, 1 } },{ { 0, 6, 1, 1 } },{ { 0, 7, 0, 1 } },{ { 0, 7, 1, 0 } }, - { { 0, 0, 8, 0 } },{ { 0, 0, 3, 5 } },{ { 0, 1, 7, 0 } },{ { 0, 0, 4, 4 } },{ { 0, 0, 2, 6 } }, - { { 0, 0, 7, 1 } },{ { 0, 0, 1, 7 } },{ { 0, 0, 5, 3 } },{ { 1, 6, 0, 1 } },{ { 0, 0, 6, 2 } }, - { { 0, 2, 6, 0 } },{ { 2, 4, 2, 0 } },{ { 0, 3, 5, 0 } },{ { 3, 3, 1, 1 } },{ { 4, 2, 0, 2 } }, - { { 1, 5, 2, 0 } },{ { 0, 5, 3, 0 } },{ { 0, 6, 2, 0 } },{ { 2, 4, 1, 1 } },{ { 5, 1, 0, 2 } }, - { { 6, 1, 1, 0 } },{ { 3, 3, 0, 2 } },{ { 6, 0, 0, 2 } },{ { 0, 8, 0, 0 } },{ { 6, 1, 0, 1 } }, - { { 0, 1, 6, 1 } },{ { 1, 6, 1, 0 } },{ { 4, 1, 3, 0 } },{ { 0, 2, 5, 1 } },{ { 5, 0, 3, 0 } }, - { { 5, 3, 0, 0 } },{ { 0, 1, 5, 2 } },{ { 0, 3, 4, 1 } },{ { 2, 5, 1, 0 } },{ { 1, 7, 0, 0 } }, - { { 0, 1, 4, 3 } },{ { 6, 0, 2, 0 } },{ { 0, 4, 4, 0 } },{ { 2, 6, 0, 0 } },{ { 0, 2, 4, 2 } }, - { { 0, 5, 1, 2 } },{ { 0, 6, 0, 2 } },{ { 3, 5, 0, 0 } },{ { 0, 4, 3, 1 } },{ { 3, 4, 1, 0 } }, - { { 4, 3, 1, 0 } },{ { 1, 5, 0, 2 } },{ { 0, 3, 3, 2 } },{ { 1, 4, 1, 2 } },{ { 0, 4, 2, 2 } }, - { { 2, 3, 3, 0 } },{ { 4, 4, 0, 0 } },{ { 1, 2, 4, 1 } },{ { 0, 5, 0, 3 } },{ { 0, 1, 3, 4 } }, - { { 1, 5, 1, 1 } },{ { 1, 4, 2, 1 } },{ { 1, 3, 2, 2 } },{ { 5, 2, 1, 0 } },{ { 1, 3, 3, 1 } }, - { { 0, 1, 2, 5 } },{ { 1, 1, 5, 1 } },{ { 0, 3, 2, 3 } },{ { 2, 5, 0, 1 } },{ { 3, 2, 2, 1 } }, - { { 2, 3, 0, 3 } },{ { 1, 4, 3, 0 } },{ { 2, 2, 1, 3 } },{ { 6, 2, 0, 0 } },{ { 1, 0, 6, 1 } }, - { { 3, 3, 2, 0 } },{ { 7, 1, 0, 0 } },{ { 3, 1, 4, 0 } },{ { 0, 2, 3, 3 } },{ { 0, 
4, 1, 3 } }, - { { 0, 4, 0, 4 } },{ { 0, 1, 0, 7 } },{ { 2, 0, 5, 1 } },{ { 2, 0, 4, 2 } },{ { 3, 0, 2, 3 } }, - { { 2, 2, 4, 0 } },{ { 2, 2, 3, 1 } },{ { 4, 0, 3, 1 } },{ { 3, 2, 3, 0 } },{ { 2, 3, 2, 1 } }, - { { 1, 3, 4, 0 } },{ { 7, 0, 1, 0 } },{ { 3, 0, 4, 1 } },{ { 1, 0, 5, 2 } },{ { 8, 0, 0, 0 } }, - { { 3, 0, 1, 4 } },{ { 4, 1, 1, 2 } },{ { 4, 0, 2, 2 } },{ { 1, 2, 5, 0 } },{ { 4, 2, 1, 1 } }, - { { 3, 4, 0, 1 } },{ { 2, 0, 3, 3 } },{ { 5, 0, 1, 2 } },{ { 5, 0, 0, 3 } },{ { 2, 4, 0, 2 } }, - { { 2, 1, 4, 1 } },{ { 4, 0, 1, 3 } },{ { 2, 1, 5, 0 } },{ { 4, 2, 2, 0 } },{ { 4, 0, 4, 0 } }, - { { 1, 0, 4, 3 } },{ { 1, 4, 0, 3 } },{ { 3, 0, 3, 2 } },{ { 4, 3, 0, 1 } },{ { 0, 1, 1, 6 } }, - { { 1, 3, 1, 3 } },{ { 0, 2, 2, 4 } },{ { 2, 0, 2, 4 } },{ { 5, 1, 1, 1 } },{ { 3, 0, 5, 0 } }, - { { 2, 3, 1, 2 } },{ { 3, 0, 0, 5 } },{ { 0, 3, 1, 4 } },{ { 5, 0, 2, 1 } },{ { 2, 1, 3, 2 } }, - { { 2, 0, 6, 0 } },{ { 3, 1, 3, 1 } },{ { 5, 1, 2, 0 } },{ { 1, 0, 3, 4 } },{ { 1, 1, 6, 0 } }, - { { 4, 0, 0, 4 } },{ { 2, 0, 1, 5 } },{ { 0, 3, 0, 5 } },{ { 1, 3, 0, 4 } },{ { 4, 1, 2, 1 } }, - { { 1, 2, 3, 2 } },{ { 3, 1, 0, 4 } },{ { 5, 2, 0, 1 } },{ { 1, 2, 2, 3 } },{ { 3, 2, 1, 2 } }, - { { 2, 2, 2, 2 } },{ { 6, 0, 1, 1 } },{ { 1, 2, 1, 4 } },{ { 1, 1, 4, 2 } },{ { 3, 2, 0, 3 } }, - { { 1, 2, 0, 5 } },{ { 1, 0, 7, 0 } },{ { 3, 1, 2, 2 } },{ { 1, 0, 2, 5 } },{ { 2, 0, 0, 6 } }, - { { 2, 1, 1, 4 } },{ { 2, 2, 0, 4 } },{ { 1, 1, 3, 3 } },{ { 7, 0, 0, 1 } },{ { 1, 0, 0, 7 } }, - { { 2, 1, 2, 3 } },{ { 4, 1, 0, 3 } },{ { 3, 1, 1, 3 } },{ { 1, 1, 2, 4 } },{ { 2, 1, 0, 5 } }, - { { 1, 0, 1, 6 } },{ { 0, 2, 1, 5 } },{ { 0, 2, 0, 6 } },{ { 1, 1, 1, 5 } },{ { 1, 1, 0, 6 } } - }; - - const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = - { - { -8, -2, 2, 8 }, { -17, -5, 5, 17 }, { -29, -9, 9, 29 }, { -42, -13, 13, 42 }, - { -60, -18, 18, 60 }, { -80, -24, 24, 80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 } - }; - - const uint8_t g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 }; - const uint8_t g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 }; - - // [flip][subblock][pixel_index] - const etc_coord2 g_etc1_pixel_coords[2][2][8] = - { - { - { - { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, - { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 } - }, - { - { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, - { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 } - } - }, - { - { - { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, - { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } - }, - { - { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 }, - { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 } - }, - } - }; - - // [flip][subblock][pixel_index] - const uint32_t g_etc1_pixel_indices[2][2][8] = - { - { - { - 0 + 4 * 0, 0 + 4 * 1, 0 + 4 * 2, 0 + 4 * 3, - 1 + 4 * 0, 1 + 4 * 1, 1 + 4 * 2, 1 + 4 * 3 - }, - { - 2 + 4 * 0, 2 + 4 * 1, 2 + 4 * 2, 2 + 4 * 3, - 3 + 4 * 0, 3 + 4 * 1, 3 + 4 * 2, 3 + 4 * 3 - } - }, - { - { - 0 + 4 * 0, 1 + 4 * 0, 2 + 4 * 0, 3 + 4 * 0, - 0 + 4 * 1, 1 + 4 * 1, 2 + 4 * 1, 3 + 4 * 1 - }, - { - 0 + 4 * 2, 1 + 4 * 2, 2 + 4 * 2, 3 + 4 * 2, - 0 + 4 * 3, 1 + 4 * 3, 2 + 4 * 3, 3 + 4 * 3 - }, - } - }; - - uint16_t etc_block::pack_color5(const color_rgba& color, bool scaled, uint32_t bias) - { - return pack_color5(color.r, color.g, color.b, scaled, bias); - } - - uint16_t etc_block::pack_color5(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias) - { - if (scaled) - { - r = (r * 31U + bias) / 255U; - g = (g * 31U + bias) / 255U; - b = (b * 31U + bias) / 255U; - } - - r = minimum(r, 31U); - g = minimum(g, 31U); - b = minimum(b, 31U); - 
- return static_cast<uint16_t>(b | (g << 5U) | (r << 10U)); - } - - color_rgba etc_block::unpack_color5(uint16_t packed_color5, bool scaled, uint32_t alpha) - { - uint32_t b = packed_color5 & 31U; - uint32_t g = (packed_color5 >> 5U) & 31U; - uint32_t r = (packed_color5 >> 10U) & 31U; - - if (scaled) - { - b = (b << 3U) | (b >> 2U); - g = (g << 3U) | (g >> 2U); - r = (r << 3U) | (r >> 2U); - } - - return color_rgba(cNoClamp, r, g, b, minimum(alpha, 255U)); - } - - void etc_block::unpack_color5(color_rgba& result, uint16_t packed_color5, bool scaled) - { - result = unpack_color5(packed_color5, scaled, 255); - } - - void etc_block::unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, bool scaled) - { - color_rgba c(unpack_color5(packed_color5, scaled, 0)); - r = c.r; - g = c.g; - b = c.b; - } - - bool etc_block::unpack_color5(color_rgba& result, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha) - { - color_rgba_i16 dc(unpack_delta3(packed_delta3)); - - int b = (packed_color5 & 31U) + dc.b; - int g = ((packed_color5 >> 5U) & 31U) + dc.g; - int r = ((packed_color5 >> 10U) & 31U) + dc.r; - - bool success = true; - if (static_cast<uint32_t>(r | g | b) > 31U) - { - success = false; - r = clamp<int>(r, 0, 31); - g = clamp<int>(g, 0, 31); - b = clamp<int>(b, 0, 31); - } - - if (scaled) - { - b = (b << 3U) | (b >> 2U); - g = (g << 3U) | (g >> 2U); - r = (r << 3U) | (r >> 2U); - } - - result.set_noclamp_rgba(r, g, b, minimum(alpha, 255U)); - return success; - } - - bool etc_block::unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha) - { - color_rgba result; - const bool success = unpack_color5(result, packed_color5, packed_delta3, scaled, alpha); - r = result.r; - g = result.g; - b = result.b; - return success; - } - - uint16_t etc_block::pack_delta3(const color_rgba_i16& color) - { - return pack_delta3(color.r, color.g, color.b); - } - - uint16_t etc_block::pack_delta3(int r, int g, int b) - { - assert((r >= cETC1ColorDeltaMin) && (r <= cETC1ColorDeltaMax)); - assert((g >= cETC1ColorDeltaMin) && (g <= cETC1ColorDeltaMax)); - assert((b >= cETC1ColorDeltaMin) && (b <= cETC1ColorDeltaMax)); - if (r < 0) r += 8; - if (g < 0) g += 8; - if (b < 0) b += 8; - return static_cast<uint16_t>(b | (g << 3) | (r << 6)); - } - - color_rgba_i16 etc_block::unpack_delta3(uint16_t packed_delta3) - { - int r = (packed_delta3 >> 6) & 7; - int g = (packed_delta3 >> 3) & 7; - int b = packed_delta3 & 7; - if (r >= 4) r -= 8; - if (g >= 4) g -= 8; - if (b >= 4) b -= 8; - return color_rgba_i16(r, g, b, 255); - } - - void etc_block::unpack_delta3(int& r, int& g, int& b, uint16_t packed_delta3) - { - r = (packed_delta3 >> 6) & 7; - g = (packed_delta3 >> 3) & 7; - b = packed_delta3 & 7; - if (r >= 4) r -= 8; - if (g >= 4) g -= 8; - if (b >= 4) b -= 8; - } - - uint16_t etc_block::pack_color4(const color_rgba& color, bool scaled, uint32_t bias) - { - return pack_color4(color.r, color.g, color.b, scaled, bias); - } - - uint16_t etc_block::pack_color4(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias) - { - if (scaled) - { - r = (r * 15U + bias) / 255U; - g = (g * 15U + bias) / 255U; - b = (b * 15U + bias) / 255U; - } - - r = minimum(r, 15U); - g = minimum(g, 15U); - b = minimum(b, 15U); - - return static_cast<uint16_t>(b | (g << 4U) | (r << 8U)); - } - - color_rgba etc_block::unpack_color4(uint16_t packed_color4, bool scaled, uint32_t alpha) - { - uint32_t b = packed_color4 & 15U; - uint32_t g 
= (packed_color4 >> 4U) & 15U; - uint32_t r = (packed_color4 >> 8U) & 15U; - - if (scaled) - { - b = (b << 4U) | b; - g = (g << 4U) | g; - r = (r << 4U) | r; - } - - return color_rgba(cNoClamp, r, g, b, minimum(alpha, 255U)); - } - - void etc_block::unpack_color4(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color4, bool scaled) - { - color_rgba c(unpack_color4(packed_color4, scaled, 0)); - r = c.r; - g = c.g; - b = c.b; - } - - void etc_block::get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint32_t table_idx) - { - assert(table_idx < cETC1IntenModifierValues); - const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0]; - - uint32_t r, g, b; - unpack_color5(r, g, b, packed_color5, true); - - const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b); - - const int y0 = pInten_modifer_table[0]; - pDst[0].set(ir + y0, ig + y0, ib + y0, 255); - - const int y1 = pInten_modifer_table[1]; - pDst[1].set(ir + y1, ig + y1, ib + y1, 255); - - const int y2 = pInten_modifer_table[2]; - pDst[2].set(ir + y2, ig + y2, ib + y2, 255); - - const int y3 = pInten_modifer_table[3]; - pDst[3].set(ir + y3, ig + y3, ib + y3, 255); - } - - bool etc_block::get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint16_t packed_delta3, uint32_t table_idx) - { - assert(table_idx < cETC1IntenModifierValues); - const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0]; - - uint32_t r, g, b; - bool success = unpack_color5(r, g, b, packed_color5, packed_delta3, true); - - const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b); - - const int y0 = pInten_modifer_table[0]; - pDst[0].set(ir + y0, ig + y0, ib + y0, 255); - - const int y1 = pInten_modifer_table[1]; - pDst[1].set(ir + y1, ig + y1, ib + y1, 255); - - const int y2 = pInten_modifer_table[2]; - pDst[2].set(ir + y2, ig + y2, ib + y2, 255); - - const int y3 = pInten_modifer_table[3]; - pDst[3].set(ir + y3, ig + y3, ib + y3, 255); - - return success; - } - - void etc_block::get_abs_subblock_colors(color_rgba* pDst, uint16_t packed_color4, uint32_t table_idx) - { - assert(table_idx < cETC1IntenModifierValues); - const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0]; - - uint32_t r, g, b; - unpack_color4(r, g, b, packed_color4, true); - - const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b); - - const int y0 = pInten_modifer_table[0]; - pDst[0].set(ir + y0, ig + y0, ib + y0, 255); - - const int y1 = pInten_modifer_table[1]; - pDst[1].set(ir + y1, ig + y1, ib + y1, 255); - - const int y2 = pInten_modifer_table[2]; - pDst[2].set(ir + y2, ig + y2, ib + y2, 255); - - const int y3 = pInten_modifer_table[3]; - pDst[3].set(ir + y3, ig + y3, ib + y3, 255); - } - - bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha) - { - const bool diff_flag = block.get_diff_bit(); - const bool flip_flag = block.get_flip_bit(); - const uint32_t table_index0 = block.get_inten_table(0); - const uint32_t table_index1 = block.get_inten_table(1); - - color_rgba subblock_colors0[4]; - color_rgba subblock_colors1[4]; - - if (diff_flag) - { - const uint16_t base_color5 = block.get_base5_color(); - const uint16_t delta_color3 = block.get_delta3_color(); - etc_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0); - - if (!etc_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1)) - return false; - } - else - { - const uint16_t base_color4_0 = 
block.get_base4_color(0); - etc_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0); - - const uint16_t base_color4_1 = block.get_base4_color(1); - etc_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1); - } - - if (preserve_alpha) - { - if (flip_flag) - { - for (uint32_t y = 0; y < 2; y++) - { - pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]); - pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]); - pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]); - pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]); - pDst += 4; - } - - for (uint32_t y = 2; y < 4; y++) - { - pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]); - pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]); - pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]); - pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]); - pDst += 4; - } - } - else - { - for (uint32_t y = 0; y < 4; y++) - { - pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]); - pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]); - pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]); - pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]); - pDst += 4; - } - } - } - else - { - if (flip_flag) - { - // 0000 - // 0000 - // 1111 - // 1111 - for (uint32_t y = 0; y < 2; y++) - { - pDst[0] = subblock_colors0[block.get_selector(0, y)]; - pDst[1] = subblock_colors0[block.get_selector(1, y)]; - pDst[2] = subblock_colors0[block.get_selector(2, y)]; - pDst[3] = subblock_colors0[block.get_selector(3, y)]; - pDst += 4; - } - - for (uint32_t y = 2; y < 4; y++) - { - pDst[0] = subblock_colors1[block.get_selector(0, y)]; - pDst[1] = subblock_colors1[block.get_selector(1, y)]; - pDst[2] = subblock_colors1[block.get_selector(2, y)]; - pDst[3] = subblock_colors1[block.get_selector(3, y)]; - pDst += 4; - } - } - else - { - // 0011 - // 0011 - // 0011 - // 0011 - for (uint32_t y = 0; y < 4; y++) - { - pDst[0] = subblock_colors0[block.get_selector(0, y)]; - pDst[1] = subblock_colors0[block.get_selector(1, y)]; - pDst[2] = subblock_colors1[block.get_selector(2, y)]; - pDst[3] = subblock_colors1[block.get_selector(3, y)]; - pDst += 4; - } - } - } - - return true; - } - - inline int extend_6_to_8(uint32_t n) - { - return (n << 2) | (n >> 4); - } - - inline int extend_7_to_8(uint32_t n) - { - return (n << 1) | (n >> 6); - } - - inline int extend_4_to_8(uint32_t n) - { - return (n << 4) | n; - } - - uint64_t etc_block::evaluate_etc1_error(const color_rgba* pBlock_pixels, bool perceptual, int subblock_index) const - { - color_rgba unpacked_block[16]; - - unpack_etc1(*this, unpacked_block); - - uint64_t total_error = 0; - - if (subblock_index < 0) - { - for (uint32_t i = 0; i < 16; i++) - total_error += color_distance(perceptual, pBlock_pixels[i], unpacked_block[i], false); - } - else - { - const bool flip_bit = get_flip_bit(); - - for (uint32_t i = 0; i < 8; i++) - { - const uint32_t idx = g_etc1_pixel_indices[flip_bit][subblock_index][i]; - - total_error += color_distance(perceptual, pBlock_pixels[idx], unpacked_block[idx], false); - } - } - - return total_error; - } - - void etc_block::get_subblock_pixels(color_rgba* pPixels, int subblock_index) const - { - if (subblock_index < 0) - unpack_etc1(*this, pPixels); - else - { - color_rgba unpacked_block[16]; - - unpack_etc1(*this, unpacked_block); - - const bool flip_bit = get_flip_bit(); - - for (uint32_t i = 0; i < 8; i++) - { - const uint32_t idx = 
g_etc1_pixel_indices[flip_bit][subblock_index][i]; - - pPixels[i] = unpacked_block[idx]; - } - } - } - - bool etc1_optimizer::compute() - { - assert(m_pResult->m_pSelectors); - - if ((m_pParams->m_pForce_selectors) || (m_pParams->m_pEval_solution_override)) - { - assert(m_pParams->m_quality >= cETCQualitySlow); - } - - const uint32_t n = m_pParams->m_num_src_pixels; - - if (m_pParams->m_cluster_fit) - { - if (m_pParams->m_quality == cETCQualityFast) - compute_internal_cluster_fit(4); - else if (m_pParams->m_quality == cETCQualityMedium) - compute_internal_cluster_fit(32); - else if (m_pParams->m_quality == cETCQualitySlow) - compute_internal_cluster_fit(64); - else - compute_internal_cluster_fit(BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE); - } - else - compute_internal_neighborhood(m_br, m_bg, m_bb); - - if (!m_best_solution.m_valid) - { - m_pResult->m_error = UINT32_MAX; - return false; - } - - const uint8_t* pSelectors = &m_best_solution.m_selectors[0]; - -#ifdef BASISU_BUILD_DEBUG - if (m_pParams->m_pEval_solution_override == nullptr) - { - color_rgba block_colors[4]; - m_best_solution.m_coords.get_block_colors(block_colors); - - const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels; - uint64_t actual_error = 0; - for (uint32_t i = 0; i < n; i++) - { - if ((m_pParams->m_perceptual) && (m_pParams->m_quality >= cETCQualitySlow)) - actual_error += color_distance(true, pSrc_pixels[i], block_colors[pSelectors[i]], false); - else - actual_error += color_distance(pSrc_pixels[i], block_colors[pSelectors[i]], false); - } - assert(actual_error == m_best_solution.m_error); - } -#endif - - m_pResult->m_error = m_best_solution.m_error; - - m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color; - m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4; - - m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table; - memcpy(m_pResult->m_pSelectors, pSelectors, n); - m_pResult->m_n = n; - - return true; - } - - void etc1_optimizer::refine_solution(uint32_t max_refinement_trials) - { - // Now we have the input block, the avg. color of the input pixels, a set of trial selector indices, and the block color+intensity index. - // Now, for each component, attempt to refine the current solution by solving a simple linear equation. 
For example, for 4 colors: - // The goal is: - // pixel0 - (block_color+inten_table[selector0]) + pixel1 - (block_color+inten_table[selector1]) + pixel2 - (block_color+inten_table[selector2]) + pixel3 - (block_color+inten_table[selector3]) = 0 - // Rearranging this: - // (pixel0 + pixel1 + pixel2 + pixel3) - (block_color+inten_table[selector0]) - (block_color+inten_table[selector1]) - (block_color+inten_table[selector2]) - (block_color+inten_table[selector3]) = 0 - // (pixel0 + pixel1 + pixel2 + pixel3) - block_color - inten_table[selector0] - block_color-inten_table[selector1] - block_color-inten_table[selector2] - block_color-inten_table[selector3] = 0 - // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - inten_table[selector0] - inten_table[selector1] - inten_table[selector2] - inten_table[selector3] = 0 - // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3]) = 0 - // (pixel0 + pixel1 + pixel2 + pixel3)/4 - block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 = 0 - // block_color = (pixel0 + pixel1 + pixel2 + pixel3)/4 - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 - // So what this means: - // optimal_block_color = avg_input - avg_inten_delta - // So the optimal block color can be computed by taking the average block color and subtracting the current average of the intensity delta. - // Unfortunately, optimal_block_color must then be quantized to 555 or 444 so it's not always possible to improve matters using this formula. - // Also, the above formula is for unclamped intensity deltas. The actual implementation takes into account clamping. - - const uint32_t n = m_pParams->m_num_src_pixels; - - for (uint32_t refinement_trial = 0; refinement_trial < max_refinement_trials; refinement_trial++) - { - const uint8_t* pSelectors = &m_best_solution.m_selectors[0]; - const int* pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table]; - - int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0; - const color_rgba base_color(m_best_solution.m_coords.get_scaled_color()); - for (uint32_t r = 0; r < n; r++) - { - const uint32_t s = *pSelectors++; - const int yd_temp = pInten_table[s]; - // Compute actual delta being applied to each pixel, taking into account clamping. 
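As a rough illustration of the derivation in the comment above (which reduces to `optimal_block_color = avg_input - avg_inten_delta`, requantised to 5 or 4 bits per channel), the following standalone sketch shows that step for one colour component. The helper name `refine_component_5bit` and its parameters are hypothetical and not part of the diff; the real encoder additionally clamps `base + delta` to [0, 255] before accumulating, which this unclamped estimate omits.

```
#include <algorithm>
#include <cstdint>

// Illustrative sketch only: refine one 5-bit base-colour component from the average
// input value and the average intensity delta implied by the current selectors.
// avg_c is the mean of that component over the block (0..255), deltas[] is the
// intensity table, counts[] is how many pixels currently use each selector.
static int refine_component_5bit(float avg_c, const int deltas[4], const int counts[4], int n)
{
    int delta_sum = 0;
    for (int s = 0; s < 4; s++)
        delta_sum += counts[s] * deltas[s];          // unclamped form of the accumulation loop
    const float avg_delta = float(delta_sum) / float(n);
    const float optimal = avg_c - avg_delta;         // optimal_block_color = avg_input - avg_inten_delta
    return std::clamp(int(optimal * 31.0f / 255.0f + 0.5f), 0, 31);  // requantise to 5 bits
}
```

Because the result must be requantised to 555 (or 444), this refinement does not always lower the error, which is why the surrounding code re-evaluates the candidate before accepting it.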
- delta_sum_r += clamp<int>(base_color.r + yd_temp, 0, 255) - base_color.r; - delta_sum_g += clamp<int>(base_color.g + yd_temp, 0, 255) - base_color.g; - delta_sum_b += clamp<int>(base_color.b + yd_temp, 0, 255) - base_color.b; - } - - if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b)) - break; - - const float avg_delta_r_f = static_cast<float>(delta_sum_r) / n; - const float avg_delta_g_f = static_cast<float>(delta_sum_g) / n; - const float avg_delta_b_f = static_cast<float>(delta_sum_b) / n; - const int br1 = clamp<int>(static_cast<uint32_t>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit); - const int bg1 = clamp<int>(static_cast<uint32_t>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit); - const int bb1 = clamp<int>(static_cast<uint32_t>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit); - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Refinement trial %u, avg_delta %f %f %f\n", refinement_trial, avg_delta_r_f, avg_delta_g_f, avg_delta_b_f); -#endif - - if (!evaluate_solution(etc1_solution_coordinates(br1, bg1, bb1, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution)) - break; - - } // refinement_trial - } - - void etc1_optimizer::compute_internal_neighborhood(int scan_r, int scan_g, int scan_b) - { - if (m_best_solution.m_error == 0) - return; - - const uint32_t n = m_pParams->m_num_src_pixels; - const int scan_delta_size = m_pParams->m_scan_delta_size; - - // Scan through a subset of the 3D lattice centered around the avg block color trying each 3D (555 or 444) lattice point as a potential block color. - // Each time a better solution is found try to refine the current solution's block color based of the current selectors and intensity table index. - for (int zdi = 0; zdi < scan_delta_size; zdi++) - { - const int zd = m_pParams->m_pScan_deltas[zdi]; - const int mbb = scan_b + zd; - if (mbb < 0) continue; else if (mbb > m_limit) break; - - for (int ydi = 0; ydi < scan_delta_size; ydi++) - { - const int yd = m_pParams->m_pScan_deltas[ydi]; - const int mbg = scan_g + yd; - if (mbg < 0) continue; else if (mbg > m_limit) break; - - for (int xdi = 0; xdi < scan_delta_size; xdi++) - { - const int xd = m_pParams->m_pScan_deltas[xdi]; - const int mbr = scan_r + xd; - if (mbr < 0) continue; else if (mbr > m_limit) break; - - etc1_solution_coordinates coords(mbr, mbg, mbb, 0, m_pParams->m_use_color4); - - if (!evaluate_solution(coords, m_trial_solution, &m_best_solution)) - continue; - - if (m_pParams->m_refinement) - { - refine_solution((m_pParams->m_quality == cETCQualityFast) ? 2 : (((xd | yd | zd) == 0) ? 
4 : 2)); - } - - } // xdi - } // ydi - } // zdi - } - - void etc1_optimizer::compute_internal_cluster_fit(uint32_t total_perms_to_try) - { - if ((!m_best_solution.m_valid) || ((m_br != m_best_solution.m_coords.m_unscaled_color.r) || (m_bg != m_best_solution.m_coords.m_unscaled_color.g) || (m_bb != m_best_solution.m_coords.m_unscaled_color.b))) - { - evaluate_solution(etc1_solution_coordinates(m_br, m_bg, m_bb, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution); - } - - if ((m_best_solution.m_error == 0) || (!m_best_solution.m_valid)) - return; - - for (uint32_t i = 0; i < total_perms_to_try; i++) - { - int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0; - - const int *pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table]; - const color_rgba base_color(m_best_solution.m_coords.get_scaled_color()); - - const uint8_t *pNum_selectors = g_cluster_fit_order_tab[i].m_v; - - for (uint32_t q = 0; q < 4; q++) - { - const int yd_temp = pInten_table[q]; - - delta_sum_r += pNum_selectors[q] * (clamp<int>(base_color.r + yd_temp, 0, 255) - base_color.r); - delta_sum_g += pNum_selectors[q] * (clamp<int>(base_color.g + yd_temp, 0, 255) - base_color.g); - delta_sum_b += pNum_selectors[q] * (clamp<int>(base_color.b + yd_temp, 0, 255) - base_color.b); - } - - if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b)) - continue; - - const float avg_delta_r_f = static_cast<float>(delta_sum_r) / 8; - const float avg_delta_g_f = static_cast<float>(delta_sum_g) / 8; - const float avg_delta_b_f = static_cast<float>(delta_sum_b) / 8; - - const int br1 = clamp<int>(static_cast<uint32_t>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit); - const int bg1 = clamp<int>(static_cast<uint32_t>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit); - const int bb1 = clamp<int>(static_cast<uint32_t>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit); - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Second refinement trial %u, avg_delta %f %f %f\n", i, avg_delta_r_f, avg_delta_g_f, avg_delta_b_f); -#endif - - evaluate_solution(etc1_solution_coordinates(br1, bg1, bb1, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution); - - if (m_best_solution.m_error == 0) - break; - } - } - - void etc1_optimizer::init(const params& params, results& result) - { - m_pParams = ¶ms; - m_pResult = &result; - - const uint32_t n = m_pParams->m_num_src_pixels; - - m_selectors.resize(n); - m_best_selectors.resize(n); - m_temp_selectors.resize(n); - m_trial_solution.m_selectors.resize(n); - m_best_solution.m_selectors.resize(n); - - m_limit = m_pParams->m_use_color4 ? 
15 : 31; - - vec3F avg_color(0.0f); - - m_luma.resize(n); - m_sorted_luma_indices.resize(n); - m_sorted_luma.resize(n); - - for (uint32_t i = 0; i < n; i++) - { - const color_rgba& c = m_pParams->m_pSrc_pixels[i]; - const vec3F fc(c.r, c.g, c.b); - - avg_color += fc; - - m_luma[i] = static_cast<uint16_t>(c.r + c.g + c.b); - m_sorted_luma_indices[i] = i; - } - avg_color /= static_cast<float>(n); - m_avg_color = avg_color; - - m_br = clamp<int>(static_cast<uint32_t>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit); - m_bg = clamp<int>(static_cast<uint32_t>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit); - m_bb = clamp<int>(static_cast<uint32_t>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit); - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Avg block color: %u %u %u\n", m_br, m_bg, m_bb); -#endif - - if (m_pParams->m_quality <= cETCQualityMedium) - { - indirect_sort(n, &m_sorted_luma_indices[0], &m_luma[0]); - - m_pSorted_luma = &m_sorted_luma[0]; - m_pSorted_luma_indices = &m_sorted_luma_indices[0]; - - for (uint32_t i = 0; i < n; i++) - m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]]; - } - - m_best_solution.m_coords.clear(); - m_best_solution.m_valid = false; - m_best_solution.m_error = UINT64_MAX; - - m_solutions_tried.clear(); - } - - bool etc1_optimizer::evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution) - { - uint32_t k = coords.m_unscaled_color.r | (coords.m_unscaled_color.g << 8) | (coords.m_unscaled_color.b << 16); - if (!m_solutions_tried.insert(k).second) - return false; - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Eval solution: %u %u %u\n", coords.m_unscaled_color.r, coords.m_unscaled_color.g, coords.m_unscaled_color.b); -#endif - - trial_solution.m_valid = false; - - if (m_pParams->m_constrain_against_base_color5) - { - const int dr = (int)coords.m_unscaled_color.r - (int)m_pParams->m_base_color5.r; - const int dg = (int)coords.m_unscaled_color.g - (int)m_pParams->m_base_color5.g; - const int db = (int)coords.m_unscaled_color.b - (int)m_pParams->m_base_color5.b; - - if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax)) - { -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Eval failed due to constraint from %u %u %u\n", m_pParams->m_base_color5.r, m_pParams->m_base_color5.g, m_pParams->m_base_color5.b); -#endif - return false; - } - } - - const color_rgba base_color(coords.get_scaled_color()); - - const uint32_t n = m_pParams->m_num_src_pixels; - assert(trial_solution.m_selectors.size() == n); - - trial_solution.m_error = UINT64_MAX; - - const uint8_t *pSelectors_to_use = m_pParams->m_pForce_selectors; - - for (uint32_t inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++) - { - const int* pInten_table = g_etc1_inten_tables[inten_table]; - - color_rgba block_colors[4]; - for (uint32_t s = 0; s < 4; s++) - { - const int yd = pInten_table[s]; - block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 255); - } - - uint64_t total_error = 0; - - const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels; - for (uint32_t c = 0; c < n; c++) - { - const color_rgba& src_pixel = *pSrc_pixels++; - - uint32_t best_selector_index = 0; - uint32_t best_error = 0; - - if (pSelectors_to_use) - { - best_selector_index = pSelectors_to_use[c]; - best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[best_selector_index], false); - } - else - { - best_error = 
color_distance(m_pParams->m_perceptual, src_pixel, block_colors[0], false); - - uint32_t trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[1], false); - if (trial_error < best_error) - { - best_error = trial_error; - best_selector_index = 1; - } - - trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[2], false); - if (trial_error < best_error) - { - best_error = trial_error; - best_selector_index = 2; - } - - trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[3], false); - if (trial_error < best_error) - { - best_error = trial_error; - best_selector_index = 3; - } - } - - m_temp_selectors[c] = static_cast<uint8_t>(best_selector_index); - - total_error += best_error; - if ((m_pParams->m_pEval_solution_override == nullptr) && (total_error >= trial_solution.m_error)) - break; - } - - if (m_pParams->m_pEval_solution_override) - { - if (!(*m_pParams->m_pEval_solution_override)(total_error, *m_pParams, block_colors, &m_temp_selectors[0], coords)) - return false; - } - - if (total_error < trial_solution.m_error) - { - trial_solution.m_error = total_error; - trial_solution.m_coords.m_inten_table = inten_table; - trial_solution.m_selectors.swap(m_temp_selectors); - trial_solution.m_valid = true; - } - } - trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color; - trial_solution.m_coords.m_color4 = m_pParams->m_use_color4; - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Eval done: %u error: %I64u best error so far: %I64u\n", (trial_solution.m_error < pBest_solution->m_error), trial_solution.m_error, pBest_solution->m_error); -#endif - - bool success = false; - if (pBest_solution) - { - if (trial_solution.m_error < pBest_solution->m_error) - { - *pBest_solution = trial_solution; - success = true; - } - } - - return success; - } - - bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution) - { - uint32_t k = coords.m_unscaled_color.r | (coords.m_unscaled_color.g << 8) | (coords.m_unscaled_color.b << 16); - if (!m_solutions_tried.insert(k).second) - return false; - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Eval solution fast: %u %u %u\n", coords.m_unscaled_color.r, coords.m_unscaled_color.g, coords.m_unscaled_color.b); -#endif - - if (m_pParams->m_constrain_against_base_color5) - { - const int dr = (int)coords.m_unscaled_color.r - (int)m_pParams->m_base_color5.r; - const int dg = (int)coords.m_unscaled_color.g - (int)m_pParams->m_base_color5.g; - const int db = (int)coords.m_unscaled_color.b - (int)m_pParams->m_base_color5.b; - - if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax)) - { - trial_solution.m_valid = false; - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Eval failed due to constraint from %u %u %u\n", m_pParams->m_base_color5.r, m_pParams->m_base_color5.g, m_pParams->m_base_color5.b); -#endif - return false; - } - } - - const color_rgba base_color(coords.get_scaled_color()); - - const uint32_t n = m_pParams->m_num_src_pixels; - assert(trial_solution.m_selectors.size() == n); - - trial_solution.m_error = UINT64_MAX; - - for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table) - { - const int* pInten_table = g_etc1_inten_tables[inten_table]; - - uint32_t block_inten[4]; - color_rgba block_colors[4]; - for (uint32_t s = 0; s < 4; s++) - { - const int yd = pInten_table[s]; - color_rgba block_color(base_color.r + yd, base_color.g + 
yd, base_color.b + yd, 255); - block_colors[s] = block_color; - block_inten[s] = block_color.r + block_color.g + block_color.b; - } - - // evaluate_solution_fast() enforces/assumesd a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors. - // The inputs colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast. - // 0 1 2 3 - // 01 12 23 - const uint32_t block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] }; - - uint64_t total_error = 0; - const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels; - if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0]) - { - if (block_inten[0] > m_pSorted_luma[n - 1]) - { - const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]); - if (min_error >= trial_solution.m_error) - continue; - } - - memset(&m_temp_selectors[0], 0, n); - - for (uint32_t c = 0; c < n; c++) - total_error += color_distance(block_colors[0], pSrc_pixels[c], false); - } - else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2]) - { - if (m_pSorted_luma[0] > block_inten[3]) - { - const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]); - if (min_error >= trial_solution.m_error) - continue; - } - - memset(&m_temp_selectors[0], 3, n); - - for (uint32_t c = 0; c < n; c++) - total_error += color_distance(block_colors[3], pSrc_pixels[c], false); - } - else - { - uint32_t cur_selector = 0, c; - for (c = 0; c < n; c++) - { - const uint32_t y = m_pSorted_luma[c]; - while ((y * 2) >= block_inten_midpoints[cur_selector]) - if (++cur_selector > 2) - goto done; - const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; - m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector); - total_error += color_distance(block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false); - } - done: - while (c < n) - { - const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; - m_temp_selectors[sorted_pixel_index] = 3; - total_error += color_distance(block_colors[3], pSrc_pixels[sorted_pixel_index], false); - ++c; - } - } - - if (total_error < trial_solution.m_error) - { - trial_solution.m_error = total_error; - trial_solution.m_coords.m_inten_table = inten_table; - trial_solution.m_selectors.swap(m_temp_selectors); - trial_solution.m_valid = true; - if (!total_error) - break; - } - } - trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color; - trial_solution.m_coords.m_color4 = m_pParams->m_use_color4; - -#if BASISU_DEBUG_ETC_ENCODER_DEEPER - printf("Eval done: %u error: %I64u best error so far: %I64u\n", (trial_solution.m_error < pBest_solution->m_error), trial_solution.m_error, pBest_solution->m_error); -#endif - - bool success = false; - if (pBest_solution) - { - if (trial_solution.m_error < pBest_solution->m_error) - { - *pBest_solution = trial_solution; - success = true; - } - } - - return success; - } - -} // namespace basisu diff --git a/thirdparty/basis_universal/basisu_pvrtc1_4.cpp b/thirdparty/basis_universal/basisu_pvrtc1_4.cpp deleted file mode 100644 index f0122fcb6c..0000000000 --- a/thirdparty/basis_universal/basisu_pvrtc1_4.cpp +++ /dev/null @@ -1,269 +0,0 @@ -// basisu_pvrtc1_4.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. 
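Before the deleted PVRTC1 code below, a brief aside on the fast path above: the `evaluate_solution_fast()` comment describes classifying each pre-sorted pixel against the midpoints between adjacent block intensities. A condensed sketch of that classification follows; the helper name `classify_selector` and its parameters are hypothetical, not code from the diff.

```
#include <cstdint>

// Sketch of the midpoint classification described in evaluate_solution_fast():
// block_inten[] holds r+g+b of the four candidate block colours (ascending by
// construction), luma2 is 2*(r+g+b) of a source pixel. Doubling the pixel luma
// avoids halving the midpoints, matching the (y * 2) comparisons above.
static uint32_t classify_selector(uint32_t luma2, const uint32_t block_inten[4])
{
    const uint32_t mid01 = block_inten[0] + block_inten[1];
    const uint32_t mid12 = block_inten[1] + block_inten[2];
    const uint32_t mid23 = block_inten[2] + block_inten[3];
    if (luma2 < mid01) return 0;
    if (luma2 < mid12) return 1;
    if (luma2 < mid23) return 2;
    return 3;
}
```

Pre-sorting the pixels by luma is what lets the real loop advance the selector monotonically instead of comparing every pixel against all four block colours.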
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "basisu_pvrtc1_4.h" - -namespace basisu -{ - uint32_t pvrtc4_swizzle_uv(uint32_t width, uint32_t height, uint32_t x, uint32_t y) - { - assert((x < width) && (y < height) && basisu::is_pow2(height) && basisu::is_pow2(width)); - - uint32_t min_d = width, max_v = y; - if (height < width) - { - min_d = height; - max_v = x; - } - - // Interleave the XY LSB's - uint32_t shift_ofs = 0, swizzled = 0; - for (uint32_t s_bit = 1, d_bit = 1; s_bit < min_d; s_bit <<= 1, d_bit <<= 2, ++shift_ofs) - { - if (y & s_bit) swizzled |= d_bit; - if (x & s_bit) swizzled |= (2 * d_bit); - } - - max_v >>= shift_ofs; - - // OR in the rest of the bits from the largest dimension - swizzled |= (max_v << (2 * shift_ofs)); - - return swizzled; - } - - color_rgba pvrtc4_block::get_endpoint(uint32_t endpoint_index, bool unpack) const - { - assert(endpoint_index < 2); - const uint32_t packed = m_endpoints >> (endpoint_index * 16); - - uint32_t r, g, b, a; - if (packed & 0x8000) - { - // opaque 554 or 555 - if (!endpoint_index) - { - r = (packed >> 10) & 31; - g = (packed >> 5) & 31; - b = (packed >> 1) & 15; - - if (unpack) - { - b = (b << 1) | (b >> 3); - } - } - else - { - r = (packed >> 10) & 31; - g = (packed >> 5) & 31; - b = packed & 31; - } - - a = unpack ? 
255 : 7; - } - else - { - // translucent 4433 or 4443 - if (!endpoint_index) - { - a = (packed >> 12) & 7; - r = (packed >> 8) & 15; - g = (packed >> 4) & 15; - b = (packed >> 1) & 7; - - if (unpack) - { - a = (a << 1); - a = (a << 4) | a; - - r = (r << 1) | (r >> 3); - g = (g << 1) | (g >> 3); - b = (b << 2) | (b >> 1); - } - } - else - { - a = (packed >> 12) & 7; - r = (packed >> 8) & 15; - g = (packed >> 4) & 15; - b = packed & 15; - - if (unpack) - { - a = (a << 1); - a = (a << 4) | a; - - r = (r << 1) | (r >> 3); - g = (g << 1) | (g >> 3); - b = (b << 1) | (b >> 3); - } - } - } - - if (unpack) - { - r = (r << 3) | (r >> 2); - g = (g << 3) | (g >> 2); - b = (b << 3) | (b >> 2); - } - - assert((r < 256) && (g < 256) && (b < 256) && (a < 256)); - - return color_rgba(r, g, b, a); - } - - color_rgba pvrtc4_block::get_endpoint_5554(uint32_t endpoint_index) const - { - assert(endpoint_index < 2); - const uint32_t packed = m_endpoints >> (endpoint_index * 16); - - uint32_t r, g, b, a; - if (packed & 0x8000) - { - // opaque 554 or 555 - if (!endpoint_index) - { - r = (packed >> 10) & 31; - g = (packed >> 5) & 31; - b = (packed >> 1) & 15; - - b = (b << 1) | (b >> 3); - } - else - { - r = (packed >> 10) & 31; - g = (packed >> 5) & 31; - b = packed & 31; - } - - a = 15; - } - else - { - // translucent 4433 or 4443 - if (!endpoint_index) - { - a = (packed >> 12) & 7; - r = (packed >> 8) & 15; - g = (packed >> 4) & 15; - b = (packed >> 1) & 7; - - a = a << 1; - - r = (r << 1) | (r >> 3); - g = (g << 1) | (g >> 3); - b = (b << 2) | (b >> 1); - } - else - { - a = (packed >> 12) & 7; - r = (packed >> 8) & 15; - g = (packed >> 4) & 15; - b = packed & 15; - - a = a << 1; - - r = (r << 1) | (r >> 3); - g = (g << 1) | (g >> 3); - b = (b << 1) | (b >> 3); - } - } - - assert((r < 32) && (g < 32) && (b < 32) && (a < 16)); - - return color_rgba(r, g, b, a); - } - - bool pvrtc4_image::get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const - { - assert((x < m_width) && (y < m_height)); - - int block_x0 = (static_cast<int>(x) - 2) >> 2; - int block_x1 = block_x0 + 1; - int block_y0 = (static_cast<int>(y) - 2) >> 2; - int block_y1 = block_y0 + 1; - - block_x0 = posmod(block_x0, m_block_width); - block_x1 = posmod(block_x1, m_block_width); - block_y0 = posmod(block_y0, m_block_height); - block_y1 = posmod(block_y1, m_block_height); - - pColors[0] = interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)); - pColors[3] = interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)); - - if (get_block_uses_transparent_modulation(x >> 2, y >> 2)) - { - for (uint32_t c = 0; c < 4; c++) - { - uint32_t m = (pColors[0][c] + pColors[3][c]) / 2; - pColors[1][c] = static_cast<uint8_t>(m); - pColors[2][c] = static_cast<uint8_t>(m); - } - pColors[2][3] = 0; - return true; - } - - for (uint32_t c = 0; c < 4; c++) - { - pColors[1][c] = static_cast<uint8_t>((pColors[0][c] * 5 + pColors[3][c] * 3) / 8); - pColors[2][c] = static_cast<uint8_t>((pColors[0][c] * 3 + pColors[3][c] * 5) / 8); - } - - return false; - } - - color_rgba pvrtc4_image::get_pixel(uint32_t x, uint32_t y, uint32_t m) const - { - assert((x < m_width) && (y < m_height)); - - int block_x0 = (static_cast<int>(x) - 2) >> 2; - 
int block_x1 = block_x0 + 1; - int block_y0 = (static_cast<int>(y) - 2) >> 2; - int block_y1 = block_y0 + 1; - - block_x0 = posmod(block_x0, m_block_width); - block_x1 = posmod(block_x1, m_block_width); - block_y0 = posmod(block_y0, m_block_height); - block_y1 = posmod(block_y1, m_block_height); - - if (get_block_uses_transparent_modulation(x >> 2, y >> 2)) - { - if (m == 0) - return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)); - else if (m == 3) - return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)); - - color_rgba l(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0))); - color_rgba h(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1))); - - return color_rgba((l[0] + h[0]) / 2, (l[1] + h[1]) / 2, (l[2] + h[2]) / 2, (m == 2) ? 0 : (l[3] + h[3]) / 2); - } - else - { - if (m == 0) - return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)); - else if (m == 3) - return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)); - - color_rgba l(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0))); - color_rgba h(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1))); - - if (m == 2) - return color_rgba((l[0] * 3 + h[0] * 5) / 8, (l[1] * 3 + h[1] * 5) / 8, (l[2] * 3 + h[2] * 5) / 8, (l[3] * 3 + h[3] * 5) / 8); - else - return color_rgba((l[0] * 5 + h[0] * 3) / 8, (l[1] * 5 + h[1] * 3) / 8, (l[2] * 5 + h[2] * 3) / 8, (l[3] * 5 + h[3] * 3) / 8); - } - } - -} // basisu diff --git a/thirdparty/basis_universal/encoder/apg_bmp.c b/thirdparty/basis_universal/encoder/apg_bmp.c new file mode 100644 index 0000000000..ef3d015e40 --- /dev/null +++ b/thirdparty/basis_universal/encoder/apg_bmp.c @@ -0,0 +1,541 @@ +/* +BMP File Reader/Writer Implementation +Anton Gerdelan +Version: 3 +Licence: see apg_bmp.h +C99 +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS 1 +#endif + +#include "apg_bmp.h" +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* Maximum pixel dimensions of width or height of an image. Should accommodate max used in graphics APIs. + NOTE: 65536*65536 is the biggest number storable in 32 bits. 
+ This needs to be multiplied by n_channels so actual memory indices are not uint32 but size_t to avoid overflow. + Note this will crash stb_image_write et al at maximum size which use 32bits, so reduce max size to accom. */ +#define _BMP_MAX_DIMS 65536 +#define _BMP_FILE_HDR_SZ 14 +#define _BMP_MIN_DIB_HDR_SZ 40 +#define _BMP_MIN_HDR_SZ ( _BMP_FILE_HDR_SZ + _BMP_MIN_DIB_HDR_SZ ) +#define _BMP_MAX_IMAGE_FILE_SIZE (1024ULL*1024ULL*1024ULL) + +#pragma pack( push, 1 ) // supported on GCC in addition to individual packing attribs +/* All BMP files, regardless of type, start with this file header */ +typedef struct _bmp_file_header_t { + char file_type[2]; + uint32_t file_sz; + uint16_t reserved1; + uint16_t reserved2; + uint32_t image_data_offset; +} _bmp_file_header_t; + +/* Following the file header is the BMP type header. this is the most commonly used format */ +typedef struct _bmp_dib_BITMAPINFOHEADER_t { + uint32_t this_header_sz; + int32_t w; // in older headers w & h these are shorts and may be unsigned + int32_t h; // + uint16_t n_planes; // must be 1 + uint16_t bpp; // bits per pixel. 1,4,8,16,24,32. + uint32_t compression_method; // 16 and 32-bit images must have a value of 3 here + uint32_t image_uncompressed_sz; // not consistently used in the wild, so ignored here. + int32_t horiz_pixels_per_meter; // not used. + int32_t vert_pixels_per_meter; // not used. + uint32_t n_colours_in_palette; // + uint32_t n_important_colours; // not used. + /* NOTE(Anton) a DIB header may end here at 40-bytes. be careful using sizeof() */ + /* if 'compression' value, above, is set to 3 ie the image is 16 or 32-bit, then these colour channel masks follow the headers. + these are big-endian order bit masks to assign bits of each pixel to different colours. bits used must be contiguous and not overlap. */ + uint32_t bitmask_r; + uint32_t bitmask_g; + uint32_t bitmask_b; +} _bmp_dib_BITMAPINFOHEADER_t; +#pragma pack( pop ) + +typedef enum _bmp_compression_t { + BI_RGB = 0, + BI_RLE8 = 1, + BI_RLE4 = 2, + BI_BITFIELDS = 3, + BI_JPEG = 4, + BI_PNG = 5, + BI_ALPHABITFIELDS = 6, + BI_CMYK = 11, + BI_CMYKRLE8 = 12, + BI_CMYRLE4 = 13 +} _bmp_compression_t; + +/* convenience struct and file->memory function */ +typedef struct _entire_file_t { + void* data; + size_t sz; +} _entire_file_t; + +/* +RETURNS +- true on success. record->data is allocated memory and must be freed by the caller. +- false on any error. Any allocated memory is freed if false is returned */ +static bool _read_entire_file( const char* filename, _entire_file_t* record ) { + FILE* fp = fopen( filename, "rb" ); + if ( !fp ) { return false; } + fseek( fp, 0L, SEEK_END ); + record->sz = (size_t)ftell( fp ); + + // Immediately bail on anything larger than _BMP_MAX_IMAGE_FILE_SIZE. 
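The `#pragma pack( push, 1 )` block above matters because these structs are overlaid directly on the file bytes; with default alignment the `uint32_t` members after the two-character magic would typically be padded and the file header would no longer be 14 bytes. A small self-contained check along those lines, using a hypothetical mirror struct rather than the one from the diff:

```
#include <cstdint>

// Hypothetical mirror of the packed BMP file header above, shown only to
// illustrate why packing is required: 2 + 4 + 2 + 2 + 4 = 14 bytes,
// i.e. the value of _BMP_FILE_HDR_SZ.
#pragma pack(push, 1)
struct BmpFileHeaderSketch {
    char     file_type[2];
    uint32_t file_sz;
    uint16_t reserved1;
    uint16_t reserved2;
    uint32_t image_data_offset;
};
#pragma pack(pop)

static_assert(sizeof(BmpFileHeaderSketch) == 14, "must match the on-disk BMP layout");
```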
+ if (record->sz > _BMP_MAX_IMAGE_FILE_SIZE) { + fclose( fp ); + return false; + } + + record->data = malloc( record->sz ); + if ( !record->data ) { + fclose( fp ); + return false; + } + rewind( fp ); + size_t nr = fread( record->data, record->sz, 1, fp ); + fclose( fp ); + if ( 1 != nr ) { return false; } + return true; +} + +static bool _validate_file_hdr( _bmp_file_header_t* file_hdr_ptr, size_t file_sz ) { + if ( !file_hdr_ptr ) { return false; } + if ( file_hdr_ptr->file_type[0] != 'B' || file_hdr_ptr->file_type[1] != 'M' ) { return false; } + if ( file_hdr_ptr->image_data_offset > file_sz ) { return false; } + return true; +} + +static bool _validate_dib_hdr( _bmp_dib_BITMAPINFOHEADER_t* dib_hdr_ptr, size_t file_sz ) { + if ( !dib_hdr_ptr ) { return false; } + if ( _BMP_FILE_HDR_SZ + dib_hdr_ptr->this_header_sz > file_sz ) { return false; } + if ( ( 32 == dib_hdr_ptr->bpp || 16 == dib_hdr_ptr->bpp ) && ( BI_BITFIELDS != dib_hdr_ptr->compression_method && BI_ALPHABITFIELDS != dib_hdr_ptr->compression_method ) ) { + return false; + } + if ( BI_RGB != dib_hdr_ptr->compression_method && BI_BITFIELDS != dib_hdr_ptr->compression_method && BI_ALPHABITFIELDS != dib_hdr_ptr->compression_method ) { + return false; + } + // NOTE(Anton) using abs() in the if-statement was blowing up on large negative numbers. switched to labs() + if ( 0 == dib_hdr_ptr->w || 0 == dib_hdr_ptr->h || labs( dib_hdr_ptr->w ) > _BMP_MAX_DIMS || labs( dib_hdr_ptr->h ) > _BMP_MAX_DIMS ) { return false; } + + /* NOTE(Anton) if images reliably used n_colours_in_palette we could have done a palette/file size integrity check here. + because some always set 0 then we have to check every palette indexing as we read them */ + return true; +} + +/* NOTE(Anton) this could have ifdef branches on different compilers for the intrinsics versions for perf */ +static uint32_t _bitscan( uint32_t dword ) { + for ( uint32_t i = 0; i < 32; i++ ) { + if ( 1 & dword ) { return i; } + dword = dword >> 1; + } + return (uint32_t)-1; +} + +unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int* n_chans ) { + if ( !filename || !w || !h || !n_chans ) { return NULL; } + + // read in the whole file into memory first - much faster than parsing on-the-fly + _entire_file_t record; + if ( !_read_entire_file( filename, &record ) ) { return NULL; } + if ( record.sz < _BMP_MIN_HDR_SZ ) { + free( record.data ); + return NULL; + } + + // grab and validate the first, file, header + _bmp_file_header_t* file_hdr_ptr = (_bmp_file_header_t*)record.data; + if ( !_validate_file_hdr( file_hdr_ptr, record.sz ) ) { + free( record.data ); + return NULL; + } + + // grad and validate the second, DIB, header + _bmp_dib_BITMAPINFOHEADER_t* dib_hdr_ptr = (_bmp_dib_BITMAPINFOHEADER_t*)( (uint8_t*)record.data + _BMP_FILE_HDR_SZ ); + if ( !_validate_dib_hdr( dib_hdr_ptr, record.sz ) ) { + free( record.data ); + return NULL; + } + + // bitmaps can have negative dims to indicate the image should be flipped + uint32_t width = *w = abs( dib_hdr_ptr->w ); + uint32_t height = *h = abs( dib_hdr_ptr->h ); + + // TODO(Anton) flip image memory at the end if this is true. because doing it per row was making me write bugs. + // bool vertically_flip = dib_hdr_ptr->h > 0 ? 
false : true; + + // channel count and palette are not well defined in the header so we make a good guess here + uint32_t n_dst_chans = 3, n_src_chans = 3; + bool has_palette = false; + switch ( dib_hdr_ptr->bpp ) { + case 32: n_dst_chans = n_src_chans = 4; break; // technically can be RGB but not supported + case 24: n_dst_chans = n_src_chans = 3; break; // technically can be RGBA but not supported + case 8: // seems to always use a BGR0 palette, even for greyscale + n_dst_chans = 3; + has_palette = true; + n_src_chans = 1; + break; + case 4: // always has a palette - needed for a MS-saved BMP + n_dst_chans = 3; + has_palette = true; + n_src_chans = 1; + break; + case 1: // 1-bpp means the palette has 3 colour channels with 2 colours i.e. monochrome but not always black & white + n_dst_chans = 3; + has_palette = true; + n_src_chans = 1; + break; + default: // this includes 2bpp and 16bpp + free( record.data ); + return NULL; + } // endswitch + *n_chans = n_dst_chans; + // NOTE(Anton) some image formats are not allowed a palette - could check for a bad header spec here also + if ( dib_hdr_ptr->n_colours_in_palette > 0 ) { has_palette = true; } + +#ifdef APG_BMP_DEBUG_OUTPUT + printf( "apg_bmp_debug: reading image\n|-filename `%s`\n|-dims %ux%u pixels\n|-bpp %u\n|-n_src_chans %u\n|-n_dst_chans %u\n", filename, *w, *h, + dib_hdr_ptr->bpp, n_src_chans, n_dst_chans ); +#endif + + uint32_t palette_offset = _BMP_FILE_HDR_SZ + dib_hdr_ptr->this_header_sz; + bool has_bitmasks = false; + if ( BI_BITFIELDS == dib_hdr_ptr->compression_method || BI_ALPHABITFIELDS == dib_hdr_ptr->compression_method ) { + has_bitmasks = true; + palette_offset += 12; + } + if ( palette_offset > record.sz ) { + free( record.data ); + return NULL; + } + + // work out if any padding how much to skip at end of each row + uint32_t unpadded_row_sz = width * n_src_chans; + // bit-encoded palette indices have different padding properties + if ( 4 == dib_hdr_ptr->bpp ) { + unpadded_row_sz = width % 2 > 0 ? width / 2 + 1 : width / 2; // find how many whole bytes required for this bit width + } + if ( 1 == dib_hdr_ptr->bpp ) { + unpadded_row_sz = width % 8 > 0 ? width / 8 + 1 : width / 8; // find how many whole bytes required for this bit width + } + uint32_t row_padding_sz = 0 == unpadded_row_sz % 4 ? 0 : 4 - ( unpadded_row_sz % 4 ); // NOTE(Anton) didn't expect operator precedence of - over % + + // another file size integrity check: partially validate source image data size + // 'image_data_offset' is by row padded to 4 bytes and is either colour data or palette indices. + if ( file_hdr_ptr->image_data_offset + ( unpadded_row_sz + row_padding_sz ) * height > record.sz ) { + free( record.data ); + return NULL; + } + + // find which bit number each colour channel starts at, so we can separate colours out + uint32_t bitshift_rgba[4] = {0, 0, 0, 0}; // NOTE(Anton) noticed this was int and not uint32_t so changed it. 17 Mar 2020 + uint32_t bitmask_a = 0; + if ( has_bitmasks ) { + bitmask_a = ~( dib_hdr_ptr->bitmask_r | dib_hdr_ptr->bitmask_g | dib_hdr_ptr->bitmask_b ); + bitshift_rgba[0] = _bitscan( dib_hdr_ptr->bitmask_r ); + bitshift_rgba[1] = _bitscan( dib_hdr_ptr->bitmask_g ); + bitshift_rgba[2] = _bitscan( dib_hdr_ptr->bitmask_b ); + bitshift_rgba[3] = _bitscan( bitmask_a ); + } + + // allocate memory for the output pixels block. 
cast to size_t in case width and height are both the max of 65536 and n_dst_chans > 1 + unsigned char* dst_img_ptr = malloc( (size_t)width * (size_t)height * (size_t)n_dst_chans ); + if ( !dst_img_ptr ) { + free( record.data ); + return NULL; + } + + uint8_t* palette_data_ptr = (uint8_t*)record.data + palette_offset; + uint8_t* src_img_ptr = (uint8_t*)record.data + file_hdr_ptr->image_data_offset; + size_t dst_stride_sz = width * n_dst_chans; + + // == 32-bpp -> 32-bit RGBA. == 32-bit and 16-bit require bitmasks + if ( 32 == dib_hdr_ptr->bpp ) { + // check source image has enough data in it to read from + if ( (size_t)file_hdr_ptr->image_data_offset + (size_t)height * (size_t)width * (size_t)n_src_chans > record.sz ) { + free( record.data ); + free( dst_img_ptr ); + return NULL; + } + size_t src_byte_idx = 0; + for ( uint32_t r = 0; r < height; r++ ) { + size_t dst_pixels_idx = r * dst_stride_sz; + for ( uint32_t c = 0; c < width; c++ ) { + uint32_t pixel; + memcpy( &pixel, &src_img_ptr[src_byte_idx], 4 ); + // NOTE(Anton) the below assumes 32-bits is always RGBA 1 byte per channel. 10,10,10 RGB exists though and isn't handled. + dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_r ) >> bitshift_rgba[0] ); + dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_g ) >> bitshift_rgba[1] ); + dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_b ) >> bitshift_rgba[2] ); + dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & bitmask_a ) >> bitshift_rgba[3] ); + src_byte_idx += 4; + } + src_byte_idx += row_padding_sz; + } + + // == 8-bpp -> 24-bit RGB == + } else if ( 8 == dib_hdr_ptr->bpp && has_palette ) { + // validate indices (body of image data) fits in file + if ( file_hdr_ptr->image_data_offset + height * width > record.sz ) { + free( record.data ); + free( dst_img_ptr ); + return NULL; + } + size_t src_byte_idx = 0; + for ( uint32_t r = 0; r < height; r++ ) { + size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz; + for ( uint32_t c = 0; c < width; c++ ) { + // "most palettes are 4 bytes in RGB0 order but 3 for..." 
- it was actually BRG0 in old images -- Anton + uint8_t index = src_img_ptr[src_byte_idx]; // 8-bit index value per pixel + + if ( palette_offset + index * 4 + 2 >= record.sz ) { + free( record.data ); + return dst_img_ptr; + } + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 2]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 1]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 0]; + src_byte_idx++; + } + src_byte_idx += row_padding_sz; + } + + // == 4-bpp (16-colour) -> 24-bit RGB == + } else if ( 4 == dib_hdr_ptr->bpp && has_palette ) { + size_t src_byte_idx = 0; + for ( uint32_t r = 0; r < height; r++ ) { + size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz; + for ( uint32_t c = 0; c < width; c++ ) { + if ( file_hdr_ptr->image_data_offset + src_byte_idx > record.sz ) { + free( record.data ); + free( dst_img_ptr ); + return NULL; + } + // handle 2 pixels at a time + uint8_t pixel_duo = src_img_ptr[src_byte_idx]; + uint8_t a_index = ( 0xFF & pixel_duo ) >> 4; + uint8_t b_index = 0xF & pixel_duo; + + if ( palette_offset + a_index * 4 + 2 >= record.sz ) { // invalid src image + free( record.data ); + return dst_img_ptr; + } + if ( dst_pixels_idx + 3 > width * height * n_dst_chans ) { // done + free( record.data ); + return dst_img_ptr; + } + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 2]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 1]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 0]; + if ( ++c >= width ) { // advance a column + c = 0; + r++; + if ( r >= height ) { // done. no need to get second pixel. eg a 1x1 pixel image. + free( record.data ); + return dst_img_ptr; + } + dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz; + } + + if ( palette_offset + b_index * 4 + 2 >= record.sz ) { // invalid src image + free( record.data ); + return dst_img_ptr; + } + if ( dst_pixels_idx + 3 > width * height * n_dst_chans ) { // done. probably redundant check since checking r >= height. + free( record.data ); + return dst_img_ptr; + } + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 2]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 1]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 0]; + src_byte_idx++; + } + src_byte_idx += row_padding_sz; + } + + // == 1-bpp -> 24-bit RGB == + } else if ( 1 == dib_hdr_ptr->bpp && has_palette ) { + /* encoding method for monochrome is not well documented. + a 2x2 pixel image is stored as 4 1-bit palette indexes + the palette is stored as any 2 RGB0 colours (not necessarily B&W) + so for an image with indexes like so: + 1 1 + 0 1 + it is bit-encoded as follows, starting at MSB: + 01000000 00000000 00000000 00000000 (first byte val 64) + 11000000 00000000 00000000 00000000 (first byte val 192) + data is still split by row and each row padded to 4 byte multiples + */ + size_t src_byte_idx = 0; + for ( uint32_t r = 0; r < height; r++ ) { + uint8_t bit_idx = 0; // used in monochrome + size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz; + for ( uint32_t c = 0; c < width; c++ ) { + if ( 8 == bit_idx ) { // start reading from the next byte + src_byte_idx++; + bit_idx = 0; + } + if ( file_hdr_ptr->image_data_offset + src_byte_idx > record.sz ) { + free( record.data ); + return dst_img_ptr; + } + uint8_t pixel_oct = src_img_ptr[src_byte_idx]; + uint8_t bit = 128 >> bit_idx; + uint8_t masked = pixel_oct & bit; + uint8_t palette_idx = masked > 0 ? 
1 : 0; + + if ( palette_offset + palette_idx * 4 + 2 >= record.sz ) { + free( record.data ); + return dst_img_ptr; + } + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 2]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 1]; + dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 0]; + bit_idx++; + } + src_byte_idx += ( row_padding_sz + 1 ); // 1bpp is special here + } + + // == 24-bpp -> 24-bit RGB == (but also should handle some other n_chans cases) + } else { + // NOTE(Anton) this only supports 1 byte per channel + if ( file_hdr_ptr->image_data_offset + height * width * n_dst_chans > record.sz ) { + free( record.data ); + free( dst_img_ptr ); + return NULL; + } + size_t src_byte_idx = 0; + for ( uint32_t r = 0; r < height; r++ ) { + size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz; + for ( uint32_t c = 0; c < width; c++ ) { + // re-orders from BGR to RGB + if ( n_dst_chans > 3 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 3]; } + if ( n_dst_chans > 2 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 2]; } + if ( n_dst_chans > 1 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 1]; } + dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx]; + src_byte_idx += n_src_chans; + } + src_byte_idx += row_padding_sz; + } + } // endif bpp + + free( record.data ); + return dst_img_ptr; +} + +void apg_bmp_free( unsigned char* pixels_ptr ) { + if ( !pixels_ptr ) { return; } + free( pixels_ptr ); +} + +unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int w, int h, unsigned int n_chans ) { + if ( !filename || !pixels_ptr ) { return 0; } + if ( 0 == w || 0 == h ) { return 0; } + if ( labs( w ) > _BMP_MAX_DIMS || labs( h ) > _BMP_MAX_DIMS ) { return 0; } + if ( n_chans != 3 && n_chans != 4 ) { return 0; } + + uint32_t height = (uint32_t)labs( h ); + uint32_t width = (uint32_t)labs( w ); + // work out if any padding how much to skip at end of each row + const size_t unpadded_row_sz = width * n_chans; + const size_t row_padding_sz = 0 == unpadded_row_sz % 4 ? 0 : 4 - unpadded_row_sz % 4; + const size_t row_sz = unpadded_row_sz + row_padding_sz; + const size_t dst_pixels_padded_sz = row_sz * height; + + const size_t dib_hdr_sz = sizeof( _bmp_dib_BITMAPINFOHEADER_t ); + _bmp_file_header_t file_hdr; + { + file_hdr.file_type[0] = 'B'; + file_hdr.file_type[1] = 'M'; + file_hdr.file_sz = _BMP_FILE_HDR_SZ + (uint32_t)dib_hdr_sz + (uint32_t)dst_pixels_padded_sz; + file_hdr.reserved1 = 0; + file_hdr.reserved2 = 0; + file_hdr.image_data_offset = _BMP_FILE_HDR_SZ + (uint32_t)dib_hdr_sz; + } + _bmp_dib_BITMAPINFOHEADER_t dib_hdr; + { + dib_hdr.this_header_sz = _BMP_MIN_DIB_HDR_SZ; // NOTE: must be 40 and not include the bitmask memory in size here + dib_hdr.w = w; + dib_hdr.h = h; + dib_hdr.n_planes = 1; + dib_hdr.bpp = 3 == n_chans ? 24 : 32; + dib_hdr.compression_method = 3 == n_chans ? BI_RGB : BI_BITFIELDS; + dib_hdr.image_uncompressed_sz = 0; + dib_hdr.horiz_pixels_per_meter = 0; + dib_hdr.vert_pixels_per_meter = 0; + dib_hdr.n_colours_in_palette = 0; + dib_hdr.n_important_colours = 0; + // big-endian masks. 
only used in BI_BITFIELDS and BI_ALPHABITFIELDS ( 16 and 32-bit images ) + // important note: GIMP stores BMP data in this array order for 32-bit: [A][B][G][R] + dib_hdr.bitmask_r = 0xFF000000; + dib_hdr.bitmask_g = 0x00FF0000; + dib_hdr.bitmask_b = 0x0000FF00; + } + + uint8_t* dst_pixels_ptr = malloc( dst_pixels_padded_sz ); + if ( !dst_pixels_ptr ) { return 0; } + { + size_t dst_byte_idx = 0; + uint8_t padding[4] = {0, 0, 0, 0}; + uint8_t rgba[4] = {0, 0, 0, 0}; + uint8_t bgra[4] = {0, 0, 0, 0}; + + for ( uint32_t row = 0; row < height; row++ ) { + size_t src_byte_idx = ( height - 1 - row ) * n_chans * width; + for ( uint32_t col = 0; col < width; col++ ) { + for ( uint32_t chan = 0; chan < n_chans; chan++ ) { rgba[chan] = pixels_ptr[src_byte_idx++]; } + if ( 3 == n_chans ) { + bgra[0] = rgba[2]; + bgra[1] = rgba[1]; + bgra[2] = rgba[0]; + } else { + /* NOTE(Anton) RGBA with alpha channel would be better supported with an extended DIB header */ + bgra[0] = rgba[3]; + bgra[1] = rgba[2]; + bgra[2] = rgba[1]; + bgra[3] = rgba[0]; // alpha + } + memcpy( &dst_pixels_ptr[dst_byte_idx], bgra, n_chans ); + dst_byte_idx += (size_t)n_chans; + } // endfor col + if ( row_padding_sz > 0 ) { + memcpy( &dst_pixels_ptr[dst_byte_idx], padding, row_padding_sz ); + dst_byte_idx += row_padding_sz; + } + } // endfor row + } + { + FILE* fp = fopen( filename, "wb" ); + if ( !fp ) { + free( dst_pixels_ptr ); + return 0; + } + if ( 1 != fwrite( &file_hdr, _BMP_FILE_HDR_SZ, 1, fp ) ) { + free( dst_pixels_ptr ); + fclose( fp ); + return 0; + } + if ( 1 != fwrite( &dib_hdr, dib_hdr_sz, 1, fp ) ) { + free( dst_pixels_ptr ); + fclose( fp ); + return 0; + } + if ( 1 != fwrite( dst_pixels_ptr, dst_pixels_padded_sz, 1, fp ) ) { + free( dst_pixels_ptr ); + fclose( fp ); + return 0; + } + fclose( fp ); + } + free( dst_pixels_ptr ); + + return 1; +} diff --git a/thirdparty/basis_universal/encoder/apg_bmp.h b/thirdparty/basis_universal/encoder/apg_bmp.h new file mode 100644 index 0000000000..8cd73b62e0 --- /dev/null +++ b/thirdparty/basis_universal/encoder/apg_bmp.h @@ -0,0 +1,123 @@ +/* +BMP File Reader/Writer Implementation +Anton Gerdelan +Version: 3.1 18 March 2020. +Licence: see bottom of file. +C89 ( Implementation is C99 ) + +Contributors: +- Anton Gerdelan - Initial code. +- Saija Sorsa - Fuzz testing. + +Instructions: +- Just drop this header, and the matching .c file into your project. +- To get debug printouts during parsing define APG_BMP_DEBUG_OUTPUT. + +Advantages: +- The implementation is fast, simple, and supports more formats than most BMP reader libraries. +- The reader function is fuzzed with AFL https://lcamtuf.coredump.cx/afl/. +- The reader is robust to large files and malformed files, and will return any valid partial data in an image. +- Reader supports 32bpp (with alpha channel), 24bpp, 8bpp, 4bpp, and 1bpp monochrome BMP images. +- Reader handles indexed BMP images using a colour palette. +- Writer supports 32bpp RGBA and 24bpp uncompressed RGB images. + +Current Limitations: +- 16-bit images not supported (don't have any samples to test on). +- No support for interleaved channel bit layouts eg RGB101010 RGB555 RGB565. +- No support for compressed BMP images, although in practice these are not used. +- Output images with alpha channel are written in BITMAPINFOHEADER format. + For better alpha support in other apps the 124-bit v5 header could be used instead, + at the cost of some backward compatibility and bloat. + +To Do: +- FUZZING + - create a unique fuzz test set for (8,4,1 BPP). 
+- (maybe) FEATURE Flipping the image based on negative width and height in header, and/or function arguments. +- (maybe) PERF ifdef intrinsics/asm for bitscan. Platform-specific code so won't include unless necessary. +- (maybe) FEATURE Add parameter for padding output memory to eg 4-byte alignment or n channels. +- (maybe) FEATURE Improved apps support in alpha channel writing (using v5 header). +*/ + +#ifndef APG_BMP_H_ +#define APG_BMP_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* CPP */ + +/* Reads a bitmap from a file, allocates memory for the raw image data, and returns it. +PARAMS + * w,h, - Retrieves the width and height of the BMP in pixels. + * n_chans - Retrieves the number of channels in the BMP. +RETURNS + * Tightly-packed pixel memory in RGBA order. The caller must call free() on the memory. + * NULL on any error. Any allocated memory is freed before returning NULL. */ +unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int* n_chans ); + +/* Calls free() on memory created by apg_bmp_read */ +void apg_bmp_free( unsigned char* pixels_ptr ); + +/* Writes a bitmap to a file. +PARAMS + * filename - e.g."my_bitmap.bmp". Must not be NULL. + * pixels_ptr - Pointer to tightly-packed pixel memory in RGBA order. Must not be NULL. There must be abs(w)*abs(h)*n_chans bytes in the memory pointed to. + * w,h, - Width and height of the image in pixels. + * n_chans - The number of channels in the BMP. 3 or 4 supported for writing, which means RGB or RGBA memory, respectively. +RETURNS + * Zero on any error, non zero on success. */ +unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int w, int h, unsigned int n_chans ); + +#ifdef __cplusplus +} +#endif /* CPP */ + +#endif /*_APG_BMP_H_ */ + +/* +------------------------------------------------------------------------------------- +This software is available under two licences - you may use it under either licence. +------------------------------------------------------------------------------------- +FIRST LICENCE OPTION + +> Apache License +> Version 2.0, January 2004 +> http://www.apache.org/licenses/ +> Copyright 2019 Anton Gerdelan. +> Licensed under the Apache License, Version 2.0 (the "License"); +> you may not use this file except in compliance with the License. +> You may obtain a copy of the License at +> http://www.apache.org/licenses/LICENSE-2.0 +> Unless required by applicable law or agreed to in writing, software +> distributed under the License is distributed on an "AS IS" BASIS, +> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +> See the License for the specific language governing permissions and +> limitations under the License. +------------------------------------------------------------------------------------- +SECOND LICENCE OPTION + +> This is free and unencumbered software released into the public domain. +> +> Anyone is free to copy, modify, publish, use, compile, sell, or +> distribute this software, either in source code form or as a compiled +> binary, for any purpose, commercial or non-commercial, and by any +> means. +> +> In jurisdictions that recognize copyright laws, the author or authors +> of this software dedicate any and all copyright interest in the +> software to the public domain. We make this dedication for the benefit +> of the public at large and to the detriment of our heirs and +> successors. 
We intend this dedication to be an overt act of +> relinquishment in perpetuity of all present and future rights to this +> software under copyright law. +> +> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +> IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +> OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +> ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +> OTHER DEALINGS IN THE SOFTWARE. +> +> For more information, please refer to <http://unlicense.org> +------------------------------------------------------------------------------------- +*/ diff --git a/thirdparty/basis_universal/basisu_astc_decomp.cpp b/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp index cc0a6ced7a..53bccfc515 100644 --- a/thirdparty/basis_universal/basisu_astc_decomp.cpp +++ b/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp @@ -50,6 +50,13 @@ typedef uint64_t deUint64; #define DE_ASSERT assert +#ifdef _MSC_VER +#pragma warning (disable:4505) // unreferenced local function has been removed +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + namespace basisu_astc { static bool inBounds(int v, int l, int h) @@ -150,7 +157,7 @@ namespace basisu_astc UVec4 asUint() const { - return UVec4(std::max(0, m_c[0]), std::max(0, m_c[1]), std::max(0, m_c[2]), std::max(0, m_c[3])); + return UVec4(basisu::maximum(0, m_c[0]), basisu::maximum(0, m_c[1]), basisu::maximum(0, m_c[2]), basisu::maximum(0, m_c[3])); } int32_t operator[] (uint32_t idx) const { assert(idx < 4); return m_c[idx]; } @@ -1256,7 +1263,7 @@ void interpolateWeights (TexelWeightPair* dst, const deUint32 (&unquantizedWeigh const int numWeightsPerTexel = blockMode.isDualPlane ? 2 : 1; const deUint32 scaleX = (1024 + blockWidth/2) / (blockWidth-1); const deUint32 scaleY = (1024 + blockHeight/2) / (blockHeight-1); - DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= DE_LENGTH_OF_ARRAY(unquantizedWeights)); + DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= (int)DE_LENGTH_OF_ARRAY(unquantizedWeights)); for (int texelY = 0; texelY < blockHeight; texelY++) { for (int texelX = 0; texelX < blockWidth; texelX++) @@ -1548,3 +1555,7 @@ bool decompress(uint8_t *pDst, const uint8_t * data, bool isSRGB, int blockWidth } // astc } // basisu_astc + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/thirdparty/basis_universal/basisu_astc_decomp.h b/thirdparty/basis_universal/encoder/basisu_astc_decomp.h index 6cd053b7b6..9ec2e46076 100644 --- a/thirdparty/basis_universal/basisu_astc_decomp.h +++ b/thirdparty/basis_universal/encoder/basisu_astc_decomp.h @@ -23,7 +23,7 @@ * \brief ASTC Utilities. 
*//*--------------------------------------------------------------------*/ -#include "transcoder/basisu.h" // to pick up the iterator debug level madness +#include "../transcoder/basisu.h" // to pick up the iterator debug level madness #include <vector> #include <stdint.h> diff --git a/thirdparty/basis_universal/basisu_backend.cpp b/thirdparty/basis_universal/encoder/basisu_backend.cpp index 3a689e58d7..19911fcbb4 100644 --- a/thirdparty/basis_universal/basisu_backend.cpp +++ b/thirdparty/basis_universal/encoder/basisu_backend.cpp @@ -1,5 +1,5 @@ // basisu_backend.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,6 +17,11 @@ // #include "basisu_backend.h" +#if BASISU_SUPPORT_SSE +#define CPPSPMD_NAME(a) a##_sse41 +#include "basisu_kernels_declares.h" +#endif + #define BASISU_FASTER_SELECTOR_REORDERING 0 #define BASISU_BACKEND_VERIFY(c) verify(c, __LINE__); @@ -176,64 +181,117 @@ namespace basisu void basisu_backend::reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec& all_endpoint_indices) { basisu_frontend& r = *m_pFront_end; - const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; + //const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; - if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0)) + if (m_params.m_used_global_codebooks) { - // We're changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed) - uint_vec new_block_endpoints(get_total_blocks()); - - for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) + m_endpoint_remap_table_old_to_new.clear(); + m_endpoint_remap_table_old_to_new.resize(r.get_total_endpoint_clusters()); + for (uint32_t i = 0; i < r.get_total_endpoint_clusters(); i++) + m_endpoint_remap_table_old_to_new[i] = i; + } + else + { + //if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0)) + if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 1)) { - const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; - const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; - const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; + // We've changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed) + uint_vec new_block_endpoints(get_total_blocks()); - for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) - for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) - new_block_endpoints[first_block_index + block_x + block_y * num_blocks_x] = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index; - } + for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) + { + const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; + const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; + const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; - int_vec old_to_new_endpoint_indices; - r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true); + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + for (uint32_t block_x = 0; block_x 
< num_blocks_x; block_x++) + new_block_endpoints[first_block_index + block_x + block_y * num_blocks_x] = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index; + } - create_endpoint_palette(); + int_vec old_to_new_endpoint_indices; + r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true); - for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) - { - const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; + create_endpoint_palette(); - const uint32_t width = m_slices[slice_index].m_width; - const uint32_t height = m_slices[slice_index].m_height; - const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; - const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; - - for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) { - for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + //const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; + + //const uint32_t width = m_slices[slice_index].m_width; + //const uint32_t height = m_slices[slice_index].m_height; + const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; + const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; + + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) { - const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; + for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + { + //const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; - encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); + encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); - m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index]; - } // block_x - } // block_y - } // slice_index + m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index]; + } // block_x + } // block_y + } // slice_index + + for (uint32_t i = 0; i < all_endpoint_indices.size(); i++) + all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]]; + + } //if (total_block_endpoints_remapped) + + // Sort endpoint codebook + palette_index_reorderer reorderer; + reorderer.init((uint32_t)all_endpoint_indices.size(), &all_endpoint_indices[0], r.get_total_endpoint_clusters(), nullptr, nullptr, 0); + m_endpoint_remap_table_old_to_new = reorderer.get_remap_table(); + } - for (uint32_t i = 0; i < all_endpoint_indices.size(); i++) - all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]]; + // For endpoints, old_to_new[] may not be bijective! + // Some "old" entries may be unused and don't get remapped into the "new" array. 
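+ // Illustrative note (added for clarity; not from upstream basisu): for example, if the
+ // encoder blocks only ever reference old endpoint clusters {0, 1, 3}, old cluster 2 has
+ // no meaningful slot in the reordered "new" palette. The flags gathered below record
+ // which old entries are really referenced, so the new->old table built afterwards can
+ // point every unused new slot at a known-good old entry (first_old_entry_index).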
- } //if (total_block_endpoints_remapped) + m_old_endpoint_was_used.clear(); + m_old_endpoint_was_used.resize(r.get_total_endpoint_clusters()); + uint32_t first_old_entry_index = UINT32_MAX; - // Sort endpoint codebook - palette_index_reorderer reorderer; - reorderer.init((uint32_t)all_endpoint_indices.size(), &all_endpoint_indices[0], r.get_total_endpoint_clusters(), nullptr, nullptr, 0); - m_endpoint_remap_table_old_to_new = reorderer.get_remap_table(); + for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) + { + const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y; + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + { + for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + { + encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); + const uint32_t old_endpoint_index = m.m_endpoint_index; + m_old_endpoint_was_used[old_endpoint_index] = true; + first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index); + } // block_x + } // block_y + } // slice_index + + debug_printf("basisu_backend::reoptimize_and_sort_endpoints_codebook: First old entry index: %u\n", first_old_entry_index); + + m_new_endpoint_was_used.clear(); + m_new_endpoint_was_used.resize(r.get_total_endpoint_clusters()); + + m_endpoint_remap_table_new_to_old.clear(); m_endpoint_remap_table_new_to_old.resize(r.get_total_endpoint_clusters()); - for (uint32_t i = 0; i < m_endpoint_remap_table_old_to_new.size(); i++) - m_endpoint_remap_table_new_to_old[m_endpoint_remap_table_old_to_new[i]] = i; + + // Set unused entries in the new array to point to the first used entry in the old array. + m_endpoint_remap_table_new_to_old.set_all(first_old_entry_index); + + for (uint32_t old_index = 0; old_index < m_endpoint_remap_table_old_to_new.size(); old_index++) + { + if (m_old_endpoint_was_used[old_index]) + { + const uint32_t new_index = m_endpoint_remap_table_old_to_new[old_index]; + + m_new_endpoint_was_used[new_index] = true; + + m_endpoint_remap_table_new_to_old[new_index] = old_index; + } + } } void basisu_backend::sort_selector_codebook() @@ -242,7 +300,7 @@ namespace basisu m_selector_remap_table_new_to_old.resize(r.get_total_selector_clusters()); - if (m_params.m_compression_level == 0) + if ((m_params.m_compression_level == 0) || (m_params.m_used_global_codebooks)) { for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++) m_selector_remap_table_new_to_old[i] = i; @@ -336,10 +394,10 @@ namespace basisu for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) { const bool is_iframe = m_slices[slice_index].m_iframe; - const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; + //const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; - const uint32_t width = m_slices[slice_index].m_width; - const uint32_t height = m_slices[slice_index].m_height; + //const uint32_t width = m_slices[slice_index].m_width; + //const uint32_t height = m_slices[slice_index].m_height; const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; const int prev_frame_slice_index = find_video_frame(slice_index, -1); @@ -393,6 +451,7 @@ namespace basisu BASISU_BACKEND_VERIFY(total_invalid_crs == 0); } + void basisu_backend::create_encoder_blocks() { basisu_frontend& r = *m_pFront_end; @@ -411,8 +470,8 @@ namespace basisu const bool is_iframe = 
m_slices[slice_index].m_iframe; const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; - const uint32_t width = m_slices[slice_index].m_width; - const uint32_t height = m_slices[slice_index].m_height; + //const uint32_t width = m_slices[slice_index].m_width; + //const uint32_t height = m_slices[slice_index].m_height; const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; @@ -590,7 +649,7 @@ namespace basisu { for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) { - const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; + //const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; const uint32_t width = m_slices[slice_index].m_width; const uint32_t height = m_slices[slice_index].m_height; const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; @@ -603,7 +662,7 @@ namespace basisu { for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) { - const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; + //const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); @@ -662,7 +721,7 @@ namespace basisu histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE + 1); histogram selector_history_buf_rle_histogram(1 << basist::SELECTOR_HISTORY_BUF_RLE_COUNT_BITS); - std::vector<uint_vec> selector_syms(m_slices.size()); + basisu::vector<uint_vec> selector_syms(m_slices.size()); const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = r.get_total_selector_clusters(); const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + basist::MAX_SELECTOR_HISTORY_BUF_SIZE; @@ -672,7 +731,7 @@ namespace basisu histogram delta_endpoint_histogram(r.get_total_endpoint_clusters()); histogram endpoint_pred_histogram(basist::ENDPOINT_PRED_TOTAL_SYMBOLS); - std::vector<uint_vec> endpoint_pred_syms(m_slices.size()); + basisu::vector<uint_vec> endpoint_pred_syms(m_slices.size()); uint32_t total_endpoint_indices_remapped = 0; @@ -680,11 +739,11 @@ namespace basisu for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) { - const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1; - const int next_frame_slice_index = is_video ? find_video_frame(slice_index, 1) : -1; + //const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1; + //const int next_frame_slice_index = is_video ? 
find_video_frame(slice_index, 1) : -1; const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; - const uint32_t width = m_slices[slice_index].m_width; - const uint32_t height = m_slices[slice_index].m_height; + //const uint32_t width = m_slices[slice_index].m_width; + //const uint32_t height = m_slices[slice_index].m_height; const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; @@ -702,7 +761,7 @@ namespace basisu { for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) { - const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; + //const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); @@ -723,6 +782,7 @@ namespace basisu } // block_x } // block_y + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) { for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) @@ -821,6 +881,10 @@ namespace basisu if (trial_idx == new_endpoint_index) continue; + // Skip it if this new endpoint palette entry is actually never used. + if (!m_new_endpoint_was_used[trial_idx]) + continue; + const etc1_endpoint_palette_entry& p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]]; trial_etc_blk.set_block_color5_etc1s(p.m_color5); trial_etc_blk.set_inten_tables_etc1s(p.m_inten5); @@ -884,23 +948,32 @@ namespace basisu { const pixel_block& src_pixels = r.get_source_pixel_block(block_index); - etc_block etc_blk(r.get_output_block(block_index)); + const etc_block& etc_blk = r.get_output_block(block_index); color_rgba etc_blk_unpacked[16]; unpack_etc1(etc_blk, etc_blk_unpacked); uint64_t cur_err = 0; - for (uint32_t p = 0; p < 16; p++) - cur_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false); - + if (r.get_params().m_perceptual) + { + for (uint32_t p = 0; p < 16; p++) + cur_err += color_distance(true, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false); + } + else + { + for (uint32_t p = 0; p < 16; p++) + cur_err += color_distance(false, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false); + } + uint64_t best_trial_err = UINT64_MAX; int best_trial_idx = 0; uint32_t best_trial_history_buf_idx = 0; - const float selector_remap_thresh = maximum(1.0f, m_params.m_selector_rdo_quality_thresh); //2.5f; const bool use_strict_search = (m_params.m_compression_level == 0) && (selector_remap_thresh == 1.0f); + const uint64_t limit_err = (uint64_t)ceilf(cur_err * selector_remap_thresh); + for (uint32_t j = 0; j < selector_history_buf.size(); j++) { const int trial_idx = selector_history_buf[j]; @@ -917,31 +990,43 @@ namespace basisu } else { - for (uint32_t sy = 0; sy < 4; sy++) - for (uint32_t sx = 0; sx < 4; sx++) - etc_blk.set_selector(sx, sy, m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](sx, sy)); + uint64_t trial_err = 0; + const uint64_t thresh_err = minimum(limit_err, best_trial_err); - // TODO: Optimize this - unpack_etc1(etc_blk, etc_blk_unpacked); + color_rgba block_colors[4]; + etc_blk.get_block_colors(block_colors, 0); - uint64_t trial_err = 0; - const uint64_t thresh_err = minimum((uint64_t)ceilf(cur_err * selector_remap_thresh), best_trial_err); - for (uint32_t p = 0; p < 16; p++) + const uint8_t* pSelectors = &m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](0, 0); + + if (r.get_params().m_perceptual) { - trial_err += color_distance(r.get_params().m_perceptual, 
src_pixels.get_ptr()[p], etc_blk_unpacked[p], false); - if (trial_err > thresh_err) - break; + for (uint32_t p = 0; p < 16; p++) + { + uint32_t sel = pSelectors[p]; + trial_err += color_distance(true, src_pixels.get_ptr()[p], block_colors[sel], false); + if (trial_err > thresh_err) + break; + } } - - if (trial_err <= cur_err * selector_remap_thresh) + else { - if (trial_err < best_trial_err) + for (uint32_t p = 0; p < 16; p++) { - best_trial_err = trial_err; - best_trial_idx = trial_idx; - best_trial_history_buf_idx = j; + uint32_t sel = pSelectors[p]; + trial_err += color_distance(false, src_pixels.get_ptr()[p], block_colors[sel], false); + if (trial_err > thresh_err) + break; } } + + if ((trial_err < best_trial_err) && (trial_err <= thresh_err)) + { + assert(trial_err <= limit_err); + + best_trial_err = trial_err; + best_trial_idx = trial_idx; + best_trial_history_buf_idx = j; + } } } @@ -1086,7 +1171,8 @@ namespace basisu total_selector_indices_remapped, total_selector_indices_remapped * 100.0f / get_total_blocks(), total_used_selector_history_buf, total_used_selector_history_buf * 100.0f / get_total_blocks()); - if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0)) + //if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0)) + if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 1) && (!m_params.m_used_global_codebooks)) { int_vec unused; r.reoptimize_remapped_endpoints(block_endpoint_indices, unused, false, &block_selector_indices); @@ -1168,8 +1254,8 @@ namespace basisu for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) { - const uint32_t width = m_slices[slice_index].m_width; - const uint32_t height = m_slices[slice_index].m_height; + //const uint32_t width = m_slices[slice_index].m_width; + //const uint32_t height = m_slices[slice_index].m_height; const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; @@ -1296,10 +1382,53 @@ namespace basisu { const basisu_frontend& r = *m_pFront_end; + // The endpoint indices may have been changed by the backend's RDO step, so go and figure out which ones are actually used again. 
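+ // Note (added for clarity; not from upstream basisu): this repeats the used-entry scan
+ // from reoptimize_and_sort_endpoints_codebook(), but against the block endpoint indices
+ // as they stand after the RDO pass, so the palette is encoded with an up-to-date
+ // new<->old mapping rather than the pre-RDO one.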
+ bool_vec old_endpoint_was_used(r.get_total_endpoint_clusters()); + uint32_t first_old_entry_index = UINT32_MAX; + + for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) + { + const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y; + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + { + for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + { + encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); + const uint32_t old_endpoint_index = m.m_endpoint_index; + + old_endpoint_was_used[old_endpoint_index] = true; + first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index); + } // block_x + } // block_y + } // slice_index + + debug_printf("basisu_backend::encode_endpoint_palette: first_old_entry_index: %u\n", first_old_entry_index); + // Maps NEW to OLD endpoints - uint_vec endpoint_remap_table_inv(r.get_total_endpoint_clusters()); + uint_vec endpoint_remap_table_new_to_old(r.get_total_endpoint_clusters()); + endpoint_remap_table_new_to_old.set_all(first_old_entry_index); + + bool_vec new_endpoint_was_used(r.get_total_endpoint_clusters()); + for (uint32_t old_endpoint_index = 0; old_endpoint_index < m_endpoint_remap_table_old_to_new.size(); old_endpoint_index++) - endpoint_remap_table_inv[m_endpoint_remap_table_old_to_new[old_endpoint_index]] = old_endpoint_index; + { + if (old_endpoint_was_used[old_endpoint_index]) + { + const uint32_t new_endpoint_index = m_endpoint_remap_table_old_to_new[old_endpoint_index]; + + new_endpoint_was_used[new_endpoint_index] = true; + + endpoint_remap_table_new_to_old[new_endpoint_index] = old_endpoint_index; + } + } + + // TODO: Some new endpoint palette entries may actually be unused and aren't worth coding. Fix that. 
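+ // Note (added for clarity; not from upstream basisu): the loop below only counts how
+ // many "new" palette slots ended up unused and reports the figure via debug_printf();
+ // the unused entries are still written to the file, which is what the TODO above
+ // refers to.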
+ + uint32_t total_unused_new_entries = 0; + for (uint32_t i = 0; i < new_endpoint_was_used.size(); i++) + if (!new_endpoint_was_used[i]) + total_unused_new_entries++; + debug_printf("basisu_backend::encode_endpoint_palette: total_unused_new_entries: %u out of %u\n", total_unused_new_entries, new_endpoint_was_used.size()); bool is_grayscale = true; for (uint32_t old_endpoint_index = 0; old_endpoint_index < (uint32_t)m_endpoint_palette.size(); old_endpoint_index++) @@ -1324,7 +1453,7 @@ namespace basisu for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++) { - const uint32_t old_endpoint_index = endpoint_remap_table_inv[new_endpoint_index]; + const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index]; int delta_inten = m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten; inten_delta_hist.inc(delta_inten & 7); @@ -1390,7 +1519,7 @@ namespace basisu for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++) { - const uint32_t old_endpoint_index = endpoint_remap_table_inv[new_endpoint_index]; + const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index]; int delta_inten = (m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten) & 7; coder.put_code(delta_inten, inten_delta_model); @@ -1644,9 +1773,11 @@ namespace basisu uint32_t basisu_backend::encode() { - const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; + //const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; m_output.m_slice_desc = m_slices; m_output.m_etc1s = m_params.m_etc1s; + m_output.m_uses_global_codebooks = m_params.m_used_global_codebooks; + m_output.m_srgb = m_pFront_end->get_params().m_perceptual; create_endpoint_palette(); create_selector_palette(); diff --git a/thirdparty/basis_universal/basisu_backend.h b/thirdparty/basis_universal/encoder/basisu_backend.h index 1c72fa8cc8..393dccd22f 100644 --- a/thirdparty/basis_universal/basisu_backend.h +++ b/thirdparty/basis_universal/encoder/basisu_backend.h @@ -1,5 +1,5 @@ // basisu_backend.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ // limitations under the License. 
#pragma once -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" #include "basisu_enc.h" -#include "transcoder/basisu_transcoder_internal.h" -#include "transcoder/basisu_global_selector_palette.h" +#include "../transcoder/basisu_transcoder_internal.h" +#include "../transcoder/basisu_global_selector_palette.h" #include "basisu_frontend.h" namespace basisu @@ -49,7 +49,7 @@ namespace basisu } }; - typedef std::vector<encoder_block> encoder_block_vec; + typedef basisu::vector<encoder_block> encoder_block_vec; typedef vector2D<encoder_block> encoder_block_vec2D; struct etc1_endpoint_palette_entry @@ -69,7 +69,7 @@ namespace basisu } }; - typedef std::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec; + typedef basisu::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec; struct basisu_backend_params { @@ -84,6 +84,8 @@ namespace basisu uint32_t m_global_sel_codebook_mod_bits; bool m_use_hybrid_sel_codebooks; + bool m_used_global_codebooks; + basisu_backend_params() { clear(); @@ -102,6 +104,7 @@ namespace basisu m_global_sel_codebook_pal_bits = ETC1_GLOBAL_SELECTOR_CODEBOOK_MAX_PAL_BITS; m_global_sel_codebook_mod_bits = basist::etc1_global_palette_entry_modifier::cTotalBits; m_use_hybrid_sel_codebooks = false; + m_used_global_codebooks = false; } }; @@ -111,10 +114,12 @@ namespace basisu { clear(); } + void clear() { clear_obj(*this); } + uint32_t m_first_block_index; uint32_t m_orig_width; @@ -135,11 +140,15 @@ namespace basisu bool m_iframe; }; - typedef std::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec; + typedef basisu::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec; struct basisu_backend_output { + basist::basis_tex_format m_tex_format; + bool m_etc1s; + bool m_uses_global_codebooks; + bool m_srgb; uint32_t m_num_endpoints; uint32_t m_num_selectors; @@ -150,7 +159,7 @@ namespace basisu basisu_backend_slice_desc_vec m_slice_desc; uint8_vec m_slice_image_tables; - std::vector<uint8_vec> m_slice_image_data; + basisu::vector<uint8_vec> m_slice_image_data; uint16_vec m_slice_image_crcs; basisu_backend_output() @@ -160,7 +169,10 @@ namespace basisu void clear() { + m_tex_format = basist::basis_tex_format::cETC1S; m_etc1s = false; + m_uses_global_codebooks = false; + m_srgb = true; m_num_endpoints = 0; m_num_selectors = 0; @@ -198,6 +210,7 @@ namespace basisu uint32_t encode(); const basisu_backend_output &get_output() const { return m_output; } + const basisu_backend_params& get_params() const { return m_params; } private: basisu_frontend *m_pFront_end; @@ -216,15 +229,17 @@ namespace basisu bool m_was_used; }; - typedef std::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec; + typedef basisu::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec; etc1_global_selector_cb_entry_desc_vec m_global_selector_palette_desc; - std::vector<encoder_block_vec2D> m_slice_encoder_blocks; + basisu::vector<encoder_block_vec2D> m_slice_encoder_blocks; // Maps OLD to NEW endpoint/selector indices uint_vec m_endpoint_remap_table_old_to_new; uint_vec m_endpoint_remap_table_new_to_old; + bool_vec m_old_endpoint_was_used; + bool_vec m_new_endpoint_was_used; uint_vec m_selector_remap_table_old_to_new; diff --git a/thirdparty/basis_universal/basisu_basis_file.cpp b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp index 3e6b1906b9..f4c77bef23 100644 --- a/thirdparty/basis_universal/basisu_basis_file.cpp +++ b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp @@ 
-1,5 +1,5 @@ // basisu_basis_file.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "basisu_basis_file.h" -#include "transcoder/basisu_transcoder.h" +#include "../transcoder/basisu_transcoder.h" // The output file version. Keep in sync with BASISD_SUPPORTED_BASIS_VERSION. #define BASIS_FILE_VERSION (0x13) @@ -31,15 +31,26 @@ namespace basisu m_header.m_total_images = 0; for (uint32_t i = 0; i < encoder_output.m_slice_desc.size(); i++) m_header.m_total_images = maximum<uint32_t>(m_header.m_total_images, encoder_output.m_slice_desc[i].m_source_file_index + 1); - - m_header.m_format = 0;// basist::block_format::cETC1; + + m_header.m_tex_format = (int)encoder_output.m_tex_format; m_header.m_flags = 0; if (encoder_output.m_etc1s) + { + assert(encoder_output.m_tex_format == basist::basis_tex_format::cETC1S); m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagETC1S; + } + else + { + assert(encoder_output.m_tex_format != basist::basis_tex_format::cETC1S); + } if (y_flipped) m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagYFlipped; + if (encoder_output.m_uses_global_codebooks) + m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagUsesGlobalCodebook; + if (encoder_output.m_srgb) + m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagSRGB; for (uint32_t i = 0; i < encoder_output.m_slice_desc.size(); i++) { @@ -57,12 +68,26 @@ namespace basisu m_header.m_userdata1 = userdata1; m_header.m_total_endpoints = encoder_output.m_num_endpoints; - m_header.m_endpoint_cb_file_ofs = m_endpoint_cb_file_ofs; - m_header.m_endpoint_cb_file_size = (uint32_t)encoder_output.m_endpoint_palette.size(); + if (!encoder_output.m_uses_global_codebooks) + { + m_header.m_endpoint_cb_file_ofs = m_endpoint_cb_file_ofs; + m_header.m_endpoint_cb_file_size = (uint32_t)encoder_output.m_endpoint_palette.size(); + } + else + { + assert(!m_endpoint_cb_file_ofs); + } m_header.m_total_selectors = encoder_output.m_num_selectors; - m_header.m_selector_cb_file_ofs = m_selector_cb_file_ofs; - m_header.m_selector_cb_file_size = (uint32_t)encoder_output.m_selector_palette.size(); + if (!encoder_output.m_uses_global_codebooks) + { + m_header.m_selector_cb_file_ofs = m_selector_cb_file_ofs; + m_header.m_selector_cb_file_size = (uint32_t)encoder_output.m_selector_palette.size(); + } + else + { + assert(!m_selector_cb_file_ofs); + } m_header.m_tables_file_ofs = m_tables_file_ofs; m_header.m_tables_file_size = (uint32_t)encoder_output.m_slice_image_tables.size(); @@ -85,7 +110,7 @@ namespace basisu m_images_descs[i].m_level_index = slice_descs[i].m_mip_index; if (slice_descs[i].m_alpha) - m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsIsAlphaData; + m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsHasAlpha; if (slice_descs[i].m_iframe) m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsFrameIsIFrame; @@ -127,14 +152,26 @@ namespace basisu assert(m_comp_data.size() == m_slice_descs_file_ofs); append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&m_images_descs[0]), m_images_descs.size() * sizeof(m_images_descs[0])); - assert(m_comp_data.size() == m_endpoint_cb_file_ofs); - 
append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_endpoint_palette[0]), encoder_output.m_endpoint_palette.size()); + if (!encoder_output.m_uses_global_codebooks) + { + if (encoder_output.m_endpoint_palette.size()) + { + assert(m_comp_data.size() == m_endpoint_cb_file_ofs); + append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_endpoint_palette[0]), encoder_output.m_endpoint_palette.size()); + } - assert(m_comp_data.size() == m_selector_cb_file_ofs); - append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_selector_palette[0]), encoder_output.m_selector_palette.size()); + if (encoder_output.m_selector_palette.size()) + { + assert(m_comp_data.size() == m_selector_cb_file_ofs); + append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_selector_palette[0]), encoder_output.m_selector_palette.size()); + } + } - assert(m_comp_data.size() == m_tables_file_ofs); - append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_slice_image_tables[0]), encoder_output.m_slice_image_tables.size()); + if (encoder_output.m_slice_image_tables.size()) + { + assert(m_comp_data.size() == m_tables_file_ofs); + append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_slice_image_tables[0]), encoder_output.m_slice_image_tables.size()); + } assert(m_comp_data.size() == m_first_image_file_ofs); for (uint32_t i = 0; i < slice_descs.size(); i++) @@ -163,8 +200,17 @@ namespace basisu const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc; // The Basis file uses 32-bit fields for lots of stuff, so make sure it's not too large. - uint64_t check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() + + uint64_t check_size = 0; + if (!encoder_output.m_uses_global_codebooks) + { + check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() + (uint64_t)encoder_output.m_endpoint_palette.size() + (uint64_t)encoder_output.m_selector_palette.size() + (uint64_t)encoder_output.m_slice_image_tables.size(); + } + else + { + check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() + + (uint64_t)encoder_output.m_slice_image_tables.size(); + } if (check_size >= 0xFFFF0000ULL) { error_printf("basisu_file::init: File is too large!\n"); @@ -173,10 +219,29 @@ namespace basisu m_header_file_ofs = 0; m_slice_descs_file_ofs = sizeof(basist::basis_file_header); - m_endpoint_cb_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size(); - m_selector_cb_file_ofs = m_endpoint_cb_file_ofs + (uint32_t)encoder_output.m_endpoint_palette.size(); - m_tables_file_ofs = m_selector_cb_file_ofs + (uint32_t)encoder_output.m_selector_palette.size(); - m_first_image_file_ofs = m_tables_file_ofs + (uint32_t)encoder_output.m_slice_image_tables.size(); + if (encoder_output.m_tex_format == basist::basis_tex_format::cETC1S) + { + if (encoder_output.m_uses_global_codebooks) + { + m_endpoint_cb_file_ofs = 0; + m_selector_cb_file_ofs = 0; + m_tables_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size(); + } + else + { + m_endpoint_cb_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size(); + m_selector_cb_file_ofs = m_endpoint_cb_file_ofs + (uint32_t)encoder_output.m_endpoint_palette.size(); + 
m_tables_file_ofs = m_selector_cb_file_ofs + (uint32_t)encoder_output.m_selector_palette.size(); + } + m_first_image_file_ofs = m_tables_file_ofs + (uint32_t)encoder_output.m_slice_image_tables.size(); + } + else + { + m_endpoint_cb_file_ofs = 0; + m_selector_cb_file_ofs = 0; + m_tables_file_ofs = 0; + m_first_image_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size(); + } uint64_t total_file_size = m_first_image_file_ofs; for (uint32_t i = 0; i < encoder_output.m_slice_image_data.size(); i++) diff --git a/thirdparty/basis_universal/basisu_basis_file.h b/thirdparty/basis_universal/encoder/basisu_basis_file.h index df3abbdcfd..98498a0121 100644 --- a/thirdparty/basis_universal/basisu_basis_file.h +++ b/thirdparty/basis_universal/encoder/basisu_basis_file.h @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include "transcoder/basisu_file_headers.h" +#include "../transcoder/basisu_file_headers.h" #include "basisu_backend.h" namespace basisu @@ -49,7 +49,7 @@ namespace basisu private: basist::basis_file_header m_header; - std::vector<basist::basis_slice_desc> m_images_descs; + basisu::vector<basist::basis_slice_desc> m_images_descs; uint8_vec m_comp_data; diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp new file mode 100644 index 0000000000..06aa7eb8b1 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp @@ -0,0 +1,1984 @@ +// File: basisu_bc7enc.cpp +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "basisu_bc7enc.h" + +#ifdef _DEBUG +#define BC7ENC_CHECK_OVERALL_ERROR 1 +#else +#define BC7ENC_CHECK_OVERALL_ERROR 0 +#endif + +using namespace basist; + +namespace basisu +{ + +// Helpers +static inline color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { pRes->m_c[0] = (uint8_t)clampi(r, 0, 255); pRes->m_c[1] = (uint8_t)clampi(g, 0, 255); pRes->m_c[2] = (uint8_t)clampi(b, 0, 255); pRes->m_c[3] = (uint8_t)clampi(a, 0, 255); return pRes; } +static inline color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { assert((uint32_t)(r | g | b | a) <= 255); pRes->m_c[0] = (uint8_t)r; pRes->m_c[1] = (uint8_t)g; pRes->m_c[2] = (uint8_t)b; pRes->m_c[3] = (uint8_t)a; return pRes; } +static inline bc7enc_bool color_quad_u8_notequals(const color_quad_u8 *pLHS, const color_quad_u8 *pRHS) { return (pLHS->m_c[0] != pRHS->m_c[0]) || (pLHS->m_c[1] != pRHS->m_c[1]) || (pLHS->m_c[2] != pRHS->m_c[2]) || (pLHS->m_c[3] != pRHS->m_c[3]); } +static inline bc7enc_vec4F*vec4F_set_scalar(bc7enc_vec4F*pV, float x) { pV->m_c[0] = x; pV->m_c[1] = x; pV->m_c[2] = x; pV->m_c[3] = x; return pV; } +static inline bc7enc_vec4F*vec4F_set(bc7enc_vec4F*pV, float x, float y, float z, float w) { pV->m_c[0] = x; pV->m_c[1] = y; pV->m_c[2] = z; pV->m_c[3] = w; return pV; } +static inline bc7enc_vec4F*vec4F_saturate_in_place(bc7enc_vec4F*pV) { pV->m_c[0] = saturate(pV->m_c[0]); pV->m_c[1] = saturate(pV->m_c[1]); pV->m_c[2] = saturate(pV->m_c[2]); pV->m_c[3] = saturate(pV->m_c[3]); return pV; } +static inline bc7enc_vec4F vec4F_saturate(const bc7enc_vec4F*pV) { bc7enc_vec4F res; res.m_c[0] = saturate(pV->m_c[0]); res.m_c[1] = saturate(pV->m_c[1]); res.m_c[2] = saturate(pV->m_c[2]); res.m_c[3] = saturate(pV->m_c[3]); return res; } +static inline bc7enc_vec4F vec4F_from_color(const color_quad_u8 *pC) { bc7enc_vec4F res; vec4F_set(&res, pC->m_c[0], pC->m_c[1], pC->m_c[2], pC->m_c[3]); return res; } +static inline bc7enc_vec4F vec4F_add(const bc7enc_vec4F*pLHS, const bc7enc_vec4F*pRHS) { bc7enc_vec4F res; vec4F_set(&res, pLHS->m_c[0] + pRHS->m_c[0], pLHS->m_c[1] + pRHS->m_c[1], pLHS->m_c[2] + pRHS->m_c[2], pLHS->m_c[3] + pRHS->m_c[3]); return res; } +static inline bc7enc_vec4F vec4F_sub(const bc7enc_vec4F*pLHS, const bc7enc_vec4F*pRHS) { bc7enc_vec4F res; vec4F_set(&res, pLHS->m_c[0] - pRHS->m_c[0], pLHS->m_c[1] - pRHS->m_c[1], pLHS->m_c[2] - pRHS->m_c[2], pLHS->m_c[3] - pRHS->m_c[3]); return res; } +static inline float vec4F_dot(const bc7enc_vec4F*pLHS, const bc7enc_vec4F*pRHS) { return pLHS->m_c[0] * pRHS->m_c[0] + pLHS->m_c[1] * pRHS->m_c[1] + pLHS->m_c[2] * pRHS->m_c[2] + pLHS->m_c[3] * pRHS->m_c[3]; } +static inline bc7enc_vec4F vec4F_mul(const bc7enc_vec4F*pLHS, float s) { bc7enc_vec4F res; vec4F_set(&res, pLHS->m_c[0] * s, pLHS->m_c[1] * s, pLHS->m_c[2] * s, pLHS->m_c[3] * s); return res; } +static inline bc7enc_vec4F* vec4F_normalize_in_place(bc7enc_vec4F*pV) { float s = pV->m_c[0] * pV->m_c[0] + pV->m_c[1] * pV->m_c[1] + pV->m_c[2] * pV->m_c[2] + pV->m_c[3] * pV->m_c[3]; if (s != 0.0f) { s = 1.0f / sqrtf(s); pV->m_c[0] *= s; pV->m_c[1] *= s; pV->m_c[2] *= s; pV->m_c[3] *= s; } return pV; } + +// Precomputed weight constants used during least fit determination. 
For each entry in g_bc7_weights[]: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w +const float g_bc7_weights1x[2 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +const float g_bc7_weights2x[4 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.107666f, 0.220459f, 0.451416f, 0.328125f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +const float g_bc7_weights3x[8 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f, + 0.079102f, 0.718750f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +const float g_bc7_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, + 0.451416f, 0.328125f, 0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f, + 0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +const float g_astc_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.015625f, 0.109375f, 0.765625f, 0.125000f, 0.035156f, 0.152344f, 0.660156f, 0.187500f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, + 0.451416f, 0.328125f, 0.152588f, 0.238037f, 0.371338f, 0.390625f, 0.205322f, 0.247803f, 0.299072f, 0.453125f, 0.299072f, 0.247803f, 0.205322f, 0.546875f, 0.371338f, 0.238037f, 0.152588f, 0.609375f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f, + 0.660156f, 0.152344f, 0.035156f, 0.812500f, 0.765625f, 0.109375f, 0.015625f, 0.875000f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +const float g_astc_weights5x[32 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000977f, 0.030273f, 0.938477f, 0.031250f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.008789f, 0.084961f, 0.821289f, + 0.093750f, 0.015625f, 0.109375f, 0.765625f, 0.125000f, 0.024414f, 0.131836f, 0.711914f, 0.156250f, 0.035156f, 0.152344f, 0.660156f, 0.187500f, 0.047852f, 0.170898f, 0.610352f, 0.218750f, 0.062500f, 0.187500f, + 0.562500f, 0.250000f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.097656f, 0.214844f, 0.472656f, 0.312500f, 0.118164f, 0.225586f, 0.430664f, 0.343750f, 0.140625f, 0.234375f, 0.390625f, 0.375000f, 0.165039f, + 0.241211f, 0.352539f, 0.406250f, 0.191406f, 0.246094f, 0.316406f, 0.437500f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.316406f, 0.246094f, 0.191406f, 0.562500f, + 0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.390625f, 0.234375f, 0.140625f, 0.625000f, 0.430664f, 0.225586f, 0.118164f, 0.656250f, 0.472656f, 0.214844f, 0.097656f, 0.687500f, 0.516602f, 0.202148f, 0.079102f, + 0.718750f, 0.562500f, 0.187500f, 0.062500f, 0.750000f, 0.610352f, 0.170898f, 0.047852f, 0.781250f, 0.660156f, 0.152344f, 0.035156f, 0.812500f, 0.711914f, 
0.131836f, 0.024414f, 0.843750f, 0.765625f, 0.109375f, + 0.015625f, 0.875000f, 0.821289f, 0.084961f, 0.008789f, 0.906250f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 0.938477f, 0.030273f, 0.000977f, 0.968750f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +const float g_astc_weights_3levelsx[3 * 4] = { + 0.000000f, 0.000000f, 1.000000f, 0.000000f, + .5f * .5f, (1.0f - .5f) * .5f, (1.0f - .5f) * (1.0f - .5f), .5f, + 1.000000f, 0.000000f, 0.000000f, 1.000000f }; + +static endpoint_err g_bc7_mode_1_optimal_endpoints[256][2]; // [c][pbit] +static const uint32_t BC7ENC_MODE_1_OPTIMAL_INDEX = 2; + +static endpoint_err g_astc_4bit_3bit_optimal_endpoints[256]; // [c] +static const uint32_t BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX = 2; + +static endpoint_err g_astc_4bit_2bit_optimal_endpoints[256]; // [c] +static const uint32_t BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX = 1; + +static endpoint_err g_astc_range7_2bit_optimal_endpoints[256]; // [c] +static const uint32_t BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX = 1; + +static endpoint_err g_astc_range13_4bit_optimal_endpoints[256]; // [c] +static const uint32_t BC7ENC_ASTC_RANGE13_4BIT_OPTIMAL_INDEX = 2; + +static endpoint_err g_astc_range13_2bit_optimal_endpoints[256]; // [c] +static const uint32_t BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX = 1; + +static endpoint_err g_astc_range11_5bit_optimal_endpoints[256]; // [c] +static const uint32_t BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX = 13; // not 1, which is optimal, because 26 losslessly maps to BC7 4-bit weights + +astc_quant_bin g_astc_sorted_order_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [sorted unquantized order] + +static uint8_t g_astc_nearest_sorted_index[BC7ENC_TOTAL_ASTC_RANGES][256]; + +static void astc_init() +{ + for (uint32_t range = 0; range < BC7ENC_TOTAL_ASTC_RANGES; range++) + { + if (!astc_is_valid_endpoint_range(range)) + continue; + + const uint32_t levels = astc_get_levels(range); + + uint32_t vals[256]; + // TODO + for (uint32_t i = 0; i < levels; i++) + vals[i] = (unquant_astc_endpoint_val(i, range) << 8) | i; + + std::sort(vals, vals + levels); + + for (uint32_t i = 0; i < levels; i++) + { + uint32_t order = vals[i] & 0xFF; + uint32_t unq = vals[i] >> 8; + + g_astc_sorted_order_unquant[range][i].m_unquant = (uint8_t)unq; + g_astc_sorted_order_unquant[range][i].m_index = (uint8_t)order; + + } // i + +#if 0 + if (g_astc_bise_range_table[range][1] || g_astc_bise_range_table[range][2]) + { + printf("// Range: %u, Levels: %u, Bits: %u, Trits: %u, Quints: %u\n", range, levels, g_astc_bise_range_table[range][0], g_astc_bise_range_table[range][1], g_astc_bise_range_table[range][2]); + + printf("{"); + for (uint32_t i = 0; i < levels; i++) + { + printf("{%u,%u}", g_astc_sorted_order_unquant[range][i].m_index, g_astc_sorted_order_unquant[range][i].m_unquant); + if (i != (levels - 1)) + printf(","); + } + printf("}\n"); + } +#endif + +#if 0 + if (g_astc_bise_range_table[range][1] || g_astc_bise_range_table[range][2]) + { + printf("// Range: %u, Levels: %u, Bits: %u, Trits: %u, Quints: %u\n", range, levels, g_astc_bise_range_table[range][0], g_astc_bise_range_table[range][1], g_astc_bise_range_table[range][2]); + + printf("{"); + for (uint32_t i = 0; i < levels; i++) + { + printf("{%u,%u}", g_astc_unquant[range][i].m_index, g_astc_unquant[range][i].m_unquant); + if (i != (levels - 1)) + printf(","); + } + printf("}\n"); + } +#endif + + for (uint32_t i = 0; i < 256; i++) + { + uint32_t best_index = 0; + int best_err = INT32_MAX; + + for (uint32_t j = 0; j < levels; j++) + { + int err = 
g_astc_sorted_order_unquant[range][j].m_unquant - i; + if (err < 0) + err = -err; + if (err < best_err) + { + best_err = err; + best_index = j; + } + } + + g_astc_nearest_sorted_index[range][i] = (uint8_t)best_index; + } // i + } // range +} + +static inline uint32_t astc_interpolate(uint32_t l, uint32_t h, uint32_t w) +{ + // This is for linear values, not sRGB. + l = (l << 8) | l; + h = (h << 8) | h; + uint32_t k = (l * (64 - w) + h * w + 32) >> 6; + return k >> 8; +} + +// Initialize the lookup table used for optimal single color compression in mode 1. Must be called before encoding. +void bc7enc_compress_block_init() +{ + astc_init(); + + // BC7 666.1 + for (int c = 0; c < 256; c++) + { + for (uint32_t lp = 0; lp < 2; lp++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 64; l++) + { + uint32_t low = ((l << 1) | lp) << 1; + low |= (low >> 7); + for (uint32_t h = 0; h < 64; h++) + { + uint32_t high = ((h << 1) | lp) << 1; + high |= (high >> 7); + const int k = (low * (64 - g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX] + 32) >> 6; + const int err = (k - c) * (k - c); + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + g_bc7_mode_1_optimal_endpoints[c][lp] = best; + } // lp + } // c + + // ASTC [0,15] 3-bit + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 16; l++) + { + uint32_t low = (l << 4) | l; + + for (uint32_t h = 0; h < 16; h++) + { + uint32_t high = (h << 4) | h; + + const int k = astc_interpolate(low, high, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]); + const int err = (k - c) * (k - c); + + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_astc_4bit_3bit_optimal_endpoints[c] = best; + + } // c + + // ASTC [0,15] 2-bit + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 16; l++) + { + uint32_t low = (l << 4) | l; + + for (uint32_t h = 0; h < 16; h++) + { + uint32_t high = (h << 4) | h; + + const int k = astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]); + const int err = (k - c) * (k - c); + + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_astc_4bit_2bit_optimal_endpoints[c] = best; + + } // c + + // ASTC range 7 [0,11] 2-bit + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 12; l++) + { + uint32_t low = g_astc_sorted_order_unquant[7][l].m_unquant; + + for (uint32_t h = 0; h < 12; h++) + { + uint32_t high = g_astc_sorted_order_unquant[7][h].m_unquant; + + const int k = astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]); + const int err = (k - c) * (k - c); + + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_astc_range7_2bit_optimal_endpoints[c] = best; + + } // c + + // ASTC range 13 [0,47] 4-bit + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 48; l++) + { + uint32_t low = g_astc_sorted_order_unquant[13][l].m_unquant; + + for (uint32_t h = 0; h < 48; h++) + { + uint32_t high 
= g_astc_sorted_order_unquant[13][h].m_unquant; + + const int k = astc_interpolate(low, high, g_astc_weights4[BC7ENC_ASTC_RANGE13_4BIT_OPTIMAL_INDEX]); + const int err = (k - c) * (k - c); + + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_astc_range13_4bit_optimal_endpoints[c] = best; + + } // c + + // ASTC range 13 [0,47] 2-bit + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 48; l++) + { + uint32_t low = g_astc_sorted_order_unquant[13][l].m_unquant; + + for (uint32_t h = 0; h < 48; h++) + { + uint32_t high = g_astc_sorted_order_unquant[13][h].m_unquant; + + const int k = astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]); + const int err = (k - c) * (k - c); + + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_astc_range13_2bit_optimal_endpoints[c] = best; + + } // c + + // ASTC range 11 [0,31] 5-bit + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + for (uint32_t l = 0; l < 32; l++) + { + uint32_t low = g_astc_sorted_order_unquant[11][l].m_unquant; + + for (uint32_t h = 0; h < 32; h++) + { + uint32_t high = g_astc_sorted_order_unquant[11][h].m_unquant; + + const int k = astc_interpolate(low, high, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]); + const int err = (k - c) * (k - c); + + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_astc_range11_5bit_optimal_endpoints[c] = best; + + } // c +} + +static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const bc7enc_vec4F* pSelector_weights, bc7enc_vec4F* pXl, bc7enc_vec4F* pXh, const color_quad_u8 *pColors) +{ + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
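+ // Math sketch (added for clarity; not from upstream bc7enc): per channel this solves
+ //   min over (e0, e1) of sum_i (a_i*e0 + b_i*e1 - c_i)^2
+ // where (a_i, b_i) are the two interpolation weights picked by pixel i's selector and
+ // c_i is its color. Setting the derivatives to zero gives the 2x2 normal equations
+ //   [ sum a*a   sum a*b ] [e0]   [ sum a*c ]
+ //   [ sum a*b   sum b*b ] [e1] = [ sum b*c ]
+ // z00/z10/z11 accumulate the weight products (the *x weight tables above store exactly
+ // these products per entry), q00/q10 end up holding the weighted color sums, and the
+ // 2x2 matrix is inverted in closed form. If the low solution drops below 0 or the high
+ // one exceeds 255 and the inputs are constant in that channel, both endpoints are
+ // pinned to that constant value.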
+ double z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + double q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + double q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + double q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; + double q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t sel = pSelectors[i]; + z00 += pSelector_weights[sel].m_c[0]; + z10 += pSelector_weights[sel].m_c[1]; + z11 += pSelector_weights[sel].m_c[2]; + float w = pSelector_weights[sel].m_c[3]; + q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; + q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; + q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; + q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3]; + } + + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + q10_b = t_b - q00_b; + q10_a = t_a - q00_a; + + z01 = z10; + + double det = z00 * z11 - z01 * z10; + if (det != 0.0f) + det = 1.0f / det; + + double iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r); + pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g); + pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b); + pXl->m_c[3] = (float)(iz00 * q00_a + iz01 * q10_a); pXh->m_c[3] = (float)(iz10 * q00_a + iz11 * q10_a); + + for (uint32_t c = 0; c < 4; c++) + { + if ((pXl->m_c[c] < 0.0f) || (pXh->m_c[c] > 255.0f)) + { + uint32_t lo_v = UINT32_MAX, hi_v = 0; + for (uint32_t i = 0; i < N; i++) + { + lo_v = minimumu(lo_v, pColors[i].m_c[c]); + hi_v = maximumu(hi_v, pColors[i].m_c[c]); + } + + if (lo_v == hi_v) + { + pXl->m_c[c] = (float)lo_v; + pXh->m_c[c] = (float)hi_v; + } + } + } +} + +static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const bc7enc_vec4F*pSelector_weights, bc7enc_vec4F*pXl, bc7enc_vec4F*pXh, const color_quad_u8 *pColors) +{ + double z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + double q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + double q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + double q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t sel = pSelectors[i]; + z00 += pSelector_weights[sel].m_c[0]; + z10 += pSelector_weights[sel].m_c[1]; + z11 += pSelector_weights[sel].m_c[2]; + float w = pSelector_weights[sel].m_c[3]; + q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; + q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; + q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; + } + + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + q10_b = t_b - q00_b; + + z01 = z10; + + double det = z00 * z11 - z01 * z10; + if (det != 0.0f) + det = 1.0f / det; + + double iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r); + pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g); + pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b); + pXl->m_c[3] = 255.0f; pXh->m_c[3] = 255.0f; + + for (uint32_t c = 0; c < 3; c++) + { + if ((pXl->m_c[c] < 0.0f) || (pXh->m_c[c] > 255.0f)) + { + uint32_t lo_v = UINT32_MAX, hi_v = 0; + for (uint32_t i = 0; i < N; i++) + { + lo_v = minimumu(lo_v, pColors[i].m_c[c]); + hi_v = 
maximumu(hi_v, pColors[i].m_c[c]); + } + + if (lo_v == hi_v) + { + pXl->m_c[c] = (float)lo_v; + pXh->m_c[c] = (float)hi_v; + } + } + } +} + +static inline color_quad_u8 scale_color(const color_quad_u8* pC, const color_cell_compressor_params* pParams) +{ + color_quad_u8 results; + + if (pParams->m_astc_endpoint_range) + { + for (uint32_t i = 0; i < 4; i++) + { + results.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pC->m_c[i]].m_unquant; + } + } + else + { + const uint32_t n = pParams->m_comp_bits + (pParams->m_has_pbits ? 1 : 0); + assert((n >= 4) && (n <= 8)); + + for (uint32_t i = 0; i < 4; i++) + { + uint32_t v = pC->m_c[i] << (8 - n); + v |= (v >> n); + assert(v <= 255); + results.m_c[i] = (uint8_t)(v); + } + } + + return results; +} + +static inline uint64_t compute_color_distance_rgb(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc_bool perceptual, const uint32_t weights[4]) +{ + int dr, dg, db; + + if (perceptual) + { + const int l1 = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37; + const int cr1 = ((int)pE1->m_c[0] << 9) - l1; + const int cb1 = ((int)pE1->m_c[2] << 9) - l1; + const int l2 = pE2->m_c[0] * 109 + pE2->m_c[1] * 366 + pE2->m_c[2] * 37; + const int cr2 = ((int)pE2->m_c[0] << 9) - l2; + const int cb2 = ((int)pE2->m_c[2] << 9) - l2; + dr = (l1 - l2) >> 8; + dg = (cr1 - cr2) >> 8; + db = (cb1 - cb2) >> 8; + } + else + { + dr = (int)pE1->m_c[0] - (int)pE2->m_c[0]; + dg = (int)pE1->m_c[1] - (int)pE2->m_c[1]; + db = (int)pE1->m_c[2] - (int)pE2->m_c[2]; + } + + return weights[0] * (uint32_t)(dr * dr) + weights[1] * (uint32_t)(dg * dg) + weights[2] * (uint32_t)(db * db); +} + +static inline uint64_t compute_color_distance_rgba(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc_bool perceptual, const uint32_t weights[4]) +{ + int da = (int)pE1->m_c[3] - (int)pE2->m_c[3]; + return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + (weights[3] * (uint32_t)(da * da)); +} + +static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors) +{ + uint32_t best_err = UINT_MAX; + uint32_t best_p = 0; + + for (uint32_t p = 0; p < 2; p++) + { + uint32_t err = g_bc7_mode_1_optimal_endpoints[r][p].m_error + g_bc7_mode_1_optimal_endpoints[g][p].m_error + g_bc7_mode_1_optimal_endpoints[b][p].m_error; + if (err < best_err) + { + best_err = err; + best_p = p; + } + } + + const endpoint_err *pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p]; + const endpoint_err *pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p]; + const endpoint_err *pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p]; + + color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 0); + color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 0); + pResults->m_pbits[0] = best_p; + pResults->m_pbits[1] = 0; + + memset(pSelectors, BC7ENC_MODE_1_OPTIMAL_INDEX, pParams->m_num_pixels); + + color_quad_u8 p; + for (uint32_t i = 0; i < 3; i++) + { + uint32_t low = ((pResults->m_low_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1; + low |= (low >> 7); + + uint32_t high = ((pResults->m_high_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1; + high |= (high >> 7); + + p.m_c[i] = (uint8_t)((low * (64 - g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX] + 32) >> 6); + } + p.m_c[3] = 255; + + uint64_t total_err = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + total_err 
+= compute_color_distance_rgb(&p, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + + pResults->m_best_overall_err = total_err; + + return total_err; +} + +static uint64_t pack_astc_4bit_3bit_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors) +{ + const endpoint_err *pEr = &g_astc_4bit_3bit_optimal_endpoints[r]; + const endpoint_err *pEg = &g_astc_4bit_3bit_optimal_endpoints[g]; + const endpoint_err *pEb = &g_astc_4bit_3bit_optimal_endpoints[b]; + + color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 0); + color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 0); + pResults->m_pbits[0] = 0; + pResults->m_pbits[1] = 0; + + for (uint32_t i = 0; i < 4; i++) + { + pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index; + pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index; + } + + memset(pSelectors, BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX, pParams->m_num_pixels); + + color_quad_u8 p; + for (uint32_t i = 0; i < 3; i++) + { + uint32_t low = (pResults->m_low_endpoint.m_c[i] << 4) | pResults->m_low_endpoint.m_c[i]; + uint32_t high = (pResults->m_high_endpoint.m_c[i] << 4) | pResults->m_high_endpoint.m_c[i]; + + p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]); + } + p.m_c[3] = 255; + + uint64_t total_err = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + total_err += compute_color_distance_rgb(&p, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + + pResults->m_best_overall_err = total_err; + + return total_err; +} + +static uint64_t pack_astc_4bit_2bit_to_one_color_rgba(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint32_t a, uint8_t *pSelectors) +{ + const endpoint_err *pEr = &g_astc_4bit_2bit_optimal_endpoints[r]; + const endpoint_err *pEg = &g_astc_4bit_2bit_optimal_endpoints[g]; + const endpoint_err *pEb = &g_astc_4bit_2bit_optimal_endpoints[b]; + const endpoint_err *pEa = &g_astc_4bit_2bit_optimal_endpoints[a]; + + color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, pEa->m_lo); + color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, pEa->m_hi); + pResults->m_pbits[0] = 0; + pResults->m_pbits[1] = 0; + + for (uint32_t i = 0; i < 4; i++) + { + pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index; + pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index; + } + + memset(pSelectors, BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX, pParams->m_num_pixels); + + color_quad_u8 p; + for (uint32_t i = 0; i < 4; i++) + { + uint32_t low = (pResults->m_low_endpoint.m_c[i] << 4) | pResults->m_low_endpoint.m_c[i]; + uint32_t high = (pResults->m_high_endpoint.m_c[i] << 4) | pResults->m_high_endpoint.m_c[i]; + + p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]); + } + + uint64_t total_err = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + total_err += compute_color_distance_rgba(&p, &pParams->m_pPixels[i], pParams->m_perceptual, 
pParams->m_weights); + + pResults->m_best_overall_err = total_err; + + return total_err; +} + +static uint64_t pack_astc_range7_2bit_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors) +{ + assert(pParams->m_astc_endpoint_range == 7 && pParams->m_num_selector_weights == 4); + + const endpoint_err *pEr = &g_astc_range7_2bit_optimal_endpoints[r]; + const endpoint_err *pEg = &g_astc_range7_2bit_optimal_endpoints[g]; + const endpoint_err *pEb = &g_astc_range7_2bit_optimal_endpoints[b]; + + color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 0); + color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 0); + pResults->m_pbits[0] = 0; + pResults->m_pbits[1] = 0; + + for (uint32_t i = 0; i < 4; i++) + { + pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index; + pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index; + } + + memset(pSelectors, BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX, pParams->m_num_pixels); + + color_quad_u8 p; + for (uint32_t i = 0; i < 3; i++) + { + uint32_t low = g_astc_sorted_order_unquant[7][pResults->m_low_endpoint.m_c[i]].m_unquant; + uint32_t high = g_astc_sorted_order_unquant[7][pResults->m_high_endpoint.m_c[i]].m_unquant; + + p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]); + } + p.m_c[3] = 255; + + uint64_t total_err = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + total_err += compute_color_distance_rgb(&p, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + + pResults->m_best_overall_err = total_err; + + return total_err; +} + +static uint64_t pack_astc_range13_2bit_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors) +{ + assert(pParams->m_astc_endpoint_range == 13 && pParams->m_num_selector_weights == 4 && !pParams->m_has_alpha); + + const endpoint_err *pEr = &g_astc_range13_2bit_optimal_endpoints[r]; + const endpoint_err *pEg = &g_astc_range13_2bit_optimal_endpoints[g]; + const endpoint_err *pEb = &g_astc_range13_2bit_optimal_endpoints[b]; + + color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 47); + color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 47); + pResults->m_pbits[0] = 0; + pResults->m_pbits[1] = 0; + + for (uint32_t i = 0; i < 4; i++) + { + pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index; + pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index; + } + + memset(pSelectors, BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX, pParams->m_num_pixels); + + color_quad_u8 p; + for (uint32_t i = 0; i < 4; i++) + { + uint32_t low = g_astc_sorted_order_unquant[13][pResults->m_low_endpoint.m_c[i]].m_unquant; + uint32_t high = g_astc_sorted_order_unquant[13][pResults->m_high_endpoint.m_c[i]].m_unquant; + + p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]); + } + + uint64_t total_err = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + total_err += compute_color_distance_rgb(&p, 
&pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + + pResults->m_best_overall_err = total_err; + + return total_err; +} + +static uint64_t pack_astc_range11_5bit_to_one_color(const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t* pSelectors) +{ + assert(pParams->m_astc_endpoint_range == 11 && pParams->m_num_selector_weights == 32 && !pParams->m_has_alpha); + + const endpoint_err* pEr = &g_astc_range11_5bit_optimal_endpoints[r]; + const endpoint_err* pEg = &g_astc_range11_5bit_optimal_endpoints[g]; + const endpoint_err* pEb = &g_astc_range11_5bit_optimal_endpoints[b]; + + color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 31); + color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 31); + pResults->m_pbits[0] = 0; + pResults->m_pbits[1] = 0; + + for (uint32_t i = 0; i < 4; i++) + { + pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index; + pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index; + } + + memset(pSelectors, BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX, pParams->m_num_pixels); + + color_quad_u8 p; + for (uint32_t i = 0; i < 4; i++) + { + uint32_t low = g_astc_sorted_order_unquant[11][pResults->m_low_endpoint.m_c[i]].m_unquant; + uint32_t high = g_astc_sorted_order_unquant[11][pResults->m_high_endpoint.m_c[i]].m_unquant; + + p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]); + } + + uint64_t total_err = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + total_err += compute_color_distance_rgb(&p, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + + pResults->m_best_overall_err = total_err; + + return total_err; +} + +static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) +{ + color_quad_u8 quantMinColor = *pLow; + color_quad_u8 quantMaxColor = *pHigh; + + if (pParams->m_has_pbits) + { + uint32_t minPBit, maxPBit; + + if (pParams->m_endpoints_share_pbit) + maxPBit = minPBit = pbits[0]; + else + { + minPBit = pbits[0]; + maxPBit = pbits[1]; + } + + quantMinColor.m_c[0] = (uint8_t)((pLow->m_c[0] << 1) | minPBit); + quantMinColor.m_c[1] = (uint8_t)((pLow->m_c[1] << 1) | minPBit); + quantMinColor.m_c[2] = (uint8_t)((pLow->m_c[2] << 1) | minPBit); + quantMinColor.m_c[3] = (uint8_t)((pLow->m_c[3] << 1) | minPBit); + + quantMaxColor.m_c[0] = (uint8_t)((pHigh->m_c[0] << 1) | maxPBit); + quantMaxColor.m_c[1] = (uint8_t)((pHigh->m_c[1] << 1) | maxPBit); + quantMaxColor.m_c[2] = (uint8_t)((pHigh->m_c[2] << 1) | maxPBit); + quantMaxColor.m_c[3] = (uint8_t)((pHigh->m_c[3] << 1) | maxPBit); + } + + color_quad_u8 actualMinColor = scale_color(&quantMinColor, pParams); + color_quad_u8 actualMaxColor = scale_color(&quantMaxColor, pParams); + + const uint32_t N = pParams->m_num_selector_weights; + assert(N >= 1 && N <= 32); + + color_quad_u8 weightedColors[32]; + weightedColors[0] = actualMinColor; + weightedColors[N - 1] = actualMaxColor; + + const uint32_t nc = pParams->m_has_alpha ? 
4 : 3; + if (pParams->m_astc_endpoint_range) + { + for (uint32_t i = 1; i < (N - 1); i++) + { + for (uint32_t j = 0; j < nc; j++) + weightedColors[i].m_c[j] = (uint8_t)(astc_interpolate(actualMinColor.m_c[j], actualMaxColor.m_c[j], pParams->m_pSelector_weights[i])); + } + } + else + { + for (uint32_t i = 1; i < (N - 1); i++) + for (uint32_t j = 0; j < nc; j++) + weightedColors[i].m_c[j] = (uint8_t)((actualMinColor.m_c[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams->m_pSelector_weights[i] + 32) >> 6); + } + + const int lr = actualMinColor.m_c[0]; + const int lg = actualMinColor.m_c[1]; + const int lb = actualMinColor.m_c[2]; + const int dr = actualMaxColor.m_c[0] - lr; + const int dg = actualMaxColor.m_c[1] - lg; + const int db = actualMaxColor.m_c[2] - lb; + + uint64_t total_err = 0; + + if (pParams->m_pForce_selectors) + { + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + const color_quad_u8* pC = &pParams->m_pPixels[i]; + + const uint8_t sel = pParams->m_pForce_selectors[i]; + assert(sel < N); + + total_err += (pParams->m_has_alpha ? compute_color_distance_rgba : compute_color_distance_rgb)(&weightedColors[sel], pC, pParams->m_perceptual, pParams->m_weights); + + pResults->m_pSelectors_temp[i] = sel; + } + } + else if (!pParams->m_perceptual) + { + if (pParams->m_has_alpha) + { + const int la = actualMinColor.m_c[3]; + const int da = actualMaxColor.m_c[3] - la; + + const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f); + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + const color_quad_u8 *pC = &pParams->m_pPixels[i]; + int r = pC->m_c[0]; + int g = pC->m_c[1]; + int b = pC->m_c[2]; + int a = pC->m_c[3]; + + int best_sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f); + best_sel = clampi(best_sel, 1, N - 1); + + uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC_FALSE, pParams->m_weights); + uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC_FALSE, pParams->m_weights); + + if (err0 == err1) + { + // Prefer non-interpolation + if ((best_sel - 1) == 0) + best_sel = 0; + } + else if (err1 > err0) + { + err1 = err0; + --best_sel; + } + total_err += err1; + + pResults->m_pSelectors_temp[i] = (uint8_t)best_sel; + } + } + else + { + const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f); + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + const color_quad_u8 *pC = &pParams->m_pPixels[i]; + int r = pC->m_c[0]; + int g = pC->m_c[1]; + int b = pC->m_c[2]; + + int sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f); + sel = clampi(sel, 1, N - 1); + + uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC_FALSE, pParams->m_weights); + uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC_FALSE, pParams->m_weights); + + int best_sel = sel; + uint64_t best_err = err1; + if (err0 == err1) + { + // Prefer non-interpolation + if ((best_sel - 1) == 0) + best_sel = 0; + } + else if (err0 < best_err) + { + best_err = err0; + best_sel = sel - 1; + } + + total_err += best_err; + + pResults->m_pSelectors_temp[i] = (uint8_t)best_sel; + } + } + } + else + { + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + uint64_t best_err = UINT64_MAX; + uint32_t best_sel = 0; + + if (pParams->m_has_alpha) + { + for (uint32_t j = 0; j < N; j++) + { + uint64_t err = 
compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC_TRUE, pParams->m_weights); + if (err < best_err) + { + best_err = err; + best_sel = j; + } + // Prefer non-interpolation + else if ((err == best_err) && (j == (N - 1))) + best_sel = j; + } + } + else + { + for (uint32_t j = 0; j < N; j++) + { + uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC_TRUE, pParams->m_weights); + if (err < best_err) + { + best_err = err; + best_sel = j; + } + // Prefer non-interpolation + else if ((err == best_err) && (j == (N - 1))) + best_sel = j; + } + } + + total_err += best_err; + + pResults->m_pSelectors_temp[i] = (uint8_t)best_sel; + } + } + + if (total_err < pResults->m_best_overall_err) + { + pResults->m_best_overall_err = total_err; + + pResults->m_low_endpoint = *pLow; + pResults->m_high_endpoint = *pHigh; + + pResults->m_pbits[0] = pbits[0]; + pResults->m_pbits[1] = pbits[1]; + + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + } + + return total_err; +} + +static bool areDegenerateEndpoints(color_quad_u8* pTrialMinColor, color_quad_u8* pTrialMaxColor, const bc7enc_vec4F* pXl, const bc7enc_vec4F* pXh) +{ + for (uint32_t i = 0; i < 3; i++) + { + if (pTrialMinColor->m_c[i] == pTrialMaxColor->m_c[i]) + { + if (fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.0f) + return true; + } + } + + return false; +} + +static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, color_quad_u8 *pTrialMaxColor, const bc7enc_vec4F*pXl, const bc7enc_vec4F*pXh, uint32_t iscale, int flags) +{ + if (mode == 255) + { + for (uint32_t i = 0; i < 3; i++) + { + if (pTrialMinColor->m_c[i] == pTrialMaxColor->m_c[i]) + { + if (fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.000125f) + { + if (flags & 1) + { + if (pTrialMinColor->m_c[i] > 0) + pTrialMinColor->m_c[i]--; + } + if (flags & 2) + { + if (pTrialMaxColor->m_c[i] < iscale) + pTrialMaxColor->m_c[i]++; + } + } + } + } + } + else if (mode == 1) + { + // fix degenerate case where the input collapses to a single colorspace voxel, and we loose all freedom (test with grayscale ramps) + for (uint32_t i = 0; i < 3; i++) + { + if (pTrialMinColor->m_c[i] == pTrialMaxColor->m_c[i]) + { + if (fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.000125f) + { + if (pTrialMinColor->m_c[i] > (iscale >> 1)) + { + if (pTrialMinColor->m_c[i] > 0) + pTrialMinColor->m_c[i]--; + else + if (pTrialMaxColor->m_c[i] < iscale) + pTrialMaxColor->m_c[i]++; + } + else + { + if (pTrialMaxColor->m_c[i] < iscale) + pTrialMaxColor->m_c[i]++; + else if (pTrialMinColor->m_c[i] > 0) + pTrialMinColor->m_c[i]--; + } + } + } + } + } +} + +static uint64_t find_optimal_solution(uint32_t mode, bc7enc_vec4F xl, bc7enc_vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) +{ + vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh); + + if (pParams->m_astc_endpoint_range) + { + const uint32_t levels = astc_get_levels(pParams->m_astc_endpoint_range); + + const float scale = 255.0f; + + color_quad_u8 trialMinColor8Bit, trialMaxColor8Bit; + color_quad_u8_set_clamped(&trialMinColor8Bit, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f)); + color_quad_u8_set_clamped(&trialMaxColor8Bit, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f)); + + color_quad_u8 trialMinColor, trialMaxColor; + for (uint32_t i = 0; i 
< 4; i++) + { + trialMinColor.m_c[i] = g_astc_nearest_sorted_index[pParams->m_astc_endpoint_range][trialMinColor8Bit.m_c[i]]; + trialMaxColor.m_c[i] = g_astc_nearest_sorted_index[pParams->m_astc_endpoint_range][trialMaxColor8Bit.m_c[i]]; + } + + if (areDegenerateEndpoints(&trialMinColor, &trialMaxColor, &xl, &xh)) + { + color_quad_u8 trialMinColorOrig(trialMinColor), trialMaxColorOrig(trialMaxColor); + + fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 1); + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + + trialMinColor = trialMinColorOrig; + trialMaxColor = trialMaxColorOrig; + fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 0); + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + + trialMinColor = trialMinColorOrig; + trialMaxColor = trialMaxColorOrig; + fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 2); + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + + trialMinColor = trialMinColorOrig; + trialMaxColor = trialMaxColorOrig; + fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 3); + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + } + else + { + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) + { + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + } + } + + for (uint32_t i = 0; i < 4; i++) + { + pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index; + pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index; + } + } + else if (pParams->m_has_pbits) + { + const int iscalep = (1 << (pParams->m_comp_bits + 1)) - 1; + const float scalep = (float)iscalep; + + const int32_t totalComps = pParams->m_has_alpha ? 4 : 3; + + uint32_t best_pbits[2]; + color_quad_u8 bestMinColor, bestMaxColor; + + if (!pParams->m_endpoints_share_pbit) + { + float best_err0 = 1e+9; + float best_err1 = 1e+9; + + for (int p = 0; p < 2; p++) + { + color_quad_u8 xMinColor, xMaxColor; + + // Notes: The pbit controls which quantization intervals are selected. + // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc. 
+ // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value + // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5) + // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5) + for (uint32_t c = 0; c < 4; c++) + { + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + + color_quad_u8 scaledLow = scale_color(&xMinColor, pParams); + color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams); + + float err0 = 0, err1 = 0; + for (int i = 0; i < totalComps; i++) + { + err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f); + err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f); + } + + if (err0 < best_err0) + { + best_err0 = err0; + best_pbits[0] = p; + + bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; + bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1; + bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1; + bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1; + } + + if (err1 < best_err1) + { + best_err1 = err1; + best_pbits[1] = p; + + bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1; + bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1; + bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1; + bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1; + } + } + } + else + { + // Endpoints share pbits + float best_err = 1e+9; + + for (int p = 0; p < 2; p++) + { + color_quad_u8 xMinColor, xMaxColor; + for (uint32_t c = 0; c < 4; c++) + { + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + + color_quad_u8 scaledLow = scale_color(&xMinColor, pParams); + color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams); + + float err = 0; + for (int i = 0; i < totalComps; i++) + err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]); + + if (err < best_err) + { + best_err = err; + best_pbits[0] = p; + best_pbits[1] = p; + for (uint32_t j = 0; j < 4; j++) + { + bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; + bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; + } + } + } + } + + fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1, 0); + + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&bestMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&bestMaxColor, &pResults->m_high_endpoint) || (best_pbits[0] != pResults->m_pbits[0]) || (best_pbits[1] != pResults->m_pbits[1])) + evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults); + } + else + { + const int iscale = (1 << pParams->m_comp_bits) - 1; + const float scale = (float)iscale; + + color_quad_u8 trialMinColor, trialMaxColor; + color_quad_u8_set_clamped(&trialMinColor, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f)); + color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f)); + + fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale, 0); + + if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || 
color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + } + + return pResults->m_best_overall_err; +} + +void check_best_overall_error(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) +{ + const uint32_t n = pParams->m_num_selector_weights; + + assert(n <= 32); + + color_quad_u8 colors[32]; + for (uint32_t c = 0; c < 4; c++) + { + colors[0].m_c[c] = g_astc_unquant[pParams->m_astc_endpoint_range][pResults->m_astc_low_endpoint.m_c[c]].m_unquant; + assert(colors[0].m_c[c] == g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[c]].m_unquant); + + colors[n-1].m_c[c] = g_astc_unquant[pParams->m_astc_endpoint_range][pResults->m_astc_high_endpoint.m_c[c]].m_unquant; + assert(colors[n-1].m_c[c] == g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[c]].m_unquant); + } + + for (uint32_t i = 1; i < pParams->m_num_selector_weights - 1; i++) + for (uint32_t c = 0; c < 4; c++) + colors[i].m_c[c] = (uint8_t)astc_interpolate(colors[0].m_c[c], colors[n - 1].m_c[c], pParams->m_pSelector_weights[i]); + + uint64_t total_err = 0; + for (uint32_t p = 0; p < pParams->m_num_pixels; p++) + { + const color_quad_u8 &orig = pParams->m_pPixels[p]; + const color_quad_u8 &packed = colors[pResults->m_pSelectors[p]]; + + if (pParams->m_has_alpha) + total_err += compute_color_distance_rgba(&orig, &packed, pParams->m_perceptual, pParams->m_weights); + else + total_err += compute_color_distance_rgb(&orig, &packed, pParams->m_perceptual, pParams->m_weights); + } + assert(total_err == pResults->m_best_overall_err); + + // HACK HACK + //if (total_err != pResults->m_best_overall_err) + // printf("X"); +} + +static bool is_solid_rgb(const color_cell_compressor_params *pParams, uint32_t &r, uint32_t &g, uint32_t &b) +{ + r = pParams->m_pPixels[0].m_c[0]; + g = pParams->m_pPixels[0].m_c[1]; + b = pParams->m_pPixels[0].m_c[2]; + + bool allSame = true; + for (uint32_t i = 1; i < pParams->m_num_pixels; i++) + { + if ((r != pParams->m_pPixels[i].m_c[0]) || (g != pParams->m_pPixels[i].m_c[1]) || (b != pParams->m_pPixels[i].m_c[2])) + { + allSame = false; + break; + } + } + + return allSame; +} + +static bool is_solid_rgba(const color_cell_compressor_params *pParams, uint32_t &r, uint32_t &g, uint32_t &b, uint32_t &a) +{ + r = pParams->m_pPixels[0].m_c[0]; + g = pParams->m_pPixels[0].m_c[1]; + b = pParams->m_pPixels[0].m_c[2]; + a = pParams->m_pPixels[0].m_c[3]; + + bool allSame = true; + for (uint32_t i = 1; i < pParams->m_num_pixels; i++) + { + if ((r != pParams->m_pPixels[i].m_c[0]) || (g != pParams->m_pPixels[i].m_c[1]) || (b != pParams->m_pPixels[i].m_c[2]) || (a != pParams->m_pPixels[i].m_c[3])) + { + allSame = false; + break; + } + } + + return allSame; +} + +uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, const bc7enc_compress_block_params *pComp_params) +{ + if (!pParams->m_astc_endpoint_range) + { + assert((mode == 6) || (!pParams->m_has_alpha)); + } + assert(pParams->m_num_selector_weights >= 1 && pParams->m_num_selector_weights <= 32); + assert(pParams->m_pSelector_weights[0] == 0); + assert(pParams->m_pSelector_weights[pParams->m_num_selector_weights - 1] == 64); + + pResults->m_best_overall_err = UINT64_MAX; + + uint32_t cr, cg, cb, ca; + + // If the partition's colors are all the same, then just pack them as a single color. 
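The solid-color dispatch below leans on the per-component optimal-endpoint tables filled in by `bc7enc_compress_block_init()` above. As a rough, self-contained sketch of that table-building idea (names are made up; it assumes 4-bit endpoints replicated to 8 bits and a single fixed 6-bit interpolation weight, ignoring the 16-bit expansion `astc_interpolate` performs):

```
// For each possible 8-bit component value c, exhaustively try all (lo, hi) endpoint pairs
// and keep the pair whose interpolated value lands closest to c.
#include <cstdint>

struct single_color_entry { uint8_t lo, hi; uint16_t err; };

static void build_single_color_table(single_color_entry table[256], uint32_t weight /* 0..64 */)
{
    for (int c = 0; c < 256; c++)
    {
        single_color_entry best = { 0, 0, UINT16_MAX };
        for (uint32_t l = 0; l < 16; l++)
        {
            const uint32_t lo = (l << 4) | l; // replicate the 4-bit endpoint to 8 bits
            for (uint32_t h = 0; h < 16; h++)
            {
                const uint32_t hi = (h << 4) | h;
                const int k = (int)((lo * (64 - weight) + hi * weight + 32) >> 6);
                const uint16_t e = (uint16_t)((k - c) * (k - c));
                if (e < best.err)
                    best = { (uint8_t)l, (uint8_t)h, e };
            }
        }
        table[c] = best;
    }
}
```

At encode time a solid cell then costs one table lookup per component instead of a full endpoint search.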
+ if (!pParams->m_pForce_selectors) + { + if (mode == 1) + { + if (is_solid_rgb(pParams, cr, cg, cb)) + return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors); + } + else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 8) && (!pParams->m_has_alpha)) + { + if (is_solid_rgb(pParams, cr, cg, cb)) + return pack_astc_4bit_3bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors); + } + else if ((pParams->m_astc_endpoint_range == 7) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha)) + { + if (is_solid_rgb(pParams, cr, cg, cb)) + return pack_astc_range7_2bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors); + } + else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 4) && (pParams->m_has_alpha)) + { + if (is_solid_rgba(pParams, cr, cg, cb, ca)) + return pack_astc_4bit_2bit_to_one_color_rgba(pParams, pResults, cr, cg, cb, ca, pResults->m_pSelectors); + } + else if ((pParams->m_astc_endpoint_range == 13) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha)) + { + if (is_solid_rgb(pParams, cr, cg, cb)) + return pack_astc_range13_2bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors); + } + else if ((pParams->m_astc_endpoint_range == 11) && (pParams->m_num_selector_weights == 32) && (!pParams->m_has_alpha)) + { + if (is_solid_rgb(pParams, cr, cg, cb)) + return pack_astc_range11_5bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors); + } + } + + // Compute partition's mean color and principle axis. + bc7enc_vec4F meanColor, axis; + vec4F_set_scalar(&meanColor, 0.0f); + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + bc7enc_vec4F color = vec4F_from_color(&pParams->m_pPixels[i]); + meanColor = vec4F_add(&meanColor, &color); + } + + bc7enc_vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels)); + + meanColor = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels * 255.0f)); + vec4F_saturate_in_place(&meanColor); + + if (pParams->m_has_alpha) + { + // Use incremental PCA for RGBA PCA, because it's simple. + vec4F_set_scalar(&axis, 0.0f); + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + bc7enc_vec4F color = vec4F_from_color(&pParams->m_pPixels[i]); + color = vec4F_sub(&color, &meanColorScaled); + bc7enc_vec4F a = vec4F_mul(&color, color.m_c[0]); + bc7enc_vec4F b = vec4F_mul(&color, color.m_c[1]); + bc7enc_vec4F c = vec4F_mul(&color, color.m_c[2]); + bc7enc_vec4F d = vec4F_mul(&color, color.m_c[3]); + bc7enc_vec4F n = i ? axis : color; + vec4F_normalize_in_place(&n); + axis.m_c[0] += vec4F_dot(&a, &n); + axis.m_c[1] += vec4F_dot(&b, &n); + axis.m_c[2] += vec4F_dot(&c, &n); + axis.m_c[3] += vec4F_dot(&d, &n); + } + vec4F_normalize_in_place(&axis); + } + else + { + // Use covar technique for RGB PCA, because it doesn't require per-pixel normalization. 
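For comparison with the block that follows, here is a distilled standalone version of the covariance plus power-iteration approach (the fixed iteration count and the luma-style fallback axis mirror the code below; `principal_axis_rgb` is an illustrative name, and it assumes n > 0):

```
#include <cmath>
#include <cstdint>

static void principal_axis_rgb(const uint8_t (*pixels)[3], uint32_t n, float axis[3])
{
    float mean[3] = { 0, 0, 0 };
    for (uint32_t i = 0; i < n; i++)
        for (uint32_t c = 0; c < 3; c++)
            mean[c] += pixels[i][c];
    for (uint32_t c = 0; c < 3; c++)
        mean[c] /= (float)n;

    // Symmetric covariance about the mean, stored as rr, rg, rb, gg, gb, bb.
    float cov[6] = { 0, 0, 0, 0, 0, 0 };
    for (uint32_t i = 0; i < n; i++)
    {
        const float r = pixels[i][0] - mean[0], g = pixels[i][1] - mean[1], b = pixels[i][2] - mean[2];
        cov[0] += r * r; cov[1] += r * g; cov[2] += r * b;
        cov[3] += g * g; cov[4] += g * b; cov[5] += b * b;
    }

    // A few power iterations pull out the dominant eigenvector (the principal axis).
    float x = .9f, y = 1.0f, z = .7f;
    for (uint32_t iter = 0; iter < 4; iter++)
    {
        const float nx = x * cov[0] + y * cov[1] + z * cov[2];
        const float ny = x * cov[1] + y * cov[3] + z * cov[4];
        const float nz = x * cov[2] + y * cov[4] + z * cov[5];
        const float m = std::fmax(std::fmax(std::fabs(nx), std::fabs(ny)), std::fabs(nz));
        if (m < 1e-10f)
            break;
        x = nx / m; y = ny / m; z = nz / m;
    }

    const float len = std::sqrt(x * x + y * y + z * z);
    if (len < 1e-10f)
    {
        // Near-zero variance: fall back to a luma-like axis.
        axis[0] = .213f; axis[1] = .715f; axis[2] = .072f;
    }
    else
    {
        axis[0] = x / len; axis[1] = y / len; axis[2] = z / len;
    }
}
```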
+ float cov[6] = { 0, 0, 0, 0, 0, 0 }; + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + const color_quad_u8 *pV = &pParams->m_pPixels[i]; + float r = pV->m_c[0] - meanColorScaled.m_c[0]; + float g = pV->m_c[1] - meanColorScaled.m_c[1]; + float b = pV->m_c[2] - meanColorScaled.m_c[2]; + cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b; + } + + float xr = .9f, xg = 1.0f, xb = .7f; + for (uint32_t iter = 0; iter < 3; iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + + float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); + if (m > 1e-10f) + { + m = 1.0f / m; + r *= m; g *= m; b *= m; + } + + xr = r; xg = g; xb = b; + } + + float len = xr * xr + xg * xg + xb * xb; + if (len < 1e-10f) + vec4F_set_scalar(&axis, 0.0f); + else + { + len = 1.0f / sqrtf(len); + xr *= len; xg *= len; xb *= len; + vec4F_set(&axis, xr, xg, xb, 0); + } + } + + if (vec4F_dot(&axis, &axis) < .5f) + { + if (pParams->m_perceptual) + vec4F_set(&axis, .213f, .715f, .072f, pParams->m_has_alpha ? .715f : 0); + else + vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams->m_has_alpha ? 1.0f : 0); + vec4F_normalize_in_place(&axis); + } + + bc7enc_vec4F minColor, maxColor; + + float l = 1e+9f, h = -1e+9f; + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + bc7enc_vec4F color = vec4F_from_color(&pParams->m_pPixels[i]); + + bc7enc_vec4F q = vec4F_sub(&color, &meanColorScaled); + float d = vec4F_dot(&q, &axis); + + l = minimumf(l, d); + h = maximumf(h, d); + } + + l *= (1.0f / 255.0f); + h *= (1.0f / 255.0f); + + bc7enc_vec4F b0 = vec4F_mul(&axis, l); + bc7enc_vec4F b1 = vec4F_mul(&axis, h); + bc7enc_vec4F c0 = vec4F_add(&meanColor, &b0); + bc7enc_vec4F c1 = vec4F_add(&meanColor, &b1); + minColor = vec4F_saturate(&c0); + maxColor = vec4F_saturate(&c1); + + bc7enc_vec4F whiteVec; + vec4F_set_scalar(&whiteVec, 1.0f); + if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec)) + { +#if 1 + std::swap(minColor.m_c[0], maxColor.m_c[0]); + std::swap(minColor.m_c[1], maxColor.m_c[1]); + std::swap(minColor.m_c[2], maxColor.m_c[2]); + std::swap(minColor.m_c[3], maxColor.m_c[3]); +#elif 0 + // Fails to compile correctly with MSVC 2019 (code generation bug) + std::swap(minColor, maxColor); +#else + // Fails with MSVC 2019 + bc7enc_vec4F temp = minColor; + minColor = maxColor; + maxColor = temp; +#endif + } + + // First find a solution using the block's PCA. + if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults)) + return 0; + + for (uint32_t i = 0; i < pComp_params->m_least_squares_passes; i++) + { + // Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors. 
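A hedged 1-D illustration of the refinement loop entered below: alternate between assigning each pixel its nearest selector and re-solving the endpoints in closed form from those selectors, keeping the best-scoring pair. The 3-bit weight table and the fixed-size selector buffer are simplifying assumptions for the sketch.

```
#include <cmath>
#include <cstddef>
#include <cstdint>

static float refine_endpoints_1d(const float* pixels, size_t n, float& lo, float& hi, int passes)
{
    // BC7's 3-bit selector weights, scaled from 0..64 down to [0,1].
    static const float w8[8] = { 0/64.f, 9/64.f, 18/64.f, 27/64.f, 37/64.f, 46/64.f, 55/64.f, 64/64.f };
    uint8_t sel[64] = { 0 }; // assumes n <= 64 (a 4x4 cell has at most 16 pixels)
    float best_lo = lo, best_hi = hi, best_err = 1e30f;

    for (int pass = 0; ; pass++)
    {
        // (1) For the current endpoints, give each pixel the selector whose interpolated
        //     value is nearest, accumulating the squared error.
        float err = 0;
        for (size_t i = 0; i < n; i++)
        {
            float best_d = 1e30f;
            for (uint8_t s = 0; s < 8; s++)
            {
                const float v = lo + (hi - lo) * w8[s];
                const float d = (pixels[i] - v) * (pixels[i] - v);
                if (d < best_d) { best_d = d; sel[i] = s; }
            }
            err += best_d;
        }
        if (err < best_err) { best_err = err; best_lo = lo; best_hi = hi; }
        if (pass == passes)
            break;

        // (2) For the current selectors, re-solve the endpoints in closed form
        //     (the 2x2 normal equations shown in the earlier least-squares sketch).
        double a = 0, b = 0, c = 0, r0 = 0, r1 = 0;
        for (size_t i = 0; i < n; i++)
        {
            const double w = w8[sel[i]], iw = 1.0 - w;
            a += iw * iw; b += iw * w; c += w * w;
            r0 += iw * pixels[i]; r1 += w * pixels[i];
        }
        const double det = a * c - b * b;
        if (std::fabs(det) < 1e-12)
            break; // all pixels landed on one selector; nothing to refit
        lo = (float)((c * r0 - b * r1) / det);
        hi = (float)((a * r1 - b * r0) / det);
    }

    lo = best_lo; hi = best_hi;
    return best_err;
}
```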
+ bc7enc_vec4F xl, xh; + vec4F_set_scalar(&xl, 0.0f); + vec4F_set_scalar(&xh, 0.0f); + if (pParams->m_has_alpha) + compute_least_squares_endpoints_rgba(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + else + compute_least_squares_endpoints_rgb(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + + xl = vec4F_mul(&xl, (1.0f / 255.0f)); + xh = vec4F_mul(&xh, (1.0f / 255.0f)); + + if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + return 0; + } + + if ((!pParams->m_pForce_selectors) && (pComp_params->m_uber_level > 0)) + { + // In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors, + // then try decrementing the selectrors, then try both. + uint8_t selectors_temp[16], selectors_temp1[16]; + memcpy(selectors_temp, pResults->m_pSelectors, pParams->m_num_pixels); + + const int max_selector = pParams->m_num_selector_weights - 1; + + uint32_t min_sel = 256; + uint32_t max_sel = 0; + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + uint32_t sel = selectors_temp[i]; + min_sel = minimumu(min_sel, sel); + max_sel = maximumu(max_sel, sel); + } + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + uint32_t sel = selectors_temp[i]; + if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1))) + sel++; + selectors_temp1[i] = (uint8_t)sel; + } + + bc7enc_vec4F xl, xh; + vec4F_set_scalar(&xl, 0.0f); + vec4F_set_scalar(&xh, 0.0f); + if (pParams->m_has_alpha) + compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + else + compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + + xl = vec4F_mul(&xl, (1.0f / 255.0f)); + xh = vec4F_mul(&xh, (1.0f / 255.0f)); + + if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + return 0; + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + uint32_t sel = selectors_temp[i]; + if ((sel == max_sel) && (sel > 0)) + sel--; + selectors_temp1[i] = (uint8_t)sel; + } + + if (pParams->m_has_alpha) + compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + else + compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + + xl = vec4F_mul(&xl, (1.0f / 255.0f)); + xh = vec4F_mul(&xh, (1.0f / 255.0f)); + + if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + return 0; + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + uint32_t sel = selectors_temp[i]; + if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1))) + sel++; + else if ((sel == max_sel) && (sel > 0)) + sel--; + selectors_temp1[i] = (uint8_t)sel; + } + + if (pParams->m_has_alpha) + compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + else + compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + + xl = vec4F_mul(&xl, (1.0f / 255.0f)); + xh = vec4F_mul(&xh, (1.0f / 255.0f)); + + if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + return 0; + + // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the 
selectors in one direction or another. + const uint32_t uber_err_thresh = (pParams->m_num_pixels * 56) >> 4; + if ((pComp_params->m_uber_level >= 2) && (pResults->m_best_overall_err > uber_err_thresh)) + { + const int Q = (pComp_params->m_uber_level >= 4) ? (pComp_params->m_uber_level - 2) : 1; + for (int ly = -Q; ly <= 1; ly++) + { + for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++) + { + if ((ly == 0) && (hy == max_selector)) + continue; + + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + selectors_temp1[i] = (uint8_t)clampf(floorf((float)max_selector * ((float)selectors_temp[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_selector); + + //bc7enc_vec4F xl, xh; + vec4F_set_scalar(&xl, 0.0f); + vec4F_set_scalar(&xh, 0.0f); + if (pParams->m_has_alpha) + compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + else + compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); + + xl = vec4F_mul(&xl, (1.0f / 255.0f)); + xh = vec4F_mul(&xh, (1.0f / 255.0f)); + + if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + return 0; + } + } + } + } + + if (!pParams->m_pForce_selectors) + { + // Try encoding the partition as a single color by using the optimal single colors tables to encode the block to its mean. + if (mode == 1) + { + color_cell_compressor_results avg_results = *pResults; + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); + uint64_t avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); + if (avg_err < pResults->m_best_overall_err) + { + *pResults = avg_results; + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + pResults->m_best_overall_err = avg_err; + } + } + else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 8) && (!pParams->m_has_alpha)) + { + color_cell_compressor_results avg_results = *pResults; + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); + uint64_t avg_err = pack_astc_4bit_3bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); + if (avg_err < pResults->m_best_overall_err) + { + *pResults = avg_results; + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + pResults->m_best_overall_err = avg_err; + } + } + else if ((pParams->m_astc_endpoint_range == 7) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha)) + { + color_cell_compressor_results avg_results = *pResults; + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); + uint64_t avg_err = pack_astc_range7_2bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); + if (avg_err < pResults->m_best_overall_err) + { + *pResults = avg_results; + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + pResults->m_best_overall_err = avg_err; + } + } + else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 4) && (pParams->m_has_alpha)) + { + color_cell_compressor_results 
avg_results = *pResults; + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f), a = (int)(.5f + meanColor.m_c[3] * 255.0f); + uint64_t avg_err = pack_astc_4bit_2bit_to_one_color_rgba(pParams, &avg_results, r, g, b, a, pResults->m_pSelectors_temp); + if (avg_err < pResults->m_best_overall_err) + { + *pResults = avg_results; + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + pResults->m_best_overall_err = avg_err; + } + } + else if ((pParams->m_astc_endpoint_range == 13) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha)) + { + color_cell_compressor_results avg_results = *pResults; + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); + uint64_t avg_err = pack_astc_range13_2bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); + if (avg_err < pResults->m_best_overall_err) + { + *pResults = avg_results; + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + pResults->m_best_overall_err = avg_err; + } + } + else if ((pParams->m_astc_endpoint_range == 11) && (pParams->m_num_selector_weights == 32) && (!pParams->m_has_alpha)) + { + color_cell_compressor_results avg_results = *pResults; + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); + uint64_t avg_err = pack_astc_range11_5bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); + if (avg_err < pResults->m_best_overall_err) + { + *pResults = avg_results; + memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); + pResults->m_best_overall_err = avg_err; + } + } + } + +#if BC7ENC_CHECK_OVERALL_ERROR + check_best_overall_error(pParams, pResults); +#endif + + return pResults->m_best_overall_err; +} + +uint64_t color_cell_compression_est_astc( + uint32_t num_weights, uint32_t num_comps, const uint32_t *pWeight_table, + uint32_t num_pixels, const color_quad_u8* pPixels, + uint64_t best_err_so_far, const uint32_t weights[4]) +{ + assert(num_comps == 3 || num_comps == 4); + assert(num_weights >= 1 && num_weights <= 32); + assert(pWeight_table[0] == 0 && pWeight_table[num_weights - 1] == 64); + + // Find RGB bounds as an approximation of the block's principle axis + uint32_t lr = 255, lg = 255, lb = 255, la = 255; + uint32_t hr = 0, hg = 0, hb = 0, ha = 0; + if (num_comps == 4) + { + for (uint32_t i = 0; i < num_pixels; i++) + { + const color_quad_u8* pC = &pPixels[i]; + if (pC->m_c[0] < lr) lr = pC->m_c[0]; + if (pC->m_c[1] < lg) lg = pC->m_c[1]; + if (pC->m_c[2] < lb) lb = pC->m_c[2]; + if (pC->m_c[3] < la) la = pC->m_c[3]; + + if (pC->m_c[0] > hr) hr = pC->m_c[0]; + if (pC->m_c[1] > hg) hg = pC->m_c[1]; + if (pC->m_c[2] > hb) hb = pC->m_c[2]; + if (pC->m_c[3] > ha) ha = pC->m_c[3]; + } + } + else + { + for (uint32_t i = 0; i < num_pixels; i++) + { + const color_quad_u8* pC = &pPixels[i]; + if (pC->m_c[0] < lr) lr = pC->m_c[0]; + if (pC->m_c[1] < lg) lg = pC->m_c[1]; + if (pC->m_c[2] < lb) lb = pC->m_c[2]; + + if (pC->m_c[0] > hr) hr = pC->m_c[0]; + if (pC->m_c[1] > hg) hg = pC->m_c[1]; + if (pC->m_c[2] > hb) hb = pC->m_c[2]; + } + la = 255; + ha = 255; + } + + color_quad_u8 lowColor, highColor; + 
color_quad_u8_set(&lowColor, lr, lg, lb, la); + color_quad_u8_set(&highColor, hr, hg, hb, ha); + + // Place endpoints at bbox diagonals and compute interpolated colors + color_quad_u8 weightedColors[32]; + + weightedColors[0] = lowColor; + weightedColors[num_weights - 1] = highColor; + for (uint32_t i = 1; i < (num_weights - 1); i++) + { + weightedColors[i].m_c[0] = (uint8_t)astc_interpolate(lowColor.m_c[0], highColor.m_c[0], pWeight_table[i]); + weightedColors[i].m_c[1] = (uint8_t)astc_interpolate(lowColor.m_c[1], highColor.m_c[1], pWeight_table[i]); + weightedColors[i].m_c[2] = (uint8_t)astc_interpolate(lowColor.m_c[2], highColor.m_c[2], pWeight_table[i]); + weightedColors[i].m_c[3] = (num_comps == 4) ? (uint8_t)astc_interpolate(lowColor.m_c[3], highColor.m_c[3], pWeight_table[i]) : 255; + } + + // Compute dots and thresholds + const int ar = highColor.m_c[0] - lowColor.m_c[0]; + const int ag = highColor.m_c[1] - lowColor.m_c[1]; + const int ab = highColor.m_c[2] - lowColor.m_c[2]; + const int aa = highColor.m_c[3] - lowColor.m_c[3]; + + int dots[32]; + if (num_comps == 4) + { + for (uint32_t i = 0; i < num_weights; i++) + dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab + weightedColors[i].m_c[3] * aa; + } + else + { + assert(aa == 0); + for (uint32_t i = 0; i < num_weights; i++) + dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab; + } + + int thresh[32 - 1]; + for (uint32_t i = 0; i < (num_weights - 1); i++) + thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1; + + uint64_t total_err = 0; + if ((weights[0] | weights[1] | weights[2] | weights[3]) == 1) + { + if (num_comps == 4) + { + for (uint32_t i = 0; i < num_pixels; i++) + { + const color_quad_u8* pC = &pPixels[i]; + + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2] + aa * pC->m_c[3]; + + // Find approximate selector + uint32_t s = 0; + for (int j = num_weights - 2; j >= 0; j--) + { + if (d >= thresh[j]) + { + s = j + 1; + break; + } + } + + // Compute error + const color_quad_u8* pE1 = &weightedColors[s]; + + int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; + int dg = (int)pE1->m_c[1] - (int)pC->m_c[1]; + int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; + int da = (int)pE1->m_c[3] - (int)pC->m_c[3]; + + total_err += (dr * dr) + (dg * dg) + (db * db) + (da * da); + if (total_err > best_err_so_far) + break; + } + } + else + { + for (uint32_t i = 0; i < num_pixels; i++) + { + const color_quad_u8* pC = &pPixels[i]; + + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2]; + + // Find approximate selector + uint32_t s = 0; + for (int j = num_weights - 2; j >= 0; j--) + { + if (d >= thresh[j]) + { + s = j + 1; + break; + } + } + + // Compute error + const color_quad_u8* pE1 = &weightedColors[s]; + + int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; + int dg = (int)pE1->m_c[1] - (int)pC->m_c[1]; + int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; + + total_err += (dr * dr) + (dg * dg) + (db * db); + if (total_err > best_err_so_far) + break; + } + } + } + else + { + if (num_comps == 4) + { + for (uint32_t i = 0; i < num_pixels; i++) + { + const color_quad_u8* pC = &pPixels[i]; + + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2] + aa * pC->m_c[3]; + + // Find approximate selector + uint32_t s = 0; + for (int j = num_weights - 2; j >= 0; j--) + { + if (d >= thresh[j]) + { + s = j + 1; + break; + } + } + + // Compute error + const color_quad_u8* pE1 = &weightedColors[s]; + + int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; + int dg 
= (int)pE1->m_c[1] - (int)pC->m_c[1]; + int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; + int da = (int)pE1->m_c[3] - (int)pC->m_c[3]; + + total_err += weights[0] * (dr * dr) + weights[1] * (dg * dg) + weights[2] * (db * db) + weights[3] * (da * da); + if (total_err > best_err_so_far) + break; + } + } + else + { + for (uint32_t i = 0; i < num_pixels; i++) + { + const color_quad_u8* pC = &pPixels[i]; + + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2]; + + // Find approximate selector + uint32_t s = 0; + for (int j = num_weights - 2; j >= 0; j--) + { + if (d >= thresh[j]) + { + s = j + 1; + break; + } + } + + // Compute error + const color_quad_u8* pE1 = &weightedColors[s]; + + int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; + int dg = (int)pE1->m_c[1] - (int)pC->m_c[1]; + int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; + + total_err += weights[0] * (dr * dr) + weights[1] * (dg * dg) + weights[2] * (db * db); + if (total_err > best_err_so_far) + break; + } + } + } + + return total_err; +} + +} // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.h b/thirdparty/basis_universal/encoder/basisu_bc7enc.h new file mode 100644 index 0000000000..23469912e2 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.h @@ -0,0 +1,131 @@ +// File: basisu_bc7enc.h +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "basisu_enc.h" +#include "../transcoder/basisu_transcoder_uastc.h" + +namespace basisu +{ + +#define BC7ENC_MAX_PARTITIONS1 (64) +#define BC7ENC_MAX_UBER_LEVEL (4) + + typedef uint8_t bc7enc_bool; + +#define BC7ENC_TRUE (1) +#define BC7ENC_FALSE (0) + + typedef struct { float m_c[4]; } bc7enc_vec4F; + + extern const float g_bc7_weights1x[2 * 4]; + extern const float g_bc7_weights2x[4 * 4]; + extern const float g_bc7_weights3x[8 * 4]; + extern const float g_bc7_weights4x[16 * 4]; + extern const float g_astc_weights4x[16 * 4]; + extern const float g_astc_weights5x[32 * 4]; + extern const float g_astc_weights_3levelsx[3 * 4]; + + extern basist::astc_quant_bin g_astc_sorted_order_unquant[basist::BC7ENC_TOTAL_ASTC_RANGES][256]; // [sorted unquantized order] + + struct color_cell_compressor_params + { + uint32_t m_num_pixels; + const basist::color_quad_u8* m_pPixels; + + uint32_t m_num_selector_weights; + const uint32_t* m_pSelector_weights; + + const bc7enc_vec4F* m_pSelector_weightsx; + uint32_t m_comp_bits; + + const uint8_t *m_pForce_selectors; + + // Non-zero m_astc_endpoint_range enables ASTC mode. m_comp_bits and m_has_pbits are always false. We only support 2, 3, or 4 bit weight encodings. 
+ uint32_t m_astc_endpoint_range; + + uint32_t m_weights[4]; + bc7enc_bool m_has_alpha; + bc7enc_bool m_has_pbits; + bc7enc_bool m_endpoints_share_pbit; + bc7enc_bool m_perceptual; + }; + + struct color_cell_compressor_results + { + uint64_t m_best_overall_err; + basist::color_quad_u8 m_low_endpoint; + basist::color_quad_u8 m_high_endpoint; + uint32_t m_pbits[2]; + uint8_t* m_pSelectors; + uint8_t* m_pSelectors_temp; + + // Encoded ASTC indices, if ASTC mode is enabled + basist::color_quad_u8 m_astc_low_endpoint; + basist::color_quad_u8 m_astc_high_endpoint; + }; + + struct bc7enc_compress_block_params + { + // m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality. + uint32_t m_max_partitions_mode1; + + // Relative RGBA or YCbCrA weights. + uint32_t m_weights[4]; + + // m_uber_level may range from 0 to BC7ENC_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality. + uint32_t m_uber_level; + + // If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB. + bc7enc_bool m_perceptual; + + uint32_t m_least_squares_passes; + }; + + uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, const bc7enc_compress_block_params* pComp_params); + + uint64_t color_cell_compression_est_astc( + uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeight_table, + uint32_t num_pixels, const basist::color_quad_u8* pPixels, + uint64_t best_err_so_far, const uint32_t weights[4]); + + inline void bc7enc_compress_block_params_init_linear_weights(bc7enc_compress_block_params* p) + { + p->m_perceptual = BC7ENC_FALSE; + p->m_weights[0] = 1; + p->m_weights[1] = 1; + p->m_weights[2] = 1; + p->m_weights[3] = 1; + } + + inline void bc7enc_compress_block_params_init_perceptual_weights(bc7enc_compress_block_params* p) + { + p->m_perceptual = BC7ENC_TRUE; + p->m_weights[0] = 128; + p->m_weights[1] = 64; + p->m_weights[2] = 16; + p->m_weights[3] = 32; + } + + inline void bc7enc_compress_block_params_init(bc7enc_compress_block_params* p) + { + p->m_max_partitions_mode1 = BC7ENC_MAX_PARTITIONS1; + p->m_least_squares_passes = 1; + p->m_uber_level = 0; + bc7enc_compress_block_params_init_perceptual_weights(p); + } + + // bc7enc_compress_block_init() MUST be called before calling bc7enc_compress_block() (or you'll get artifacts). + void bc7enc_compress_block_init(); + +} // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp new file mode 100644 index 0000000000..dc4ae11539 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp @@ -0,0 +1,2113 @@ +// basisu_comp.cpp +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
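The `color_cell_compression_est_astc()` helper declared in the header above (and implemented at the start of this file's diff) avoids a full nearest-color search per pixel: it projects each pixel onto the low-to-high endpoint axis with a dot product, compares the projection against midpoint thresholds between consecutive interpolated colors, and accumulates (optionally channel-weighted) squared error with an early-out against `best_err_so_far`. The sketch below shows only that projection-and-threshold selector search, using simplified scalar RGB types that are illustrative rather than the library's own.

```
// Sketch only: scalar RGB version of the "dot product + midpoint thresholds"
// selector search used by color_cell_compression_est_astc().
// Types and helper names here are illustrative, not the library's API.
#include <cstdint>
#include <cstdio>

struct rgb8 { int r, g, b; };

// Nearest-selector search over a palette of colors interpolated along one axis.
static uint32_t find_selector(const rgb8* pal, uint32_t num_weights, const rgb8& axis, const rgb8& pix)
{
    // Project every palette entry onto the endpoint axis.
    int dots[32];
    for (uint32_t i = 0; i < num_weights; i++)
        dots[i] = pal[i].r * axis.r + pal[i].g * axis.g + pal[i].b * axis.b;

    // Thresholds are the (rounded) midpoints between consecutive projections.
    int thresh[31];
    for (uint32_t i = 0; i + 1 < num_weights; i++)
        thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1;

    // Project the pixel onto the same axis.
    const int d = pix.r * axis.r + pix.g * axis.g + pix.b * axis.b;

    // The selector is the highest threshold the projected pixel reaches.
    uint32_t s = 0;
    for (int j = (int)num_weights - 2; j >= 0; j--)
    {
        if (d >= thresh[j]) { s = (uint32_t)j + 1; break; }
    }
    return s;
}

int main()
{
    // Four colors interpolated between a dark and a bright endpoint.
    const rgb8 pal[4] = { {10,10,10}, {90,90,90}, {170,170,170}, {250,250,250} };
    const rgb8 axis = { pal[3].r - pal[0].r, pal[3].g - pal[0].g, pal[3].b - pal[0].b };
    const rgb8 pix = { 100, 95, 110 };
    printf("selector = %u\n", find_selector(pal, 4, axis, pix)); // expect 1
    return 0;
}
```

Because the interpolated colors are monotonic along the endpoint axis, scanning the thresholds from the top down and taking the first one reached approximates the nearest-neighbor choice at a fraction of the cost; the source itself labels the result an approximate selector.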
+#include "basisu_comp.h" +#include "basisu_enc.h" +#include <unordered_set> +#include <atomic> + +// basisu_transcoder.cpp is where basisu_miniz lives now, we just need the declarations here. +#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES +#include "basisu_miniz.h" + +#if !BASISD_SUPPORT_KTX2 +#error BASISD_SUPPORT_KTX2 must be enabled (set to 1). +#endif + +#if BASISD_SUPPORT_KTX2_ZSTD +#include "../zstd/zstd.h" +#endif + +// Set to 1 to disable the mipPadding alignment workaround (which only seems to be needed when no key-values are written at all) +#define BASISU_DISABLE_KTX2_ALIGNMENT_WORKAROUND (0) + +// Set to 1 to disable writing all KTX2 key values, triggering the validator bug. +#define BASISU_DISABLE_KTX2_KEY_VALUES (0) + +using namespace buminiz; + +#define BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN 0 +#define DEBUG_CROP_TEXTURE_TO_64x64 (0) +#define DEBUG_RESIZE_TEXTURE (0) +#define DEBUG_EXTRACT_SINGLE_BLOCK (0) + +namespace basisu +{ + basis_compressor::basis_compressor() : + m_basis_file_size(0), + m_basis_bits_per_texel(0.0f), + m_total_blocks(0), + m_auto_global_sel_pal(false), + m_any_source_image_has_alpha(false) + { + debug_printf("basis_compressor::basis_compressor\n"); + } + + bool basis_compressor::init(const basis_compressor_params ¶ms) + { + debug_printf("basis_compressor::init\n"); + + m_params = params; + + if (m_params.m_debug) + { + debug_printf("basis_compressor::init:\n"); + +#define PRINT_BOOL_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<int>(m_params.v), m_params.v.was_changed()); +#define PRINT_INT_VALUE(v) debug_printf("%s: %i %u\n", BASISU_STRINGIZE2(v), static_cast<int>(m_params.v), m_params.v.was_changed()); +#define PRINT_UINT_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<uint32_t>(m_params.v), m_params.v.was_changed()); +#define PRINT_FLOAT_VALUE(v) debug_printf("%s: %f %u\n", BASISU_STRINGIZE2(v), static_cast<float>(m_params.v), m_params.v.was_changed()); + + debug_printf("Has global selector codebook: %i\n", m_params.m_pSel_codebook != nullptr); + + debug_printf("Source images: %u, source filenames: %u, source alpha filenames: %i, Source mipmap images: %u\n", + m_params.m_source_images.size(), m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(), m_params.m_source_mipmap_images.size()); + + if (m_params.m_source_mipmap_images.size()) + { + debug_printf("m_source_mipmap_images array sizes:\n"); + for (uint32_t i = 0; i < m_params.m_source_mipmap_images.size(); i++) + debug_printf("%u ", m_params.m_source_mipmap_images[i].size()); + debug_printf("\n"); + } + + PRINT_BOOL_VALUE(m_uastc); + PRINT_BOOL_VALUE(m_y_flip); + PRINT_BOOL_VALUE(m_debug); + PRINT_BOOL_VALUE(m_validate); + PRINT_BOOL_VALUE(m_debug_images); + PRINT_BOOL_VALUE(m_global_sel_pal); + PRINT_BOOL_VALUE(m_auto_global_sel_pal); + PRINT_INT_VALUE(m_compression_level); + PRINT_BOOL_VALUE(m_no_hybrid_sel_cb); + PRINT_BOOL_VALUE(m_perceptual); + PRINT_BOOL_VALUE(m_no_endpoint_rdo); + PRINT_BOOL_VALUE(m_no_selector_rdo); + PRINT_BOOL_VALUE(m_read_source_images); + PRINT_BOOL_VALUE(m_write_output_basis_files); + PRINT_BOOL_VALUE(m_compute_stats); + PRINT_BOOL_VALUE(m_check_for_alpha); + PRINT_BOOL_VALUE(m_force_alpha); + debug_printf("swizzle: %d,%d,%d,%d\n", + m_params.m_swizzle[0], + m_params.m_swizzle[1], + m_params.m_swizzle[2], + m_params.m_swizzle[3]); + PRINT_BOOL_VALUE(m_renormalize); + PRINT_BOOL_VALUE(m_multithreading); + PRINT_BOOL_VALUE(m_disable_hierarchical_endpoint_codebooks); + + 
PRINT_FLOAT_VALUE(m_hybrid_sel_cb_quality_thresh); + + PRINT_INT_VALUE(m_global_pal_bits); + PRINT_INT_VALUE(m_global_mod_bits); + + PRINT_FLOAT_VALUE(m_endpoint_rdo_thresh); + PRINT_FLOAT_VALUE(m_selector_rdo_thresh); + + PRINT_BOOL_VALUE(m_mip_gen); + PRINT_BOOL_VALUE(m_mip_renormalize); + PRINT_BOOL_VALUE(m_mip_wrapping); + PRINT_BOOL_VALUE(m_mip_fast); + PRINT_BOOL_VALUE(m_mip_srgb); + PRINT_FLOAT_VALUE(m_mip_premultiplied); + PRINT_FLOAT_VALUE(m_mip_scale); + PRINT_INT_VALUE(m_mip_smallest_dimension); + debug_printf("m_mip_filter: %s\n", m_params.m_mip_filter.c_str()); + + debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_max_endpoint_clusters); + debug_printf("m_max_selector_clusters: %u\n", m_params.m_max_selector_clusters); + debug_printf("m_quality_level: %i\n", m_params.m_quality_level); + + debug_printf("m_tex_type: %u\n", m_params.m_tex_type); + debug_printf("m_userdata0: 0x%X, m_userdata1: 0x%X\n", m_params.m_userdata0, m_params.m_userdata1); + debug_printf("m_us_per_frame: %i (%f fps)\n", m_params.m_us_per_frame, m_params.m_us_per_frame ? 1.0f / (m_params.m_us_per_frame / 1000000.0f) : 0); + debug_printf("m_pack_uastc_flags: 0x%X\n", m_params.m_pack_uastc_flags); + + PRINT_BOOL_VALUE(m_rdo_uastc); + PRINT_FLOAT_VALUE(m_rdo_uastc_quality_scalar); + PRINT_INT_VALUE(m_rdo_uastc_dict_size); + PRINT_FLOAT_VALUE(m_rdo_uastc_max_allowed_rms_increase_ratio); + PRINT_FLOAT_VALUE(m_rdo_uastc_skip_block_rms_thresh); + PRINT_FLOAT_VALUE(m_rdo_uastc_max_smooth_block_error_scale); + PRINT_FLOAT_VALUE(m_rdo_uastc_smooth_block_max_std_dev); + PRINT_BOOL_VALUE(m_rdo_uastc_favor_simpler_modes_in_rdo_mode) + PRINT_BOOL_VALUE(m_rdo_uastc_multithreading); + + PRINT_INT_VALUE(m_resample_width); + PRINT_INT_VALUE(m_resample_height); + PRINT_FLOAT_VALUE(m_resample_factor); + debug_printf("Has global codebooks: %u\n", m_params.m_pGlobal_codebooks ? 
1 : 0); + if (m_params.m_pGlobal_codebooks) + { + debug_printf("Global codebook endpoints: %u selectors: %u\n", m_params.m_pGlobal_codebooks->get_endpoints().size(), m_params.m_pGlobal_codebooks->get_selectors().size()); + } + + PRINT_BOOL_VALUE(m_create_ktx2_file); + + debug_printf("KTX2 UASTC supercompression: %u\n", m_params.m_ktx2_uastc_supercompression); + debug_printf("KTX2 Zstd supercompression level: %i\n", (int)m_params.m_ktx2_zstd_supercompression_level); + debug_printf("KTX2 sRGB transfer func: %u\n", (int)m_params.m_ktx2_srgb_transfer_func); + debug_printf("Total KTX2 key values: %u\n", m_params.m_ktx2_key_values.size()); + for (uint32_t i = 0; i < m_params.m_ktx2_key_values.size(); i++) + { + debug_printf("Key: \"%s\"\n", m_params.m_ktx2_key_values[i].m_key.data()); + debug_printf("Value size: %u\n", m_params.m_ktx2_key_values[i].m_value.size()); + } + +#undef PRINT_BOOL_VALUE +#undef PRINT_INT_VALUE +#undef PRINT_UINT_VALUE +#undef PRINT_FLOAT_VALUE + } + + if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size())) + { + assert(0); + return false; + } + + return true; + } + + basis_compressor::error_code basis_compressor::process() + { + debug_printf("basis_compressor::process\n"); + + if (!read_source_images()) + return cECFailedReadingSourceImages; + + if (!validate_texture_type_constraints()) + return cECFailedValidating; + + if (m_params.m_create_ktx2_file) + { + if (!validate_ktx2_constraints()) + return cECFailedValidating; + } + + if (!extract_source_blocks()) + return cECFailedFrontEnd; + + if (m_params.m_uastc) + { + error_code ec = encode_slices_to_uastc(); + if (ec != cECSuccess) + return ec; + } + else + { + if (!process_frontend()) + return cECFailedFrontEnd; + + if (!extract_frontend_texture_data()) + return cECFailedFontendExtract; + + if (!process_backend()) + return cECFailedBackend; + } + + if (!create_basis_file_and_transcode()) + return cECFailedCreateBasisFile; + + if (m_params.m_create_ktx2_file) + { + if (!create_ktx2_file()) + return cECFailedCreateKTX2File; + } + + if (!write_output_files_and_compute_stats()) + return cECFailedWritingOutput; + + return cECSuccess; + } + + basis_compressor::error_code basis_compressor::encode_slices_to_uastc() + { + debug_printf("basis_compressor::encode_slices_to_uastc\n"); + + m_uastc_slice_textures.resize(m_slice_descs.size()); + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + m_uastc_slice_textures[slice_index].init(texture_format::cUASTC4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height); + + m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC4x4; + m_uastc_backend_output.m_etc1s = false; + m_uastc_backend_output.m_slice_desc = m_slice_descs; + m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size()); + m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size()); + + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + gpu_image& tex = m_uastc_slice_textures[slice_index]; + basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + (void)slice_desc; + + const uint32_t num_blocks_x = tex.get_blocks_x(); + const uint32_t num_blocks_y = tex.get_blocks_y(); + const uint32_t total_blocks = tex.get_total_blocks(); + const image& source_image = m_slice_images[slice_index]; + + std::atomic<uint32_t> total_blocks_processed; + total_blocks_processed = 0; + + const uint32_t N = 256; + for (uint32_t block_index_iter = 0; block_index_iter < 
total_blocks; block_index_iter += N) + { + const uint32_t first_index = block_index_iter; + const uint32_t last_index = minimum<uint32_t>(total_blocks, block_index_iter + N); + + // FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten. +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed] + { +#endif + BASISU_NOTE_UNUSED(num_blocks_y); + + uint32_t uastc_flags = m_params.m_pack_uastc_flags; + if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode)) + uastc_flags |= cPackUASTCFavorSimplerModes; + + for (uint32_t block_index = first_index; block_index < last_index; block_index++) + { + const uint32_t block_x = block_index % num_blocks_x; + const uint32_t block_y = block_index / num_blocks_x; + + color_rgba block_pixels[4][4]; + + source_image.extract_block_clamped((color_rgba*)block_pixels, block_x * 4, block_y * 4, 4, 4); + + basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y); + + encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags); + + total_blocks_processed++; + + uint32_t val = total_blocks_processed; + if ((val & 16383) == 16383) + { + debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast<float>(val) * 100.0f / total_blocks); + } + + } + +#ifndef __EMSCRIPTEN__ + }); +#endif + + } // block_index_iter + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->wait_for_all(); +#endif + + if (m_params.m_rdo_uastc) + { + uastc_rdo_params rdo_params; + rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar; + rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio; + rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh; + rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size; + rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale; + rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev; + + bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(), + (const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr, + (m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? 
basisu::minimum<uint32_t>(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0); + if (!status) + { + return cECFailedUASTCRDOPostProcess; + } + } + + m_uastc_backend_output.m_slice_image_data[slice_index].resize(tex.get_size_in_bytes()); + memcpy(&m_uastc_backend_output.m_slice_image_data[slice_index][0], tex.get_ptr(), tex.get_size_in_bytes()); + + m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(tex.get_ptr(), tex.get_size_in_bytes(), 0); + + } // slice_index + + return cECSuccess; + } + + bool basis_compressor::generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha) + { + debug_printf("basis_compressor::generate_mipmaps\n"); + + interval_timer tm; + tm.start(); + + uint32_t total_levels = 1; + uint32_t w = img.get_width(), h = img.get_height(); + while (maximum<uint32_t>(w, h) > (uint32_t)m_params.m_mip_smallest_dimension) + { + w = maximum(w >> 1U, 1U); + h = maximum(h >> 1U, 1U); + total_levels++; + } + +#if BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN + // Requires stb_image_resize + stbir_filter filter = STBIR_FILTER_DEFAULT; + if (m_params.m_mip_filter == "box") + filter = STBIR_FILTER_BOX; + else if (m_params.m_mip_filter == "triangle") + filter = STBIR_FILTER_TRIANGLE; + else if (m_params.m_mip_filter == "cubic") + filter = STBIR_FILTER_CUBICBSPLINE; + else if (m_params.m_mip_filter == "catmull") + filter = STBIR_FILTER_CATMULLROM; + else if (m_params.m_mip_filter == "mitchell") + filter = STBIR_FILTER_MITCHELL; + + for (uint32_t level = 1; level < total_levels; level++) + { + const uint32_t level_width = maximum<uint32_t>(1, img.get_width() >> level); + const uint32_t level_height = maximum<uint32_t>(1, img.get_height() >> level); + + image &level_img = *enlarge_vector(mips, 1); + level_img.resize(level_width, level_height); + + int result = stbir_resize_uint8_generic( + (const uint8_t *)img.get_ptr(), img.get_width(), img.get_height(), img.get_pitch() * sizeof(color_rgba), + (uint8_t *)level_img.get_ptr(), level_img.get_width(), level_img.get_height(), level_img.get_pitch() * sizeof(color_rgba), + has_alpha ? 4 : 3, has_alpha ? 3 : STBIR_ALPHA_CHANNEL_NONE, m_params.m_mip_premultiplied ? STBIR_FLAG_ALPHA_PREMULTIPLIED : 0, + m_params.m_mip_wrapping ? STBIR_EDGE_WRAP : STBIR_EDGE_CLAMP, filter, m_params.m_mip_srgb ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, + nullptr); + + if (result == 0) + { + error_printf("basis_compressor::generate_mipmaps: stbir_resize_uint8_generic() failed!\n"); + return false; + } + + if (m_params.m_mip_renormalize) + level_img.renormalize_normal_map(); + } +#else + for (uint32_t level = 1; level < total_levels; level++) + { + const uint32_t level_width = maximum<uint32_t>(1, img.get_width() >> level); + const uint32_t level_height = maximum<uint32_t>(1, img.get_height() >> level); + + image& level_img = *enlarge_vector(mips, 1); + level_img.resize(level_width, level_height); + + const image* pSource_image = &img; + + if (m_params.m_mip_fast) + { + if (level > 1) + pSource_image = &mips[level - 1]; + } + + bool status = image_resample(*pSource_image, level_img, m_params.m_mip_srgb, m_params.m_mip_filter.c_str(), m_params.m_mip_scale, m_params.m_mip_wrapping, 0, has_alpha ? 
4 : 3); + if (!status) + { + error_printf("basis_compressor::generate_mipmaps: image_resample() failed!\n"); + return false; + } + + if (m_params.m_mip_renormalize) + level_img.renormalize_normal_map(); + } +#endif + + if (m_params.m_debug) + debug_printf("Total mipmap generation time: %f secs\n", tm.get_elapsed_secs()); + + return true; + } + + bool basis_compressor::read_source_images() + { + debug_printf("basis_compressor::read_source_images\n"); + + const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : (uint32_t)m_params.m_source_images.size(); + if (!total_source_files) + return false; + + m_stats.resize(0); + m_slice_descs.resize(0); + m_slice_images.resize(0); + + m_total_blocks = 0; + uint32_t total_macroblocks = 0; + + m_any_source_image_has_alpha = false; + + basisu::vector<image> source_images; + basisu::vector<std::string> source_filenames; + + // First load all source images, and determine if any have an alpha channel. + for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) + { + const char *pSource_filename = ""; + + image file_image; + + if (m_params.m_read_source_images) + { + pSource_filename = m_params.m_source_filenames[source_file_index].c_str(); + + // Load the source image + if (!load_image(pSource_filename, file_image)) + { + error_printf("Failed reading source image: %s\n", pSource_filename); + return false; + } + + printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height()); + + // Optionally load another image and put a grayscale version of it into the alpha channel. + if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size())) + { + const char *pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str(); + + image alpha_data; + + if (!load_image(pSource_alpha_image, alpha_data)) + { + error_printf("Failed reading source image: %s\n", pSource_alpha_image); + return false; + } + + printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height()); + + alpha_data.crop(file_image.get_width(), file_image.get_height()); + + for (uint32_t y = 0; y < file_image.get_height(); y++) + for (uint32_t x = 0; x < file_image.get_width(); x++) + file_image(x, y).a = (uint8_t)alpha_data(x, y).get_709_luma(); + } + } + else + { + file_image = m_params.m_source_images[source_file_index]; + } + + if (m_params.m_renormalize) + file_image.renormalize_normal_map(); + + bool alpha_swizzled = false; + if (m_params.m_swizzle[0] != 0 || + m_params.m_swizzle[1] != 1 || + m_params.m_swizzle[2] != 2 || + m_params.m_swizzle[3] != 3) + { + // Used for XY normal maps in RG - puts X in color, Y in alpha + for (uint32_t y = 0; y < file_image.get_height(); y++) + for (uint32_t x = 0; x < file_image.get_width(); x++) + { + const color_rgba &c = file_image(x, y); + file_image(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]); + } + alpha_swizzled = m_params.m_swizzle[3] != 3; + } + + bool has_alpha = false; + if (m_params.m_force_alpha || alpha_swizzled) + has_alpha = true; + else if (!m_params.m_check_for_alpha) + file_image.set_alpha(255); + else if (file_image.has_alpha()) + has_alpha = true; + + if (has_alpha) + m_any_source_image_has_alpha = true; + + debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", 
source_file_index, pSource_filename, file_image.get_width(), file_image.get_height(), has_alpha); + + if (m_params.m_y_flip) + file_image.flip_y(); + +#if DEBUG_EXTRACT_SINGLE_BLOCK + image block_image(4, 4); + const uint32_t block_x = 0; + const uint32_t block_y = 0; + block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0); + file_image = block_image; +#endif + +#if DEBUG_CROP_TEXTURE_TO_64x64 + file_image.resize(64, 64); +#endif + + if (m_params.m_resample_width > 0 && m_params.m_resample_height > 0) + { + int new_width = basisu::minimum<int>(m_params.m_resample_width, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + int new_height = basisu::minimum<int>(m_params.m_resample_height, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + + debug_printf("Resampling to %ix%i\n", new_width, new_height); + + // TODO: A box filter - kaiser looks too sharp on video. Let the caller control this. + image temp_img(new_width, new_height); + image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser"); + temp_img.swap(file_image); + } + else if (m_params.m_resample_factor > 0.0f) + { + int new_width = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + int new_height = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + + debug_printf("Resampling to %ix%i\n", new_width, new_height); + + // TODO: A box filter - kaiser looks too sharp on video. Let the caller control this. + image temp_img(new_width, new_height); + image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser"); + temp_img.swap(file_image); + } + + if ((!file_image.get_width()) || (!file_image.get_height())) + { + error_printf("basis_compressor::read_source_images: Source image has a zero width and/or height!\n"); + return false; + } + + if ((file_image.get_width() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (file_image.get_height() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION)) + { + error_printf("basis_compressor::read_source_images: Source image is too large!\n"); + return false; + } + + source_images.push_back(file_image); + source_filenames.push_back(pSource_filename); + } + + // Check if the caller has generated their own mipmaps. + if (m_params.m_source_mipmap_images.size()) + { + // Make sure they've passed us enough mipmap chains. + if ((m_params.m_source_images.size() != m_params.m_source_mipmap_images.size()) || (total_source_files != m_params.m_source_images.size())) + { + error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images.size() must equal m_params.m_source_images.size()!\n"); + return false; + } + + // Check if any of the user-supplied mipmap levels has alpha. + // We're assuming the user has already preswizzled their mipmap source images. 
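The channel swizzle applied earlier in `read_source_images()` (used, per its comment, for XY normal maps packed into RG) is a plain per-pixel remap, and any swizzle that writes something other than the original A into alpha forces the alpha flag on. A small sketch of both pieces on a bare RGBA8 buffer follows; the pixel type, the helper names, and the `{0,0,0,1}` ("RRRG") mapping are illustrative assumptions, not the compressor's API.

```
// Sketch: per-pixel channel swizzle as applied in read_source_images().
// For an XY normal map in RG, an "RRRG" swizzle (assumed here for illustration)
// puts X into the color channels and Y into alpha, so the two components end
// up in separate ETC1S/UASTC planes.
#include <cstdint>
#include <vector>

struct rgba8 { uint8_t c[4]; }; // c[0]=R, c[1]=G, c[2]=B, c[3]=A

static void swizzle_image(std::vector<rgba8>& pixels, const uint8_t swizzle[4])
{
    for (rgba8& p : pixels)
    {
        const rgba8 src = p;
        for (int i = 0; i < 4; i++)
            p.c[i] = src.c[swizzle[i]];
    }
}

static bool swizzle_touches_alpha(const uint8_t swizzle[4])
{
    // Mirrors the "alpha_swizzled" test: alpha is treated as meaningful
    // whenever the swizzle writes anything but the original A into it.
    return swizzle[3] != 3;
}

int main()
{
    std::vector<rgba8> img(4, rgba8{ {200, 17, 0, 255} });
    const uint8_t rrrg[4] = { 0, 0, 0, 1 };
    swizzle_image(img, rrrg);
    // img[0] is now {200,200,200,17}; swizzle_touches_alpha(rrrg) == true.
    return swizzle_touches_alpha(rrrg) ? 0 : 1;
}
```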
+ if (!m_any_source_image_has_alpha) + { + for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) + { + for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++) + { + const image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index]; + + if (mip_img.has_alpha()) + { + m_any_source_image_has_alpha = true; + break; + } + } + + if (m_any_source_image_has_alpha) + break; + } + } + } + + debug_printf("Any source image has alpha: %u\n", m_any_source_image_has_alpha); + + for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) + { + image &file_image = source_images[source_file_index]; + const std::string &source_filename = source_filenames[source_file_index]; + + // Now, for each source image, create the slices corresponding to that image. + basisu::vector<image> slices; + + slices.reserve(32); + + // The first (largest) mipmap level. + slices.push_back(file_image); + + if (m_params.m_source_mipmap_images.size()) + { + // User-provided mipmaps for each layer or image in the texture array. + for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++) + { + image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index]; + + if (m_params.m_swizzle[0] != 0 || + m_params.m_swizzle[1] != 1 || + m_params.m_swizzle[2] != 2 || + m_params.m_swizzle[3] != 3) + { + // Used for XY normal maps in RG - puts X in color, Y in alpha + for (uint32_t y = 0; y < mip_img.get_height(); y++) + for (uint32_t x = 0; x < mip_img.get_width(); x++) + { + const color_rgba &c = mip_img(x, y); + mip_img(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]); + } + } + + slices.push_back(mip_img); + } + } + else if (m_params.m_mip_gen) + { + // Automatically generate mipmaps. + if (!generate_mipmaps(file_image, slices, m_any_source_image_has_alpha)) + return false; + } + + uint_vec mip_indices(slices.size()); + for (uint32_t i = 0; i < slices.size(); i++) + mip_indices[i] = i; + + if ((m_any_source_image_has_alpha) && (!m_params.m_uastc)) + { + // For ETC1S, if source has alpha, then even mips will have RGB, and odd mips will have alpha in RGB. 
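As the comment just above says, ETC1S has no alpha plane, so for alpha content every mip level is emitted twice: an even slice carrying RGB with A forced to 255, followed by an odd slice carrying the alpha value replicated into RGB. A minimal sketch of that interleaving, using an illustrative pixel buffer rather than the compressor's `image` class:

```
// Sketch: ETC1S alpha handling - each source level becomes an RGB slice
// followed by an "alpha replicated into RGB" slice, so slice index parity
// tells the transcoder which plane a slice carries. Types are illustrative.
#include <cstdint>
#include <utility>
#include <vector>

struct rgba8 { uint8_t r, g, b, a; };
using image_buf = std::vector<rgba8>;

static void split_rgb_and_alpha(const image_buf& src, image_buf& rgb_out, image_buf& alpha_out)
{
    rgb_out = src;
    alpha_out = src;
    for (size_t i = 0; i < src.size(); i++)
    {
        rgb_out[i].a = 255;                   // color slice: forced opaque
        const uint8_t a = src[i].a;
        alpha_out[i] = rgba8{ a, a, a, 255 }; // alpha slice: A copied into RGB
    }
}

// Interleave: even output slices are RGB, odd output slices are alpha.
static std::vector<image_buf> make_etc1s_slices(const std::vector<image_buf>& mips)
{
    std::vector<image_buf> slices;
    slices.reserve(mips.size() * 2);
    for (const image_buf& level : mips)
    {
        image_buf rgb, alpha;
        split_rgb_and_alpha(level, rgb, alpha);
        slices.push_back(std::move(rgb));
        slices.push_back(std::move(alpha));
    }
    return slices;
}
```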
+ basisu::vector<image> alpha_slices; + uint_vec new_mip_indices; + + alpha_slices.reserve(slices.size() * 2); + + for (uint32_t i = 0; i < slices.size(); i++) + { + image lvl_rgb(slices[i]); + image lvl_a(lvl_rgb); + + for (uint32_t y = 0; y < lvl_a.get_height(); y++) + { + for (uint32_t x = 0; x < lvl_a.get_width(); x++) + { + uint8_t a = lvl_a(x, y).a; + lvl_a(x, y).set_noclamp_rgba(a, a, a, 255); + } + } + + lvl_rgb.set_alpha(255); + + alpha_slices.push_back(lvl_rgb); + new_mip_indices.push_back(i); + + alpha_slices.push_back(lvl_a); + new_mip_indices.push_back(i); + } + + slices.swap(alpha_slices); + mip_indices.swap(new_mip_indices); + } + + assert(slices.size() == mip_indices.size()); + + for (uint32_t slice_index = 0; slice_index < slices.size(); slice_index++) + { + image& slice_image = slices[slice_index]; + const uint32_t orig_width = slice_image.get_width(); + const uint32_t orig_height = slice_image.get_height(); + + bool is_alpha_slice = false; + if (m_any_source_image_has_alpha) + { + if (m_params.m_uastc) + { + is_alpha_slice = slice_image.has_alpha(); + } + else + { + is_alpha_slice = (slice_index & 1) != 0; + } + } + + // Enlarge the source image to 4x4 block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks. + slice_image.crop_dup_borders(slice_image.get_block_width(4) * 4, slice_image.get_block_height(4) * 4); + + if (m_params.m_debug_images) + { + save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), slice_image); + } + + enlarge_vector(m_stats, 1); + enlarge_vector(m_slice_images, 1); + enlarge_vector(m_slice_descs, 1); + + const uint32_t dest_image_index = (uint32_t)m_stats.size() - 1; + + m_stats[dest_image_index].m_filename = source_filename.c_str(); + m_stats[dest_image_index].m_width = orig_width; + m_stats[dest_image_index].m_height = orig_height; + + m_slice_images[dest_image_index] = slice_image; + + debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), orig_width, orig_height, slice_image.get_width(), slice_image.get_height()); + + basisu_backend_slice_desc &slice_desc = m_slice_descs[dest_image_index]; + + slice_desc.m_first_block_index = m_total_blocks; + + slice_desc.m_orig_width = orig_width; + slice_desc.m_orig_height = orig_height; + + slice_desc.m_width = slice_image.get_width(); + slice_desc.m_height = slice_image.get_height(); + + slice_desc.m_num_blocks_x = slice_image.get_block_width(4); + slice_desc.m_num_blocks_y = slice_image.get_block_height(4); + + slice_desc.m_num_macroblocks_x = (slice_desc.m_num_blocks_x + 1) >> 1; + slice_desc.m_num_macroblocks_y = (slice_desc.m_num_blocks_y + 1) >> 1; + + slice_desc.m_source_file_index = source_file_index; + + slice_desc.m_mip_index = mip_indices[slice_index]; + + slice_desc.m_alpha = is_alpha_slice; + slice_desc.m_iframe = false; + if (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) + { + slice_desc.m_iframe = (source_file_index == 0); + } + + m_total_blocks += slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; + total_macroblocks += slice_desc.m_num_macroblocks_x * slice_desc.m_num_macroblocks_y; + + } // slice_index + + } // source_file_index + + debug_printf("Total blocks: %u, Total macroblocks: %u\n", m_total_blocks, total_macroblocks); + + // Make sure we don't have too many slices + if (m_slice_descs.size() > BASISU_MAX_SLICES) + { + 
error_printf("Too many slices!\n"); + return false; + } + + // Basic sanity check on the slices + for (uint32_t i = 1; i < m_slice_descs.size(); i++) + { + const basisu_backend_slice_desc &prev_slice_desc = m_slice_descs[i - 1]; + const basisu_backend_slice_desc &slice_desc = m_slice_descs[i]; + + // Make sure images are in order + int image_delta = (int)slice_desc.m_source_file_index - (int)prev_slice_desc.m_source_file_index; + if (image_delta > 1) + return false; + + // Make sure mipmap levels are in order + if (!image_delta) + { + int level_delta = (int)slice_desc.m_mip_index - (int)prev_slice_desc.m_mip_index; + if (level_delta > 1) + return false; + } + } + + if (m_params.m_status_output) + { + printf("Total basis file slices: %u\n", (uint32_t)m_slice_descs.size()); + } + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + const basisu_backend_slice_desc &slice_desc = m_slice_descs[i]; + + if (m_params.m_status_output) + { + printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u, iframe: %u\n", + i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe); + } + + if (m_any_source_image_has_alpha) + { + if (!m_params.m_uastc) + { + // For ETC1S, alpha slices must be at odd slice indices. + if (slice_desc.m_alpha) + { + if ((i & 1) == 0) + return false; + + const basisu_backend_slice_desc& prev_slice_desc = m_slice_descs[i - 1]; + + // Make sure previous slice has this image's color data + if (prev_slice_desc.m_source_file_index != slice_desc.m_source_file_index) + return false; + if (prev_slice_desc.m_alpha) + return false; + if (prev_slice_desc.m_mip_index != slice_desc.m_mip_index) + return false; + if (prev_slice_desc.m_num_blocks_x != slice_desc.m_num_blocks_x) + return false; + if (prev_slice_desc.m_num_blocks_y != slice_desc.m_num_blocks_y) + return false; + } + else if (i & 1) + return false; + } + } + else if (slice_desc.m_alpha) + { + return false; + } + + if ((slice_desc.m_orig_width > slice_desc.m_width) || (slice_desc.m_orig_height > slice_desc.m_height)) + return false; + if ((slice_desc.m_source_file_index == 0) && (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)) + { + if (!slice_desc.m_iframe) + return false; + } + } + + return true; + } + + // Do some basic validation for 2D arrays, cubemaps, video, and volumes. + bool basis_compressor::validate_texture_type_constraints() + { + debug_printf("basis_compressor::validate_texture_type_constraints\n"); + + // In 2D mode anything goes (each image may have a different resolution and # of mipmap levels). + if (m_params.m_tex_type == basist::cBASISTexType2D) + return true; + + uint32_t total_basis_images = 0; + + for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) + { + const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; + + total_basis_images = maximum<uint32_t>(total_basis_images, slice_desc.m_source_file_index + 1); + } + + if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) + { + // For cubemaps, validate that the total # of Basis images is a multiple of 6. 
+ if ((total_basis_images % 6) != 0) + { + error_printf("basis_compressor::validate_texture_type_constraints: For cubemaps the total number of input images is not a multiple of 6!\n"); + return false; + } + } + + // Now validate that all the mip0's have the same dimensions, and that each image has the same # of mipmap levels. + uint_vec image_mipmap_levels(total_basis_images); + + int width = -1, height = -1; + for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) + { + const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; + + image_mipmap_levels[slice_desc.m_source_file_index] = maximum(image_mipmap_levels[slice_desc.m_source_file_index], slice_desc.m_mip_index + 1); + + if (slice_desc.m_mip_index != 0) + continue; + + if (width < 0) + { + width = slice_desc.m_orig_width; + height = slice_desc.m_orig_height; + } + else if ((width != (int)slice_desc.m_orig_width) || (height != (int)slice_desc.m_orig_height)) + { + error_printf("basis_compressor::validate_texture_type_constraints: The source image resolutions are not all equal!\n"); + return false; + } + } + + for (size_t i = 1; i < image_mipmap_levels.size(); i++) + { + if (image_mipmap_levels[0] != image_mipmap_levels[i]) + { + error_printf("basis_compressor::validate_texture_type_constraints: Each image must have the same number of mipmap levels!\n"); + return false; + } + } + + return true; + } + + bool basis_compressor::extract_source_blocks() + { + debug_printf("basis_compressor::extract_source_blocks\n"); + + m_source_blocks.resize(m_total_blocks); + + for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + + const uint32_t num_blocks_x = slice_desc.m_num_blocks_x; + const uint32_t num_blocks_y = slice_desc.m_num_blocks_y; + + const image& source_image = m_slice_images[slice_index]; + + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + source_image.extract_block_clamped(m_source_blocks[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(), block_x * 4, block_y * 4, 4, 4); + } + + return true; + } + + bool basis_compressor::process_frontend() + { + debug_printf("basis_compressor::process_frontend\n"); + +#if 0 + // TODO + basis_etc1_pack_params pack_params; + pack_params.m_quality = cETCQualityMedium; + pack_params.m_perceptual = m_params.m_perceptual; + pack_params.m_use_color4 = false; + + pack_etc1_block_context pack_context; + + std::unordered_set<uint64_t> endpoint_hash; + std::unordered_set<uint32_t> selector_hash; + + for (uint32_t i = 0; i < m_source_blocks.size(); i++) + { + etc_block blk; + pack_etc1_block(blk, m_source_blocks[i].get_ptr(), pack_params, pack_context); + + const color_rgba c0(blk.get_block_color(0, false)); + endpoint_hash.insert((c0.r | (c0.g << 5) | (c0.b << 10)) | (blk.get_inten_table(0) << 16)); + + const color_rgba c1(blk.get_block_color(1, false)); + endpoint_hash.insert((c1.r | (c1.g << 5) | (c1.b << 10)) | (blk.get_inten_table(1) << 16)); + + selector_hash.insert(blk.get_raw_selector_bits()); + } + + const uint32_t total_unique_endpoints = (uint32_t)endpoint_hash.size(); + const uint32_t total_unique_selectors = (uint32_t)selector_hash.size(); + + if (m_params.m_debug) + { + debug_printf("Unique endpoints: %u, unique selectors: %u\n", total_unique_endpoints, total_unique_selectors); + } +#endif + + const double total_texels = m_total_blocks * 
16.0f; + + int endpoint_clusters = m_params.m_max_endpoint_clusters; + int selector_clusters = m_params.m_max_selector_clusters; + + if (endpoint_clusters > basisu_frontend::cMaxEndpointClusters) + { + error_printf("Too many endpoint clusters! (%u but max is %u)\n", endpoint_clusters, basisu_frontend::cMaxEndpointClusters); + return false; + } + if (selector_clusters > basisu_frontend::cMaxSelectorClusters) + { + error_printf("Too many selector clusters! (%u but max is %u)\n", selector_clusters, basisu_frontend::cMaxSelectorClusters); + return false; + } + + if (m_params.m_quality_level != -1) + { + const float quality = saturate(m_params.m_quality_level / 255.0f); + + const float bits_per_endpoint_cluster = 14.0f; + const float max_desired_endpoint_cluster_bits_per_texel = 1.0f; // .15f + int max_endpoints = static_cast<int>((max_desired_endpoint_cluster_bits_per_texel * total_texels) / bits_per_endpoint_cluster); + + const float mid = 128.0f / 255.0f; + + float color_endpoint_quality = quality; + + const float endpoint_split_point = 0.5f; + + // In v1.2 and in previous versions, the endpoint codebook size at quality 128 was 3072. This wasn't quite large enough. + const int ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE = 4800; + const int MAX_ENDPOINT_CODEBOOK_SIZE = 8192; + + if (color_endpoint_quality <= mid) + { + color_endpoint_quality = lerp(0.0f, endpoint_split_point, powf(color_endpoint_quality / mid, .65f)); + + max_endpoints = clamp<int>(max_endpoints, 256, ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE); + max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks); + + if (max_endpoints < 64) + max_endpoints = 64; + endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(32, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters); + } + else + { + color_endpoint_quality = powf((color_endpoint_quality - mid) / (1.0f - mid), 1.6f); + + max_endpoints = clamp<int>(max_endpoints, 256, MAX_ENDPOINT_CODEBOOK_SIZE); + max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks); + + if (max_endpoints < ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE) + max_endpoints = ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE; + endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters); + } + + float bits_per_selector_cluster = m_params.m_global_sel_pal ? 
21.0f : 14.0f; + + const float max_desired_selector_cluster_bits_per_texel = 1.0f; // .15f + int max_selectors = static_cast<int>((max_desired_selector_cluster_bits_per_texel * total_texels) / bits_per_selector_cluster); + max_selectors = clamp<int>(max_selectors, 256, basisu_frontend::cMaxSelectorClusters); + max_selectors = minimum<uint32_t>(max_selectors, m_total_blocks); + + float color_selector_quality = quality; + //color_selector_quality = powf(color_selector_quality, 1.65f); + color_selector_quality = powf(color_selector_quality, 2.62f); + + if (max_selectors < 96) + max_selectors = 96; + selector_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(96, static_cast<float>(max_selectors), color_selector_quality)), 8, basisu_frontend::cMaxSelectorClusters); + + debug_printf("Max endpoints: %u, max selectors: %u\n", endpoint_clusters, selector_clusters); + + if (m_params.m_quality_level >= 223) + { + if (!m_params.m_selector_rdo_thresh.was_changed()) + { + if (!m_params.m_endpoint_rdo_thresh.was_changed()) + m_params.m_endpoint_rdo_thresh *= .25f; + + if (!m_params.m_selector_rdo_thresh.was_changed()) + m_params.m_selector_rdo_thresh *= .25f; + } + } + else if (m_params.m_quality_level >= 192) + { + if (!m_params.m_endpoint_rdo_thresh.was_changed()) + m_params.m_endpoint_rdo_thresh *= .5f; + + if (!m_params.m_selector_rdo_thresh.was_changed()) + m_params.m_selector_rdo_thresh *= .5f; + } + else if (m_params.m_quality_level >= 160) + { + if (!m_params.m_endpoint_rdo_thresh.was_changed()) + m_params.m_endpoint_rdo_thresh *= .75f; + + if (!m_params.m_selector_rdo_thresh.was_changed()) + m_params.m_selector_rdo_thresh *= .75f; + } + else if (m_params.m_quality_level >= 129) + { + float l = (quality - 129 / 255.0f) / ((160 - 129) / 255.0f); + + if (!m_params.m_endpoint_rdo_thresh.was_changed()) + m_params.m_endpoint_rdo_thresh *= lerp<float>(1.0f, .75f, l); + + if (!m_params.m_selector_rdo_thresh.was_changed()) + m_params.m_selector_rdo_thresh *= lerp<float>(1.0f, .75f, l); + } + } + + m_auto_global_sel_pal = false; + if (!m_params.m_global_sel_pal && m_params.m_auto_global_sel_pal) + { + const float bits_per_selector_cluster = 31.0f; + double selector_codebook_bpp_est = (bits_per_selector_cluster * selector_clusters) / total_texels; + debug_printf("selector_codebook_bpp_est: %f\n", selector_codebook_bpp_est); + const float force_global_sel_pal_bpp_threshold = .15f; + if ((total_texels <= 128.0f*128.0f) && (selector_codebook_bpp_est > force_global_sel_pal_bpp_threshold)) + { + m_auto_global_sel_pal = true; + debug_printf("Auto global selector palette enabled\n"); + } + } + + basisu_frontend::params p; + p.m_num_source_blocks = m_total_blocks; + p.m_pSource_blocks = &m_source_blocks[0]; + p.m_max_endpoint_clusters = endpoint_clusters; + p.m_max_selector_clusters = selector_clusters; + p.m_perceptual = m_params.m_perceptual; + p.m_debug_stats = m_params.m_debug; + p.m_debug_images = m_params.m_debug_images; + p.m_compression_level = m_params.m_compression_level; + p.m_tex_type = m_params.m_tex_type; + p.m_multithreaded = m_params.m_multithreading; + p.m_disable_hierarchical_endpoint_codebooks = m_params.m_disable_hierarchical_endpoint_codebooks; + p.m_validate = m_params.m_validate; + p.m_pJob_pool = m_params.m_pJob_pool; + p.m_pGlobal_codebooks = m_params.m_pGlobal_codebooks; + + if ((m_params.m_global_sel_pal) || (m_auto_global_sel_pal)) + { + p.m_pGlobal_sel_codebook = m_params.m_pSel_codebook; + p.m_num_global_sel_codebook_pal_bits = m_params.m_global_pal_bits; + 
p.m_num_global_sel_codebook_mod_bits = m_params.m_global_mod_bits; + p.m_use_hybrid_selector_codebooks = !m_params.m_no_hybrid_sel_cb; + p.m_hybrid_codebook_quality_thresh = m_params.m_hybrid_sel_cb_quality_thresh; + } + + if (!m_frontend.init(p)) + { + error_printf("basisu_frontend::init() failed!\n"); + return false; + } + + m_frontend.compress(); + + if (m_params.m_debug_images) + { + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + char filename[1024]; +#ifdef _WIN32 + sprintf_s(filename, sizeof(filename), "rdo_frontend_output_output_blocks_%u.png", i); +#else + snprintf(filename, sizeof(filename), "rdo_frontend_output_output_blocks_%u.png", i); +#endif + m_frontend.dump_debug_image(filename, m_slice_descs[i].m_first_block_index, m_slice_descs[i].m_num_blocks_x, m_slice_descs[i].m_num_blocks_y, true); + +#ifdef _WIN32 + sprintf_s(filename, sizeof(filename), "rdo_frontend_output_api_%u.png", i); +#else + snprintf(filename, sizeof(filename), "rdo_frontend_output_api_%u.png", i); +#endif + m_frontend.dump_debug_image(filename, m_slice_descs[i].m_first_block_index, m_slice_descs[i].m_num_blocks_x, m_slice_descs[i].m_num_blocks_y, false); + } + } + + return true; + } + + bool basis_compressor::extract_frontend_texture_data() + { + debug_printf("basis_compressor::extract_frontend_texture_data\n"); + + m_frontend_output_textures.resize(m_slice_descs.size()); + m_best_etc1s_images.resize(m_slice_descs.size()); + m_best_etc1s_images_unpacked.resize(m_slice_descs.size()); + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + const basisu_backend_slice_desc &slice_desc = m_slice_descs[i]; + + const uint32_t num_blocks_x = slice_desc.m_num_blocks_x; + const uint32_t num_blocks_y = slice_desc.m_num_blocks_y; + + const uint32_t width = num_blocks_x * 4; + const uint32_t height = num_blocks_y * 4; + + m_frontend_output_textures[i].init(texture_format::cETC1, width, height); + + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + memcpy(m_frontend_output_textures[i].get_block_ptr(block_x, block_y, 0), &m_frontend.get_output_block(slice_desc.m_first_block_index + block_x + block_y * num_blocks_x), sizeof(etc_block)); + +#if 0 + if (m_params.m_debug_images) + { + char filename[1024]; + sprintf_s(filename, sizeof(filename), "rdo_etc_frontend_%u_", i); + write_etc1_vis_images(m_frontend_output_textures[i], filename); + } +#endif + + m_best_etc1s_images[i].init(texture_format::cETC1, width, height); + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) + memcpy(m_best_etc1s_images[i].get_block_ptr(block_x, block_y, 0), &m_frontend.get_etc1s_block(slice_desc.m_first_block_index + block_x + block_y * num_blocks_x), sizeof(etc_block)); + + m_best_etc1s_images[i].unpack(m_best_etc1s_images_unpacked[i]); + } + + return true; + } + + bool basis_compressor::process_backend() + { + debug_printf("basis_compressor::process_backend\n"); + + basisu_backend_params backend_params; + backend_params.m_debug = m_params.m_debug; + backend_params.m_debug_images = m_params.m_debug_images; + backend_params.m_etc1s = true; + backend_params.m_compression_level = m_params.m_compression_level; + + if (!m_params.m_no_endpoint_rdo) + backend_params.m_endpoint_rdo_quality_thresh = m_params.m_endpoint_rdo_thresh; + + if (!m_params.m_no_selector_rdo) + backend_params.m_selector_rdo_quality_thresh = m_params.m_selector_rdo_thresh; + + 
backend_params.m_use_global_sel_codebook = (m_frontend.get_params().m_pGlobal_sel_codebook != NULL); + backend_params.m_global_sel_codebook_pal_bits = m_frontend.get_params().m_num_global_sel_codebook_pal_bits; + backend_params.m_global_sel_codebook_mod_bits = m_frontend.get_params().m_num_global_sel_codebook_mod_bits; + backend_params.m_use_hybrid_sel_codebooks = m_frontend.get_params().m_use_hybrid_selector_codebooks; + backend_params.m_used_global_codebooks = m_frontend.get_params().m_pGlobal_codebooks != nullptr; + + m_backend.init(&m_frontend, backend_params, m_slice_descs, m_params.m_pSel_codebook); + uint32_t total_packed_bytes = m_backend.encode(); + + if (!total_packed_bytes) + { + error_printf("basis_compressor::encode() failed!\n"); + return false; + } + + debug_printf("Total packed bytes (estimated): %u\n", total_packed_bytes); + + return true; + } + + bool basis_compressor::create_basis_file_and_transcode() + { + debug_printf("basis_compressor::create_basis_file_and_transcode\n"); + + const basisu_backend_output& encoded_output = m_params.m_uastc ? m_uastc_backend_output : m_backend.get_output(); + + if (!m_basis_file.init(encoded_output, m_params.m_tex_type, m_params.m_userdata0, m_params.m_userdata1, m_params.m_y_flip, m_params.m_us_per_frame)) + { + error_printf("basis_compressor::create_basis_file_and_transcode: basisu_backend:init() failed!\n"); + return false; + } + + const uint8_vec &comp_data = m_basis_file.get_compressed_data(); + + m_output_basis_file = comp_data; + + interval_timer tm; + tm.start(); + + basist::basisu_transcoder_init(); + + debug_printf("basist::basisu_transcoder_init: Took %f ms\n", tm.get_elapsed_ms()); + + // Verify the compressed data by transcoding it to ASTC (or ETC1)/BC7 and validating the CRC's. + basist::basisu_transcoder decoder(m_params.m_pSel_codebook); + if (!decoder.validate_file_checksums(&comp_data[0], (uint32_t)comp_data.size(), true)) + { + error_printf("decoder.validate_file_checksums() failed!\n"); + return false; + } + + m_decoded_output_textures.resize(m_slice_descs.size()); + m_decoded_output_textures_unpacked.resize(m_slice_descs.size()); + + m_decoded_output_textures_bc7.resize(m_slice_descs.size()); + m_decoded_output_textures_unpacked_bc7.resize(m_slice_descs.size()); + + tm.start(); + if (m_params.m_pGlobal_codebooks) + { + decoder.set_global_codebooks(m_params.m_pGlobal_codebooks); + } + + if (!decoder.start_transcoding(&comp_data[0], (uint32_t)comp_data.size())) + { + error_printf("decoder.start_transcoding() failed!\n"); + return false; + } + + double start_transcoding_time = tm.get_elapsed_secs(); + + debug_printf("basisu_compressor::start_transcoding() took %3.3fms\n", start_transcoding_time * 1000.0f); + + uint32_t total_orig_pixels = 0; + uint32_t total_texels = 0; + + double total_time_etc1s_or_astc = 0; + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + gpu_image decoded_texture; + decoded_texture.init(m_params.m_uastc ? texture_format::cASTC4x4 : texture_format::cETC1, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + + tm.start(); + + basist::block_format format = m_params.m_uastc ? basist::block_format::cASTC_4x4 : basist::block_format::cETC1; + uint32_t bytes_per_block = m_params.m_uastc ? 
16 : 8; + + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, + reinterpret_cast<etc_block *>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, format, bytes_per_block)) + { + error_printf("Transcoding failed on slice %u!\n", i); + return false; + } + + total_time_etc1s_or_astc += tm.get_elapsed_secs(); + + if (encoded_output.m_tex_format == basist::basis_tex_format::cETC1S) + { + uint32_t image_crc16 = basist::crc16(decoded_texture.get_ptr(), decoded_texture.get_size_in_bytes(), 0); + if (image_crc16 != encoded_output.m_slice_image_crcs[i]) + { + error_printf("Decoded image data CRC check failed on slice %u!\n", i); + return false; + } + debug_printf("Decoded image data CRC check succeeded on slice %i\n", i); + } + + m_decoded_output_textures[i] = decoded_texture; + + total_orig_pixels += m_slice_descs[i].m_orig_width * m_slice_descs[i].m_orig_height; + total_texels += m_slice_descs[i].m_width * m_slice_descs[i].m_height; + } + + double total_time_bc7 = 0; + + if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) && + basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S)) + { + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + gpu_image decoded_texture; + decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + + tm.start(); + + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, + reinterpret_cast<etc_block*>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16)) + { + error_printf("Transcoding failed to BC7 on slice %u!\n", i); + return false; + } + + total_time_bc7 += tm.get_elapsed_secs(); + + m_decoded_output_textures_bc7[i] = decoded_texture; + } + } + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]); + + if (m_decoded_output_textures_bc7[i].get_pixel_width()) + m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]); + } + + debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_uastc ? "ASTC" : "ETC1", total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc); + + if (total_time_bc7 != 0) + debug_printf("Transcoded to BC7 in %3.3fms, %f texels/sec\n", total_time_bc7 * 1000.0f, total_orig_pixels / total_time_bc7); + + debug_printf("Total .basis output file size: %u, %3.3f bits/texel\n", comp_data.size(), comp_data.size() * 8.0f / total_orig_pixels); + + uint32_t total_orig_texels = 0; + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; + + total_orig_texels += slice_desc.m_orig_width * slice_desc.m_orig_height; + + const uint32_t total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; + BASISU_NOTE_UNUSED(total_blocks); + + assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks); + } + + m_basis_file_size = (uint32_t)comp_data.size(); + m_basis_bits_per_texel = (comp_data.size() * 8.0f) / total_orig_texels; + + return true; + } + + bool basis_compressor::write_output_files_and_compute_stats() + { + debug_printf("basis_compressor::write_output_files_and_compute_stats\n"); + + const uint8_vec& comp_data = m_params.m_create_ktx2_file ? 
m_output_ktx2_file : m_basis_file.get_compressed_data(); + if (m_params.m_write_output_basis_files) + { + const std::string& output_filename = m_params.m_out_filename; + + if (!write_vec_to_file(output_filename.c_str(), comp_data)) + { + error_printf("Failed writing output data to file \"%s\"\n", output_filename.c_str()); + return false; + } + + printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str()); + } + + size_t comp_size = 0; + if ((m_params.m_compute_stats) && (m_params.m_uastc) && (comp_data.size())) + { + void* pComp_data = tdefl_compress_mem_to_heap(&comp_data[0], comp_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES); + size_t decomp_size = 0; + void* pDecomp_data = tinfl_decompress_mem_to_heap(pComp_data, comp_size, &decomp_size, 0); + if ((decomp_size != comp_data.size()) || (memcmp(pDecomp_data, &comp_data[0], decomp_size) != 0)) + { + printf("basis_compressor::create_basis_file_and_transcode:: miniz compression or decompression failed!\n"); + return false; + } + + mz_free(pComp_data); + mz_free(pDecomp_data); + + uint32_t total_texels = 0; + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + total_texels += (m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y) * 16; + + m_basis_bits_per_texel = comp_size * 8.0f / total_texels; + + debug_printf(".basis file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n", + (uint32_t)comp_data.size(), + (uint32_t)comp_size, + m_basis_bits_per_texel); + } + + m_stats.resize(m_slice_descs.size()); + + uint32_t total_orig_texels = 0; + + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; + + total_orig_texels += slice_desc.m_orig_width * slice_desc.m_orig_height; + + if (m_params.m_compute_stats) + { + printf("Slice: %u\n", slice_index); + + image_stats &s = m_stats[slice_index]; + + // TODO: We used to output SSIM (during heavy encoder development), but this slowed down compression too much. We'll be adding it back. 
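For UASTC output the bits/texel statistic computed above is not taken from the raw file size (UASTC blocks are a fixed 16 bytes, i.e. 8 bpp); the code deflates the whole output with the bundled miniz, verifies the round trip, and divides the compressed bit count by the texel count. The arithmetic reduces to the sketch below, where the compressed size is an assumed placeholder value rather than a real measurement.

```
// Sketch: the UASTC "bits per texel" statistic from
// write_output_files_and_compute_stats(). Each 4x4 block covers 16 texels;
// comp_size stands in for the deflate-compressed size of the output file.
#include <cstdint>
#include <cstdio>

int main()
{
    // Illustrative single slice: 1024x1024 -> 256x256 blocks.
    const uint32_t num_blocks_x = 256, num_blocks_y = 256;
    const uint32_t total_texels = num_blocks_x * num_blocks_y * 16;

    const size_t raw_uastc_size = (size_t)num_blocks_x * num_blocks_y * 16; // 16 bytes per UASTC block
    const size_t comp_size = 655360;                                        // assumed deflate output size

    const float raw_bpt = raw_uastc_size * 8.0f / total_texels; // always 8.0 for raw UASTC
    const float lz_bpt  = comp_size * 8.0f / total_texels;      // 5.0 in this made-up example

    printf("raw: %.2f bits/texel, LZ-compressed: %.2f bits/texel\n", raw_bpt, lz_bpt);
    return 0;
}
```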
+ + image_metrics em; + + // ---- .basis stats + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3); + em.print(".basis RGB Avg: "); + s.m_basis_rgb_avg_psnr = em.m_psnr; + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4); + em.print(".basis RGBA Avg: "); + s.m_basis_rgba_avg_psnr = em.m_psnr; + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1); + em.print(".basis R Avg: "); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1); + em.print(".basis G Avg: "); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1); + em.print(".basis B Avg: "); + + if (m_params.m_uastc) + { + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1); + em.print(".basis A Avg: "); + + s.m_basis_a_avg_psnr = em.m_psnr; + } + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0); + em.print(".basis 709 Luma: "); + s.m_basis_luma_709_psnr = static_cast<float>(em.m_psnr); + s.m_basis_luma_709_ssim = static_cast<float>(em.m_ssim); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true); + em.print(".basis 601 Luma: "); + s.m_basis_luma_601_psnr = static_cast<float>(em.m_psnr); + + if (m_slice_descs.size() == 1) + { + const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size(); + debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + } + + if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width()) + { + // ---- BC7 stats + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3); + em.print("BC7 RGB Avg: "); + s.m_bc7_rgb_avg_psnr = em.m_psnr; + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4); + em.print("BC7 RGBA Avg: "); + s.m_bc7_rgba_avg_psnr = em.m_psnr; + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1); + em.print("BC7 R Avg: "); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1); + em.print("BC7 G Avg: "); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1); + em.print("BC7 B Avg: "); + + if (m_params.m_uastc) + { + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1); + em.print("BC7 A Avg: "); + + s.m_bc7_a_avg_psnr = em.m_psnr; + } + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0); + em.print("BC7 709 Luma: "); + s.m_bc7_luma_709_psnr = static_cast<float>(em.m_psnr); + s.m_bc7_luma_709_ssim = static_cast<float>(em.m_ssim); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true); + em.print("BC7 601 Luma: "); + s.m_bc7_luma_601_psnr = static_cast<float>(em.m_psnr); + } + + if (!m_params.m_uastc) + { + // ---- Nearly best possible ETC1S stats + em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0); + em.print("Unquantized 
ETC1S 709 Luma: "); + + s.m_best_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr); + s.m_best_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim); + + em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true); + em.print("Unquantized ETC1S 601 Luma: "); + + s.m_best_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr); + + em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3); + em.print("Unquantized ETC1S RGB Avg: "); + + s.m_best_etc1s_rgb_avg_psnr = static_cast<float>(em.m_psnr); + } + } + + std::string out_basename; + if (m_params.m_out_filename.size()) + string_get_filename(m_params.m_out_filename.c_str(), out_basename); + else if (m_params.m_source_filenames.size()) + string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename); + + string_remove_extension(out_basename); + out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index); + + if ((!m_params.m_uastc) && (m_frontend.get_params().m_debug_images)) + { + // Write "best" ETC1S debug images + if (!m_params.m_uastc) + { + gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]); + best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image); + + image best_etc1s_unpacked; + best_etc1s_gpu_image.unpack(best_etc1s_unpacked); + save_png(out_basename + "_best_etc1s.png", best_etc1s_unpacked); + } + } + + if (m_params.m_debug_images) + { + // Write decoded ETC1S/ASTC debug images + { + gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]); + decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc); + + image temp(m_decoded_output_textures_unpacked[slice_index]); + temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); + save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp); + } + + // Write decoded BC7 debug images + if (m_decoded_output_textures_bc7[slice_index].get_pixel_width()) + { + gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]); + decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + write_compressed_texture_file((out_basename + "_transcoded_bc7.ktx").c_str(), decoded_bc7); + + image temp(m_decoded_output_textures_unpacked_bc7[slice_index]); + temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); + save_png(out_basename + "_transcoded_bc7.png", temp); + } + } + } + + return true; + } + + // Make sure all the mip 0's have the same dimensions and number of mipmap levels, or we can't encode the KTX2 file. 
+ bool basis_compressor::validate_ktx2_constraints() + { + uint32_t base_width = 0, base_height = 0; + uint32_t total_layers = 0; + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + if (m_slice_descs[i].m_mip_index == 0) + { + if (!base_width) + { + base_width = m_slice_descs[i].m_orig_width; + base_height = m_slice_descs[i].m_orig_height; + } + else + { + if ((m_slice_descs[i].m_orig_width != base_width) || (m_slice_descs[i].m_orig_height != base_height)) + { + return false; + } + } + + total_layers = maximum<uint32_t>(total_layers, m_slice_descs[i].m_source_file_index + 1); + } + } + + basisu::vector<uint32_t> total_mips(total_layers); + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + total_mips[m_slice_descs[i].m_source_file_index] = maximum<uint32_t>(total_mips[m_slice_descs[i].m_source_file_index], m_slice_descs[i].m_mip_index + 1); + + for (uint32_t i = 1; i < total_layers; i++) + { + if (total_mips[0] != total_mips[i]) + { + return false; + } + } + + return true; + } + + static uint8_t g_ktx2_etc1s_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + static uint8_t g_ktx2_etc1s_alpha_dfd[60] = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + static uint8_t g_ktx2_uastc_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x4,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + static uint8_t g_ktx2_uastc_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + + void basis_compressor::get_dfd(uint8_vec &dfd, const basist::ktx2_header &header) + { + const uint8_t* pDFD; + uint32_t dfd_len; + + if (m_params.m_uastc) + { + if (m_any_source_image_has_alpha) + { + pDFD = g_ktx2_uastc_alpha_dfd; + dfd_len = sizeof(g_ktx2_uastc_alpha_dfd); + } + else + { + pDFD = g_ktx2_uastc_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_uastc_nonalpha_dfd); + } + } + else + { + if (m_any_source_image_has_alpha) + { + pDFD = g_ktx2_etc1s_alpha_dfd; + dfd_len = sizeof(g_ktx2_etc1s_alpha_dfd); + } + else + { + pDFD = g_ktx2_etc1s_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_etc1s_nonalpha_dfd); + } + } + + assert(dfd_len >= 44); + + dfd.resize(dfd_len); + memcpy(dfd.data(), pDFD, dfd_len); + + uint32_t dfd_bits = basisu::read_le_dword(dfd.data() + 3 * sizeof(uint32_t)); + + dfd_bits &= ~(0xFF << 16); + + if (m_params.m_ktx2_srgb_transfer_func) + dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_SRGB << 16); + else + dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_LINEAR << 16); + + basisu::write_le_dword(dfd.data() + 3 * sizeof(uint32_t), dfd_bits); + + if (header.m_supercompression_scheme != basist::KTX2_SS_NONE) + { + uint32_t plane_bits = basisu::read_le_dword(dfd.data() + 5 * sizeof(uint32_t)); + + plane_bits &= ~0xFF; + + basisu::write_le_dword(dfd.data() + 5 * sizeof(uint32_t), plane_bits); + } + + // Fix up the DFD channel(s) + uint32_t dfd_chan0 = basisu::read_le_dword(dfd.data() + 7 * sizeof(uint32_t)); + + if (m_params.m_uastc) + { + dfd_chan0 &= ~(0xF << 24); + + // TODO: Allow the caller to 
override this + if (m_any_source_image_has_alpha) + dfd_chan0 |= (basist::KTX2_DF_CHANNEL_UASTC_RGBA << 24); + else + dfd_chan0 |= (basist::KTX2_DF_CHANNEL_UASTC_RGB << 24); + } + + basisu::write_le_dword(dfd.data() + 7 * sizeof(uint32_t), dfd_chan0); + } + + bool basis_compressor::create_ktx2_file() + { + if (m_params.m_uastc) + { + if ((m_params.m_ktx2_uastc_supercompression != basist::KTX2_SS_NONE) && (m_params.m_ktx2_uastc_supercompression != basist::KTX2_SS_ZSTANDARD)) + return false; + } + + const basisu_backend_output& backend_output = m_backend.get_output(); + + // Determine the width/height, number of array layers, mipmap levels, and the number of faces (1 for 2D, 6 for cubemap). + // This does not support 1D or 3D. + uint32_t base_width = 0, base_height = 0, total_layers = 0, total_levels = 0, total_faces = 1; + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + if ((m_slice_descs[i].m_mip_index == 0) && (!base_width)) + { + base_width = m_slice_descs[i].m_orig_width; + base_height = m_slice_descs[i].m_orig_height; + } + + total_layers = maximum<uint32_t>(total_layers, m_slice_descs[i].m_source_file_index + 1); + + if (!m_slice_descs[i].m_source_file_index) + total_levels = maximum<uint32_t>(total_levels, m_slice_descs[i].m_mip_index + 1); + } + + if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) + { + assert((total_layers % 6) == 0); + + total_layers /= 6; + assert(total_layers >= 1); + + total_faces = 6; + } + + basist::ktx2_header header; + memset(&header, 0, sizeof(header)); + + memcpy(header.m_identifier, basist::g_ktx2_file_identifier, sizeof(basist::g_ktx2_file_identifier)); + header.m_pixel_width = base_width; + header.m_pixel_height = base_height; + header.m_face_count = total_faces; + header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED; + header.m_type_size = 1; + header.m_level_count = total_levels; + header.m_layer_count = (total_layers > 1) ? total_layers : 0; + + if (m_params.m_uastc) + { + switch (m_params.m_ktx2_uastc_supercompression) + { + case basist::KTX2_SS_NONE: + { + header.m_supercompression_scheme = basist::KTX2_SS_NONE; + break; + } + case basist::KTX2_SS_ZSTANDARD: + { +#if BASISD_SUPPORT_KTX2_ZSTD + header.m_supercompression_scheme = basist::KTX2_SS_ZSTANDARD; +#else + header.m_supercompression_scheme = basist::KTX2_SS_NONE; +#endif + break; + } + default: assert(0); return false; + } + } + + basisu::vector<uint8_vec> level_data_bytes(total_levels); + basisu::vector<uint8_vec> compressed_level_data_bytes(total_levels); + uint_vec slice_level_offsets(m_slice_descs.size()); + + // This will append the texture data in the correct order (for each level: layer, then face). 
+ for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + + slice_level_offsets[slice_index] = level_data_bytes[slice_desc.m_mip_index].size(); + + if (m_params.m_uastc) + append_vector(level_data_bytes[slice_desc.m_mip_index], m_uastc_backend_output.m_slice_image_data[slice_index]); + else + append_vector(level_data_bytes[slice_desc.m_mip_index], backend_output.m_slice_image_data[slice_index]); + } + + // UASTC supercompression + if ((m_params.m_uastc) && (header.m_supercompression_scheme == basist::KTX2_SS_ZSTANDARD)) + { +#if BASISD_SUPPORT_KTX2_ZSTD + for (uint32_t level_index = 0; level_index < total_levels; level_index++) + { + compressed_level_data_bytes[level_index].resize(ZSTD_compressBound(level_data_bytes[level_index].size())); + + size_t result = ZSTD_compress(compressed_level_data_bytes[level_index].data(), compressed_level_data_bytes[level_index].size(), + level_data_bytes[level_index].data(), level_data_bytes[level_index].size(), + m_params.m_ktx2_zstd_supercompression_level); + + if (ZSTD_isError(result)) + return false; + + compressed_level_data_bytes[level_index].resize(result); + } +#else + // Can't get here + assert(0); + return false; +#endif + } + else + { + // No supercompression + compressed_level_data_bytes = level_data_bytes; + } + + uint8_vec etc1s_global_data; + + // Create ETC1S global supercompressed data + if (!m_params.m_uastc) + { + basist::ktx2_etc1s_global_data_header etc1s_global_data_header; + clear_obj(etc1s_global_data_header); + + etc1s_global_data_header.m_endpoint_count = backend_output.m_num_endpoints; + etc1s_global_data_header.m_selector_count = backend_output.m_num_selectors; + etc1s_global_data_header.m_endpoints_byte_length = backend_output.m_endpoint_palette.size(); + etc1s_global_data_header.m_selectors_byte_length = backend_output.m_selector_palette.size(); + etc1s_global_data_header.m_tables_byte_length = backend_output.m_slice_image_tables.size(); + + basisu::vector<basist::ktx2_etc1s_image_desc> etc1s_image_descs(total_levels * total_layers * total_faces); + memset(etc1s_image_descs.data(), 0, etc1s_image_descs.size_in_bytes()); + + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + + const uint32_t level_index = slice_desc.m_mip_index; + uint32_t layer_index = slice_desc.m_source_file_index; + uint32_t face_index = 0; + + if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) + { + face_index = layer_index % 6; + layer_index /= 6; + } + + const uint32_t etc1s_image_index = level_index * (total_layers * total_faces) + layer_index * total_faces + face_index; + + if (slice_desc.m_alpha) + { + etc1s_image_descs[etc1s_image_index].m_alpha_slice_byte_length = backend_output.m_slice_image_data[slice_index].size(); + etc1s_image_descs[etc1s_image_index].m_alpha_slice_byte_offset = slice_level_offsets[slice_index]; + } + else + { + if (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) + etc1s_image_descs[etc1s_image_index].m_image_flags = !slice_desc.m_iframe ? 
basist::KTX2_IMAGE_IS_P_FRAME : 0; + + etc1s_image_descs[etc1s_image_index].m_rgb_slice_byte_length = backend_output.m_slice_image_data[slice_index].size(); + etc1s_image_descs[etc1s_image_index].m_rgb_slice_byte_offset = slice_level_offsets[slice_index]; + } + } // slice_index + + append_vector(etc1s_global_data, (const uint8_t*)&etc1s_global_data_header, sizeof(etc1s_global_data_header)); + append_vector(etc1s_global_data, (const uint8_t*)etc1s_image_descs.data(), etc1s_image_descs.size_in_bytes()); + append_vector(etc1s_global_data, backend_output.m_endpoint_palette); + append_vector(etc1s_global_data, backend_output.m_selector_palette); + append_vector(etc1s_global_data, backend_output.m_slice_image_tables); + + header.m_supercompression_scheme = basist::KTX2_SS_BASISLZ; + } + + // Key values + basist::ktx2_transcoder::key_value_vec key_values(m_params.m_ktx2_key_values); + key_values.enlarge(1); + + const char* pKTXwriter = "KTXwriter"; + key_values.back().m_key.resize(strlen(pKTXwriter) + 1); + memcpy(key_values.back().m_key.data(), pKTXwriter, strlen(pKTXwriter) + 1); + + char writer_id[128]; +#ifdef _MSC_VER + sprintf_s(writer_id, sizeof(writer_id), "Basis Universal %s", BASISU_LIB_VERSION_STRING); +#else + snprintf(writer_id, sizeof(writer_id), "Basis Universal %s", BASISU_LIB_VERSION_STRING); +#endif + key_values.back().m_value.resize(strlen(writer_id) + 1); + memcpy(key_values.back().m_value.data(), writer_id, strlen(writer_id) + 1); + + key_values.sort(); + +#if BASISU_DISABLE_KTX2_KEY_VALUES + // HACK HACK - Clear the key values array, which causes no key values to be written (triggering the ktx2check validator bug). + key_values.clear(); +#endif + + uint8_vec key_value_data; + + // DFD + uint8_vec dfd; + get_dfd(dfd, header); + + const uint32_t kvd_file_offset = sizeof(header) + sizeof(basist::ktx2_level_index) * total_levels + dfd.size(); + + for (uint32_t pass = 0; pass < 2; pass++) + { + for (uint32_t i = 0; i < key_values.size(); i++) + { + if (key_values[i].m_key.size() < 2) + return false; + + if (key_values[i].m_key.back() != 0) + return false; + + const uint64_t total_len = (uint64_t)key_values[i].m_key.size() + (uint64_t)key_values[i].m_value.size(); + if (total_len >= UINT32_MAX) + return false; + + packed_uint<4> le_len((uint32_t)total_len); + append_vector(key_value_data, (const uint8_t*)&le_len, sizeof(le_len)); + + append_vector(key_value_data, key_values[i].m_key); + append_vector(key_value_data, key_values[i].m_value); + + const uint32_t ofs = key_value_data.size() & 3; + const uint32_t padding = (4 - ofs) & 3; + for (uint32_t p = 0; p < padding; p++) + key_value_data.push_back(0); + } + + if (header.m_supercompression_scheme != basist::KTX2_SS_NONE) + break; + +#if BASISU_DISABLE_KTX2_ALIGNMENT_WORKAROUND + break; +#endif + + // Hack to ensure the KVD block ends on a 16 byte boundary, because we have no other official way of aligning the data. + uint32_t kvd_end_file_offset = kvd_file_offset + key_value_data.size(); + uint32_t bytes_needed_to_pad = (16 - (kvd_end_file_offset & 15)) & 15; + if (!bytes_needed_to_pad) + { + // We're good. No need to add a dummy key. + break; + } + + assert(!pass); + if (pass) + return false; + + if (bytes_needed_to_pad < 6) + bytes_needed_to_pad += 16; + + printf("WARNING: Due to a KTX2 validator bug related to mipPadding, we must insert a dummy key into the KTX2 file of %u bytes\n", bytes_needed_to_pad); + + // We're not good - need to add a dummy key large enough to force file alignment so the mip level array gets aligned. 
+ // We can't just add some bytes before the mip level array because ktx2check will see that as extra data in the file that shouldn't be there in ktxValidator::validateDataSize(). + key_values.enlarge(1); + for (uint32_t i = 0; i < (bytes_needed_to_pad - 4 - 1 - 1); i++) + key_values.back().m_key.push_back(127); + + key_values.back().m_key.push_back(0); + + key_values.back().m_value.push_back(0); + + key_values.sort(); + + key_value_data.resize(0); + + // Try again + } + + basisu::vector<basist::ktx2_level_index> level_index_array(total_levels); + memset(level_index_array.data(), 0, level_index_array.size_in_bytes()); + + m_output_ktx2_file.clear(); + m_output_ktx2_file.reserve(m_output_basis_file.size()); + + // Dummy header + m_output_ktx2_file.resize(sizeof(header)); + + // Level index array + append_vector(m_output_ktx2_file, (const uint8_t*)level_index_array.data(), level_index_array.size_in_bytes()); + + // DFD + const uint8_t* pDFD = dfd.data(); + uint32_t dfd_len = dfd.size(); + + header.m_dfd_byte_offset = m_output_ktx2_file.size(); + header.m_dfd_byte_length = dfd_len; + append_vector(m_output_ktx2_file, pDFD, dfd_len); + + // Key value data + if (key_value_data.size()) + { + assert(kvd_file_offset == m_output_ktx2_file.size()); + + header.m_kvd_byte_offset = m_output_ktx2_file.size(); + header.m_kvd_byte_length = key_value_data.size(); + append_vector(m_output_ktx2_file, key_value_data); + } + + // Global Supercompressed Data + if (etc1s_global_data.size()) + { + uint32_t ofs = m_output_ktx2_file.size() & 7; + uint32_t padding = (8 - ofs) & 7; + for (uint32_t i = 0; i < padding; i++) + m_output_ktx2_file.push_back(0); + + header.m_sgd_byte_length = etc1s_global_data.size(); + header.m_sgd_byte_offset = m_output_ktx2_file.size(); + + append_vector(m_output_ktx2_file, etc1s_global_data); + } + + // mipPadding + if (header.m_supercompression_scheme == basist::KTX2_SS_NONE) + { + // We currently can't do this or the validator will incorrectly give an error. + uint32_t ofs = m_output_ktx2_file.size() & 15; + uint32_t padding = (16 - ofs) & 15; + + // Make sure we're always aligned here (due to a validator bug). + if (padding) + { + printf("Warning: KTX2 mip level data is not 16-byte aligned. This may trigger a ktx2check validation bug. Writing %u bytes of mipPadding.\n", padding); + } + + for (uint32_t i = 0; i < padding; i++) + m_output_ktx2_file.push_back(0); + } + + // Level data - write the smallest mipmap first. 
+ for (int level = total_levels - 1; level >= 0; level--) + { + level_index_array[level].m_byte_length = compressed_level_data_bytes[level].size(); + if (m_params.m_uastc) + level_index_array[level].m_uncompressed_byte_length = level_data_bytes[level].size(); + + level_index_array[level].m_byte_offset = m_output_ktx2_file.size(); + append_vector(m_output_ktx2_file, compressed_level_data_bytes[level]); + } + + // Write final header + memcpy(m_output_ktx2_file.data(), &header, sizeof(header)); + + // Write final level index array + memcpy(m_output_ktx2_file.data() + sizeof(header), level_index_array.data(), level_index_array.size_in_bytes()); + + debug_printf("Total .ktx2 output file size: %u\n", m_output_ktx2_file.size()); + + return true; + } + +} // namespace basisu diff --git a/thirdparty/basis_universal/basisu_comp.h b/thirdparty/basis_universal/encoder/basisu_comp.h index 1c201ddbed..2c3af968f7 100644 --- a/thirdparty/basis_universal/basisu_comp.h +++ b/thirdparty/basis_universal/encoder/basisu_comp.h @@ -1,5 +1,5 @@ // basisu_comp.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,8 +16,23 @@ #include "basisu_frontend.h" #include "basisu_backend.h" #include "basisu_basis_file.h" -#include "transcoder/basisu_global_selector_palette.h" -#include "transcoder/basisu_transcoder.h" +#include "../transcoder/basisu_global_selector_palette.h" +#include "../transcoder/basisu_transcoder.h" +#include "basisu_uastc_enc.h" + +#define BASISU_LIB_VERSION 115 +#define BASISU_LIB_VERSION_STRING "1.15" + +#ifndef BASISD_SUPPORT_KTX2 + #error BASISD_SUPPORT_KTX2 is undefined +#endif +#ifndef BASISD_SUPPORT_KTX2_ZSTD + #error BASISD_SUPPORT_KTX2_ZSTD is undefined +#endif + +#if !BASISD_SUPPORT_KTX2 + #error BASISD_SUPPORT_KTX2 must be enabled when building the encoder. 
To reduce code size if KTX2 support is not needed, set BASISD_SUPPORT_KTX2_ZSTD to 0 +#endif namespace basisu { @@ -40,6 +55,10 @@ namespace basisu const uint32_t BASISU_MAX_SLICES = 0xFFFFFF; + const int BASISU_RDO_UASTC_DICT_SIZE_DEFAULT = 4096; // 32768; + const int BASISU_RDO_UASTC_DICT_SIZE_MIN = 64; + const int BASISU_RDO_UASTC_DICT_SIZE_MAX = 65536; + struct image_stats { image_stats() @@ -52,43 +71,52 @@ namespace basisu m_filename.clear(); m_width = 0; m_height = 0; - - m_basis_etc1s_rgb_avg_psnr = 0.0f; - m_basis_etc1s_luma_709_psnr = 0.0f; - m_basis_etc1s_luma_601_psnr = 0.0f; - m_basis_etc1s_luma_709_ssim = 0.0f; - - m_basis_bc1_rgb_avg_psnr = 0.0f; - m_basis_bc1_luma_709_psnr = 0.0f; - m_basis_bc1_luma_601_psnr = 0.0f; - m_basis_bc1_luma_709_ssim = 0.0f; - - m_best_rgb_avg_psnr = 0.0f; - m_best_luma_709_psnr = 0.0f; - m_best_luma_601_psnr = 0.0f; - m_best_luma_709_ssim = 0.0f; + + m_basis_rgb_avg_psnr = 0.0f; + m_basis_rgba_avg_psnr = 0.0f; + m_basis_a_avg_psnr = 0.0f; + m_basis_luma_709_psnr = 0.0f; + m_basis_luma_601_psnr = 0.0f; + m_basis_luma_709_ssim = 0.0f; + + m_bc7_rgb_avg_psnr = 0.0f; + m_bc7_rgba_avg_psnr = 0.0f; + m_bc7_a_avg_psnr = 0.0f; + m_bc7_luma_709_psnr = 0.0f; + m_bc7_luma_601_psnr = 0.0f; + m_bc7_luma_709_ssim = 0.0f; + + m_best_etc1s_rgb_avg_psnr = 0.0f; + m_best_etc1s_luma_709_psnr = 0.0f; + m_best_etc1s_luma_601_psnr = 0.0f; + m_best_etc1s_luma_709_ssim = 0.0f; } std::string m_filename; uint32_t m_width; uint32_t m_height; - // .basis compressed - float m_basis_etc1s_rgb_avg_psnr; - float m_basis_etc1s_luma_709_psnr; - float m_basis_etc1s_luma_601_psnr; - float m_basis_etc1s_luma_709_ssim; + // .basis compressed (ETC1S or UASTC statistics) + float m_basis_rgb_avg_psnr; + float m_basis_rgba_avg_psnr; + float m_basis_a_avg_psnr; + float m_basis_luma_709_psnr; + float m_basis_luma_601_psnr; + float m_basis_luma_709_ssim; + + // BC7 statistics + float m_bc7_rgb_avg_psnr; + float m_bc7_rgba_avg_psnr; + float m_bc7_a_avg_psnr; + float m_bc7_luma_709_psnr; + float m_bc7_luma_601_psnr; + float m_bc7_luma_709_ssim; - float m_basis_bc1_rgb_avg_psnr; - float m_basis_bc1_luma_709_psnr; - float m_basis_bc1_luma_601_psnr; - float m_basis_bc1_luma_709_ssim; - - // Normal (highest quality) compressed ETC1S - float m_best_rgb_avg_psnr; - float m_best_luma_709_psnr; - float m_best_luma_601_psnr; - float m_best_luma_709_ssim; + // Highest achievable quality ETC1S statistics + float m_best_etc1s_rgb_avg_psnr; + float m_best_etc1s_luma_709_psnr; + float m_best_etc1s_luma_601_psnr; + float m_best_etc1s_luma_709_ssim; }; template<bool def> @@ -175,18 +203,30 @@ namespace basisu struct basis_compressor_params { basis_compressor_params() : + m_pSel_codebook(NULL), + m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL), + m_selector_rdo_thresh(BASISU_DEFAULT_SELECTOR_RDO_THRESH, 0.0f, 1e+10f), + m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f), m_hybrid_sel_cb_quality_thresh(BASISU_DEFAULT_HYBRID_SEL_CB_QUALITY_THRESH, 0.0f, 1e+10f), m_global_pal_bits(8, 0, ETC1_GLOBAL_SELECTOR_CODEBOOK_MAX_PAL_BITS), m_global_mod_bits(8, 0, basist::etc1_global_palette_entry_modifier::cTotalBits), - m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f), - m_selector_rdo_thresh(BASISU_DEFAULT_SELECTOR_RDO_THRESH, 0.0f, 1e+10f), - m_pSel_codebook(NULL), + m_mip_scale(1.0f, .000125f, 4.0f), + m_mip_smallest_dimension(1, 1, 16384), m_max_endpoint_clusters(512), m_max_selector_clusters(512), m_quality_level(-1), - 
m_mip_scale(1.0f, .000125f, 4.0f), - m_mip_smallest_dimension(1, 1, 16384), - m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL), + m_pack_uastc_flags(cPackUASTCLevelDefault), + m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f), + m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX), + m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f), + m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f), + m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f), + m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f), + m_resample_width(0, 1, 16384), + m_resample_height(0, 1, 16384), + m_resample_factor(0.0f, .00125f, 100.0f), + m_ktx2_uastc_supercompression(basist::KTX2_SS_NONE), + m_ktx2_zstd_supercompression_level(6, INT_MIN, INT_MAX), m_pJob_pool(nullptr) { clear(); @@ -196,15 +236,20 @@ namespace basisu { m_pSel_codebook = NULL; + m_uastc.clear(); + m_status_output.clear(); + m_source_filenames.clear(); m_source_alpha_filenames.clear(); m_source_images.clear(); + m_source_mipmap_images.clear(); m_out_filename.clear(); m_y_flip.clear(); m_debug.clear(); + m_validate.clear(); m_debug_images.clear(); m_global_sel_pal.clear(); m_auto_global_sel_pal.clear(); @@ -219,7 +264,11 @@ namespace basisu m_check_for_alpha.clear(); m_force_alpha.clear(); m_multithreading.clear(); - m_seperate_rg_to_color_alpha.clear(); + m_swizzle[0] = 0; + m_swizzle[1] = 1; + m_swizzle[2] = 2; + m_swizzle[3] = 3; + m_renormalize.clear(); m_hybrid_sel_cb_quality_thresh.clear(); m_global_pal_bits.clear(); m_global_mod_bits.clear(); @@ -236,6 +285,7 @@ namespace basisu m_mip_premultiplied.clear(); m_mip_renormalize.clear(); m_mip_wrapping.clear(); + m_mip_fast.clear(); m_mip_smallest_dimension.clear(); m_max_endpoint_clusters = 0; @@ -247,30 +297,63 @@ namespace basisu m_userdata1 = 0; m_us_per_frame = 0; + m_pack_uastc_flags = cPackUASTCLevelDefault; + m_rdo_uastc.clear(); + m_rdo_uastc_quality_scalar.clear(); + m_rdo_uastc_max_smooth_block_error_scale.clear(); + m_rdo_uastc_smooth_block_max_std_dev.clear(); + m_rdo_uastc_max_allowed_rms_increase_ratio.clear(); + m_rdo_uastc_skip_block_rms_thresh.clear(); + m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear(); + m_rdo_uastc_multithreading.clear(); + + m_resample_width.clear(); + m_resample_height.clear(); + m_resample_factor.clear(); + + m_pGlobal_codebooks = nullptr; + + m_create_ktx2_file.clear(); + m_ktx2_uastc_supercompression = basist::KTX2_SS_NONE; + m_ktx2_key_values.clear(); + m_ktx2_zstd_supercompression_level.clear(); + m_ktx2_srgb_transfer_func.clear(); + m_pJob_pool = nullptr; } - + // Pointer to the global selector codebook, or nullptr to not use a global selector codebook const basist::etc1_global_selector_codebook *m_pSel_codebook; + // True to generate UASTC .basis file data, otherwise ETC1S. + bool_param<false> m_uastc; + // If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read. // Otherwise, the compressor processes the images in m_source_images. 
- std::vector<std::string> m_source_filenames; - std::vector<std::string> m_source_alpha_filenames; + basisu::vector<std::string> m_source_filenames; + basisu::vector<std::string> m_source_alpha_filenames; - std::vector<image> m_source_images; - // TODO: Allow caller to supply their own mipmaps + basisu::vector<image> m_source_images; + + // Stores mipmaps starting from level 1. Level 0 is still stored in m_source_images, as usual. + // If m_source_mipmaps isn't empty, automatic mipmap generation isn't done. m_source_mipmaps.size() MUST equal m_source_images.size() or the compressor returns an error. + // The compressor applies the user-provided swizzling (in m_swizzle) to these images. + basisu::vector< basisu::vector<image> > m_source_mipmap_images; // Filename of the output basis file - std::string m_out_filename; + std::string m_out_filename; // The params are done this way so we can detect when the user has explictly changed them. // Flip images across Y axis bool_param<false> m_y_flip; + + // If true, the compressor will print basis status to stdout during compression. + bool_param<true> m_status_output; // Output debug information during compression bool_param<false> m_debug; + bool_param<false> m_validate; // m_debug_images is pretty slow bool_param<false> m_debug_images; @@ -284,7 +367,7 @@ namespace basisu // Frontend/backend codec parameters bool_param<false> m_no_hybrid_sel_cb; - // Use perceptual sRGB colorspace metrics (for normal maps, etc.) + // Use perceptual sRGB colorspace metrics instead of linear bool_param<true> m_perceptual; // Disable selector RDO, for faster compression but larger files @@ -299,7 +382,7 @@ namespace basisu // Write the output basis file to disk using m_out_filename bool_param<false> m_write_output_basis_files; - + // Compute and display image metrics bool_param<false> m_compute_stats; @@ -311,7 +394,9 @@ namespace basisu bool_param<true> m_multithreading; // Split the R channel to RGB and the G channel to alpha, then write a basis file with alpha channels - bool_param<false> m_seperate_rg_to_color_alpha; + char m_swizzle[4]; + + bool_param<false> m_renormalize; bool_param<false> m_disable_hierarchical_endpoint_codebooks; @@ -328,10 +413,11 @@ namespace basisu bool_param<true> m_mip_premultiplied; // not currently supported bool_param<false> m_mip_renormalize; bool_param<true> m_mip_wrapping; + bool_param<true> m_mip_fast; param<int> m_mip_smallest_dimension; // Codebook size (quality) control. - // If m_quality_level != -1, it controls the quality level. It ranges from [0,255]. + // If m_quality_level != -1, it controls the quality level. It ranges from [0,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX]. // Otherwise m_max_endpoint_clusters/m_max_selector_clusters controls the codebook sizes directly. uint32_t m_max_endpoint_clusters; uint32_t m_max_selector_clusters; @@ -343,6 +429,31 @@ namespace basisu uint32_t m_userdata1; uint32_t m_us_per_frame; + // cPackUASTCLevelDefault, etc. 
+ uint32_t m_pack_uastc_flags; + bool_param<false> m_rdo_uastc; + param<float> m_rdo_uastc_quality_scalar; + param<int> m_rdo_uastc_dict_size; + param<float> m_rdo_uastc_max_smooth_block_error_scale; + param<float> m_rdo_uastc_smooth_block_max_std_dev; + param<float> m_rdo_uastc_max_allowed_rms_increase_ratio; + param<float> m_rdo_uastc_skip_block_rms_thresh; + bool_param<true> m_rdo_uastc_favor_simpler_modes_in_rdo_mode; + bool_param<true> m_rdo_uastc_multithreading; + + param<int> m_resample_width; + param<int> m_resample_height; + param<float> m_resample_factor; + const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks; + + // KTX2 specific parameters. + // Internally, the compressor always creates a .basis file then it converts that lossless to KTX2. + bool_param<false> m_create_ktx2_file; + basist::ktx2_supercompression m_ktx2_uastc_supercompression; + basist::ktx2_transcoder::key_value_vec m_ktx2_key_values; + param<int> m_ktx2_zstd_supercompression_level; + bool_param<false> m_ktx2_srgb_transfer_func; + job_pool *m_pJob_pool; }; @@ -360,35 +471,41 @@ namespace basisu cECSuccess = 0, cECFailedReadingSourceImages, cECFailedValidating, + cECFailedEncodeUASTC, cECFailedFrontEnd, cECFailedFontendExtract, cECFailedBackend, cECFailedCreateBasisFile, - cECFailedWritingOutput + cECFailedWritingOutput, + cECFailedUASTCRDOPostProcess, + cECFailedCreateKTX2File }; error_code process(); + // The output .basis file will always be valid of process() succeeded. const uint8_vec &get_output_basis_file() const { return m_output_basis_file; } - const etc_block_vec &get_output_blocks() const { return m_output_blocks; } + + // The output .ktx2 file will only be valid if m_create_ktx2_file was true and process() succeeded. + const uint8_vec& get_output_ktx2_file() const { return m_output_ktx2_file; } - const std::vector<image_stats> &get_stats() const { return m_stats; } + const basisu::vector<image_stats> &get_stats() const { return m_stats; } uint32_t get_basis_file_size() const { return m_basis_file_size; } double get_basis_bits_per_texel() const { return m_basis_bits_per_texel; } - + bool get_any_source_image_has_alpha() const { return m_any_source_image_has_alpha; } - + private: basis_compressor_params m_params; - std::vector<image> m_slice_images; + basisu::vector<image> m_slice_images; - std::vector<image_stats> m_stats; + basisu::vector<image_stats> m_stats; uint32_t m_basis_file_size; double m_basis_bits_per_texel; - + basisu_backend_slice_desc_vec m_slice_descs; uint32_t m_total_blocks; @@ -397,33 +514,41 @@ namespace basisu basisu_frontend m_frontend; pixel_block_vec m_source_blocks; - std::vector<gpu_image> m_frontend_output_textures; + basisu::vector<gpu_image> m_frontend_output_textures; - std::vector<gpu_image> m_best_etc1s_images; - std::vector<image> m_best_etc1s_images_unpacked; + basisu::vector<gpu_image> m_best_etc1s_images; + basisu::vector<image> m_best_etc1s_images_unpacked; basisu_backend m_backend; basisu_file m_basis_file; - std::vector<gpu_image> m_decoded_output_textures; - std::vector<image> m_decoded_output_textures_unpacked; - std::vector<gpu_image> m_decoded_output_textures_bc1; - std::vector<image> m_decoded_output_textures_unpacked_bc1; + basisu::vector<gpu_image> m_decoded_output_textures; + basisu::vector<image> m_decoded_output_textures_unpacked; + basisu::vector<gpu_image> m_decoded_output_textures_bc7; + basisu::vector<image> m_decoded_output_textures_unpacked_bc7; uint8_vec m_output_basis_file; - etc_block_vec m_output_blocks; + uint8_vec 
m_output_ktx2_file; + + basisu::vector<gpu_image> m_uastc_slice_textures; + basisu_backend_output m_uastc_backend_output; bool m_any_source_image_has_alpha; bool read_source_images(); + bool extract_source_blocks(); bool process_frontend(); bool extract_frontend_texture_data(); bool process_backend(); bool create_basis_file_and_transcode(); bool write_output_files_and_compute_stats(); - bool generate_mipmaps(const image &img, std::vector<image> &mips, bool has_alpha); + error_code encode_slices_to_uastc(); + bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha); bool validate_texture_type_constraints(); + bool validate_ktx2_constraints(); + void get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr); + bool create_ktx2_file(); }; } // namespace basisu diff --git a/thirdparty/basis_universal/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp index 7057c65cf8..f02fb62c11 100644 --- a/thirdparty/basis_universal/basisu_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp @@ -1,5 +1,5 @@ // basisu_enc.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,7 +17,11 @@ #include "basisu_resampler.h" #include "basisu_resampler_filters.h" #include "basisu_etc.h" -#include "transcoder/basisu_transcoder.h" +#include "../transcoder/basisu_transcoder.h" +#include "basisu_bc7enc.h" +#include "apg_bmp.h" +#include "jpgd.h" +#include <vector> #if defined(_WIN32) // For QueryPerformanceCounter/QueryPerformanceFrequency @@ -29,6 +33,9 @@ namespace basisu { uint64_t interval_timer::g_init_ticks, interval_timer::g_freq; double interval_timer::g_timer_freq; +#if BASISU_SUPPORT_SSE + bool g_cpu_supports_sse41; +#endif uint8_t g_hamming_dist[256] = { @@ -50,10 +57,117 @@ namespace basisu 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; + // This is a Public Domain 8x8 font from here: + // https://github.com/dhepper/font8x8/blob/master/font8x8_basic.h + const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8] = + { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // U+0020 ( ) + { 0x18, 0x3C, 0x3C, 0x18, 0x18, 0x00, 0x18, 0x00}, // U+0021 (!) + { 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // U+0022 (") + { 0x36, 0x36, 0x7F, 0x36, 0x7F, 0x36, 0x36, 0x00}, // U+0023 (#) + { 0x0C, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x0C, 0x00}, // U+0024 ($) + { 0x00, 0x63, 0x33, 0x18, 0x0C, 0x66, 0x63, 0x00}, // U+0025 (%) + { 0x1C, 0x36, 0x1C, 0x6E, 0x3B, 0x33, 0x6E, 0x00}, // U+0026 (&) + { 0x06, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00}, // U+0027 (') + { 0x18, 0x0C, 0x06, 0x06, 0x06, 0x0C, 0x18, 0x00}, // U+0028 (() + { 0x06, 0x0C, 0x18, 0x18, 0x18, 0x0C, 0x06, 0x00}, // U+0029 ()) + { 0x00, 0x66, 0x3C, 0xFF, 0x3C, 0x66, 0x00, 0x00}, // U+002A (*) + { 0x00, 0x0C, 0x0C, 0x3F, 0x0C, 0x0C, 0x00, 0x00}, // U+002B (+) + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x06}, // U+002C (,) + { 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00}, // U+002D (-) + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x00}, // U+002E (.) 
+ { 0x60, 0x30, 0x18, 0x0C, 0x06, 0x03, 0x01, 0x00}, // U+002F (/) + { 0x3E, 0x63, 0x73, 0x7B, 0x6F, 0x67, 0x3E, 0x00}, // U+0030 (0) + { 0x0C, 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x3F, 0x00}, // U+0031 (1) + { 0x1E, 0x33, 0x30, 0x1C, 0x06, 0x33, 0x3F, 0x00}, // U+0032 (2) + { 0x1E, 0x33, 0x30, 0x1C, 0x30, 0x33, 0x1E, 0x00}, // U+0033 (3) + { 0x38, 0x3C, 0x36, 0x33, 0x7F, 0x30, 0x78, 0x00}, // U+0034 (4) + { 0x3F, 0x03, 0x1F, 0x30, 0x30, 0x33, 0x1E, 0x00}, // U+0035 (5) + { 0x1C, 0x06, 0x03, 0x1F, 0x33, 0x33, 0x1E, 0x00}, // U+0036 (6) + { 0x3F, 0x33, 0x30, 0x18, 0x0C, 0x0C, 0x0C, 0x00}, // U+0037 (7) + { 0x1E, 0x33, 0x33, 0x1E, 0x33, 0x33, 0x1E, 0x00}, // U+0038 (8) + { 0x1E, 0x33, 0x33, 0x3E, 0x30, 0x18, 0x0E, 0x00}, // U+0039 (9) + { 0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x00}, // U+003A (:) + { 0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x06}, // U+003B (;) + { 0x18, 0x0C, 0x06, 0x03, 0x06, 0x0C, 0x18, 0x00}, // U+003C (<) + { 0x00, 0x00, 0x3F, 0x00, 0x00, 0x3F, 0x00, 0x00}, // U+003D (=) + { 0x06, 0x0C, 0x18, 0x30, 0x18, 0x0C, 0x06, 0x00}, // U+003E (>) + { 0x1E, 0x33, 0x30, 0x18, 0x0C, 0x00, 0x0C, 0x00}, // U+003F (?) + { 0x3E, 0x63, 0x7B, 0x7B, 0x7B, 0x03, 0x1E, 0x00}, // U+0040 (@) + { 0x0C, 0x1E, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x00}, // U+0041 (A) + { 0x3F, 0x66, 0x66, 0x3E, 0x66, 0x66, 0x3F, 0x00}, // U+0042 (B) + { 0x3C, 0x66, 0x03, 0x03, 0x03, 0x66, 0x3C, 0x00}, // U+0043 (C) + { 0x1F, 0x36, 0x66, 0x66, 0x66, 0x36, 0x1F, 0x00}, // U+0044 (D) + { 0x7F, 0x46, 0x16, 0x1E, 0x16, 0x46, 0x7F, 0x00}, // U+0045 (E) + { 0x7F, 0x46, 0x16, 0x1E, 0x16, 0x06, 0x0F, 0x00}, // U+0046 (F) + { 0x3C, 0x66, 0x03, 0x03, 0x73, 0x66, 0x7C, 0x00}, // U+0047 (G) + { 0x33, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x33, 0x00}, // U+0048 (H) + { 0x1E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00}, // U+0049 (I) + { 0x78, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, 0x00}, // U+004A (J) + { 0x67, 0x66, 0x36, 0x1E, 0x36, 0x66, 0x67, 0x00}, // U+004B (K) + { 0x0F, 0x06, 0x06, 0x06, 0x46, 0x66, 0x7F, 0x00}, // U+004C (L) + { 0x63, 0x77, 0x7F, 0x7F, 0x6B, 0x63, 0x63, 0x00}, // U+004D (M) + { 0x63, 0x67, 0x6F, 0x7B, 0x73, 0x63, 0x63, 0x00}, // U+004E (N) + { 0x1C, 0x36, 0x63, 0x63, 0x63, 0x36, 0x1C, 0x00}, // U+004F (O) + { 0x3F, 0x66, 0x66, 0x3E, 0x06, 0x06, 0x0F, 0x00}, // U+0050 (P) + { 0x1E, 0x33, 0x33, 0x33, 0x3B, 0x1E, 0x38, 0x00}, // U+0051 (Q) + { 0x3F, 0x66, 0x66, 0x3E, 0x36, 0x66, 0x67, 0x00}, // U+0052 (R) + { 0x1E, 0x33, 0x07, 0x0E, 0x38, 0x33, 0x1E, 0x00}, // U+0053 (S) + { 0x3F, 0x2D, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00}, // U+0054 (T) + { 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x3F, 0x00}, // U+0055 (U) + { 0x33, 0x33, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00}, // U+0056 (V) + { 0x63, 0x63, 0x63, 0x6B, 0x7F, 0x77, 0x63, 0x00}, // U+0057 (W) + { 0x63, 0x63, 0x36, 0x1C, 0x1C, 0x36, 0x63, 0x00}, // U+0058 (X) + { 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x0C, 0x1E, 0x00}, // U+0059 (Y) + { 0x7F, 0x63, 0x31, 0x18, 0x4C, 0x66, 0x7F, 0x00}, // U+005A (Z) + { 0x1E, 0x06, 0x06, 0x06, 0x06, 0x06, 0x1E, 0x00}, // U+005B ([) + { 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0x40, 0x00}, // U+005C (\) + { 0x1E, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1E, 0x00}, // U+005D (]) + { 0x08, 0x1C, 0x36, 0x63, 0x00, 0x00, 0x00, 0x00}, // U+005E (^) + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF}, // U+005F (_) + { 0x0C, 0x0C, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00}, // U+0060 (`) + { 0x00, 0x00, 0x1E, 0x30, 0x3E, 0x33, 0x6E, 0x00}, // U+0061 (a) + { 0x07, 0x06, 0x06, 0x3E, 0x66, 0x66, 0x3B, 0x00}, // U+0062 (b) + { 0x00, 0x00, 0x1E, 0x33, 0x03, 0x33, 0x1E, 0x00}, // U+0063 (c) + { 
0x38, 0x30, 0x30, 0x3e, 0x33, 0x33, 0x6E, 0x00}, // U+0064 (d) + { 0x00, 0x00, 0x1E, 0x33, 0x3f, 0x03, 0x1E, 0x00}, // U+0065 (e) + { 0x1C, 0x36, 0x06, 0x0f, 0x06, 0x06, 0x0F, 0x00}, // U+0066 (f) + { 0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x1F}, // U+0067 (g) + { 0x07, 0x06, 0x36, 0x6E, 0x66, 0x66, 0x67, 0x00}, // U+0068 (h) + { 0x0C, 0x00, 0x0E, 0x0C, 0x0C, 0x0C, 0x1E, 0x00}, // U+0069 (i) + { 0x30, 0x00, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E}, // U+006A (j) + { 0x07, 0x06, 0x66, 0x36, 0x1E, 0x36, 0x67, 0x00}, // U+006B (k) + { 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00}, // U+006C (l) + { 0x00, 0x00, 0x33, 0x7F, 0x7F, 0x6B, 0x63, 0x00}, // U+006D (m) + { 0x00, 0x00, 0x1F, 0x33, 0x33, 0x33, 0x33, 0x00}, // U+006E (n) + { 0x00, 0x00, 0x1E, 0x33, 0x33, 0x33, 0x1E, 0x00}, // U+006F (o) + { 0x00, 0x00, 0x3B, 0x66, 0x66, 0x3E, 0x06, 0x0F}, // U+0070 (p) + { 0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x78}, // U+0071 (q) + { 0x00, 0x00, 0x3B, 0x6E, 0x66, 0x06, 0x0F, 0x00}, // U+0072 (r) + { 0x00, 0x00, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x00}, // U+0073 (s) + { 0x08, 0x0C, 0x3E, 0x0C, 0x0C, 0x2C, 0x18, 0x00}, // U+0074 (t) + { 0x00, 0x00, 0x33, 0x33, 0x33, 0x33, 0x6E, 0x00}, // U+0075 (u) + { 0x00, 0x00, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00}, // U+0076 (v) + { 0x00, 0x00, 0x63, 0x6B, 0x7F, 0x7F, 0x36, 0x00}, // U+0077 (w) + { 0x00, 0x00, 0x63, 0x36, 0x1C, 0x36, 0x63, 0x00}, // U+0078 (x) + { 0x00, 0x00, 0x33, 0x33, 0x33, 0x3E, 0x30, 0x1F}, // U+0079 (y) + { 0x00, 0x00, 0x3F, 0x19, 0x0C, 0x26, 0x3F, 0x00}, // U+007A (z) + { 0x38, 0x0C, 0x0C, 0x07, 0x0C, 0x0C, 0x38, 0x00}, // U+007B ({) + { 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00}, // U+007C (|) + { 0x07, 0x0C, 0x0C, 0x38, 0x0C, 0x0C, 0x07, 0x00}, // U+007D (}) + { 0x6E, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // U+007E (~) + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} // U+007F + }; + // Encoder library initialization (just call once at startup) void basisu_encoder_init() { + detect_sse41(); + basist::basisu_transcoder_init(); + pack_etc1_solid_color_init(); + //uastc_init(); + bc7enc_compress_block_init(); // must be after uastc_init() } void error_printf(const char *pFmt, ...) 
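`basisu_encoder_init()` is now a hard prerequisite for the rest of the encoder (SSE4.1 detection, transcoder tables, ETC1 solid-color tables, BC7 block packer). Combined with the new `basis_compressor_params` fields earlier in this diff (`m_uastc`, `m_create_ktx2_file`, `m_pJob_pool`), a minimal end-to-end call looks roughly like the sketch below; the include path and `basis_compressor::init()` are assumptions taken from the upstream encoder API rather than lines visible in this diff:

```cpp
// Minimal sketch of driving the updated encoder; error handling trimmed.
#include "encoder/basisu_comp.h" // assumed include path for the vendored encoder

using namespace basisu;

bool compress_png_to_ktx2(const char* pSrc_filename, uint8_vec& ktx2_data)
{
    basisu_encoder_init(); // must be called once before any encoding

    job_pool jpool(4); // worker threads used by the compressor

    basis_compressor_params params;
    params.m_read_source_images = true;              // read from disk instead of m_source_images
    params.m_source_filenames.push_back(pSrc_filename);
    params.m_uastc = true;                           // UASTC instead of ETC1S
    params.m_create_ktx2_file = true;                // also build the in-memory .ktx2 output
    params.m_pJob_pool = &jpool;

    basis_compressor comp;
    if (!comp.init(params))                          // init() is from the upstream API (not shown in this diff)
        return false;

    if (comp.process() != basis_compressor::cECSuccess)
        return false;

    ktx2_data = comp.get_output_ktx2_file();         // valid because m_create_ktx2_file was set
    return true;
}
```

For ETC1S output instead, leave `m_uastc` at its default and set `m_quality_level` in [0, 255]; `process()` then runs the ETC1S frontend/backend path rather than `encode_slices_to_uastc()`.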
@@ -108,7 +222,7 @@ namespace basisu #else #error TODO #endif - + interval_timer::interval_timer() : m_start_time(0), m_stop_time(0), m_started(false), m_stopped(false) { if (!g_timer_freq) @@ -169,38 +283,142 @@ namespace basisu return ticks * g_timer_freq; } - bool load_png(const char* pFilename, image& img) + const uint32_t MAX_32BIT_ALLOC_SIZE = 250000000; + + bool load_bmp(const char* pFilename, image& img) { - std::vector<uint8_t> buffer; - unsigned err = lodepng::load_file(buffer, std::string(pFilename)); - if (err) + int w = 0, h = 0; + unsigned int n_chans = 0; + unsigned char* pImage_data = apg_bmp_read(pFilename, &w, &h, &n_chans); + + if ((!pImage_data) || (!w) || (!h) || ((n_chans != 3) && (n_chans != 4))) + { + error_printf("Failed loading .BMP image \"%s\"!\n", pFilename); + + if (pImage_data) + apg_bmp_free(pImage_data); + return false; + } - unsigned w = 0, h = 0; + if (sizeof(void *) == sizeof(uint32_t)) + { + if ((w * h * n_chans) > MAX_32BIT_ALLOC_SIZE) + { + error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h); + + if (pImage_data) + apg_bmp_free(pImage_data); + + return false; + } + } + + img.resize(w, h); + + const uint8_t *pSrc = pImage_data; + for (int y = 0; y < h; y++) + { + color_rgba *pDst = &img(0, y); + + for (int x = 0; x < w; x++) + { + pDst->r = pSrc[0]; + pDst->g = pSrc[1]; + pDst->b = pSrc[2]; + pDst->a = (n_chans == 3) ? 255 : pSrc[3]; + + pSrc += n_chans; + ++pDst; + } + } + + apg_bmp_free(pImage_data); + + return true; + } + + bool load_tga(const char* pFilename, image& img) + { + int w = 0, h = 0, n_chans = 0; + uint8_t* pImage_data = read_tga(pFilename, w, h, n_chans); + if ((!pImage_data) || (!w) || (!h) || ((n_chans != 3) && (n_chans != 4))) + { + error_printf("Failed loading .TGA image \"%s\"!\n", pFilename); + + if (pImage_data) + free(pImage_data); + + return false; + } + if (sizeof(void *) == sizeof(uint32_t)) { + if ((w * h * n_chans) > MAX_32BIT_ALLOC_SIZE) + { + error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h); + + if (pImage_data) + free(pImage_data); + + return false; + } + } + + img.resize(w, h); + + const uint8_t *pSrc = pImage_data; + for (int y = 0; y < h; y++) + { + color_rgba *pDst = &img(0, y); + + for (int x = 0; x < w; x++) + { + pDst->r = pSrc[0]; + pDst->g = pSrc[1]; + pDst->b = pSrc[2]; + pDst->a = (n_chans == 3) ? 255 : pSrc[3]; + + pSrc += n_chans; + ++pDst; + } + } + + free(pImage_data); + + return true; + } + + bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename) + { + if (!buf_size) + return false; + + unsigned err = 0, w = 0, h = 0; + + if (sizeof(void*) == sizeof(uint32_t)) + { // Inspect the image first on 32-bit builds, to see if the image would require too much memory. lodepng::State state; - err = lodepng_inspect(&w, &h, &state, &buffer[0], buffer.size()); + err = lodepng_inspect(&w, &h, &state, pBuf, buf_size); if ((err != 0) || (!w) || (!h)) return false; const uint32_t exepected_alloc_size = w * h * sizeof(uint32_t); - + // If the file is too large on 32-bit builds then just bail now, to prevent causing a memory exception. - const uint32_t MAX_ALLOC_SIZE = 250000000; - if (exepected_alloc_size >= MAX_ALLOC_SIZE) + if (exepected_alloc_size >= MAX_32BIT_ALLOC_SIZE) { - error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h); + error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", (pFilename != nullptr) ? 
pFilename : "<memory>", w, h); return false; } - + w = h = 0; } - + std::vector<uint8_t> out; - err = lodepng::decode(out, w, h, &buffer[0], buffer.size()); + err = lodepng::decode(out, w, h, pBuf, buf_size); if ((err != 0) || (!w) || (!h)) return false; @@ -213,12 +431,62 @@ namespace basisu return true; } + + bool load_png(const char* pFilename, image& img) + { + std::vector<uint8_t> buffer; + unsigned err = lodepng::load_file(buffer, std::string(pFilename)); + if (err) + return false; + + + return load_png(buffer.data(), buffer.size(), img, pFilename); + } + + bool load_jpg(const char *pFilename, image& img) + { + int width = 0, height = 0, actual_comps = 0; + uint8_t *pImage_data = jpgd::decompress_jpeg_image_from_file(pFilename, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagLinearChromaFiltering); + if (!pImage_data) + return false; + + img.init(pImage_data, width, height, 4); + + free(pImage_data); + + return true; + } + + bool load_image(const char* pFilename, image& img) + { + std::string ext(string_get_extension(std::string(pFilename))); + + if (ext.length() == 0) + return false; + + const char *pExt = ext.c_str(); + + if (strcasecmp(pExt, "png") == 0) + return load_png(pFilename, img); + if (strcasecmp(pExt, "bmp") == 0) + return load_bmp(pFilename, img); + if (strcasecmp(pExt, "tga") == 0) + return load_tga(pFilename, img); + if ( (strcasecmp(pExt, "jpg") == 0) || (strcasecmp(pExt, "jfif") == 0) || (strcasecmp(pExt, "jpeg") == 0) ) + return load_jpg(pFilename, img); + + return false; + } - bool save_png(const char* pFilename, const image & img, uint32_t image_save_flags, uint32_t grayscale_comp) + bool save_png(const char* pFilename, const image &img, uint32_t image_save_flags, uint32_t grayscale_comp) { if (!img.get_total_pixels()) return false; + const uint32_t MAX_PNG_IMAGE_DIM = 32768; + if ((img.get_width() > MAX_PNG_IMAGE_DIM) || (img.get_height() > MAX_PNG_IMAGE_DIM)) + return false; + std::vector<uint8_t> out; unsigned err = 0; @@ -231,16 +499,19 @@ namespace basisu for (uint32_t x = 0; x < img.get_width(); x++) *pDst++ = img(x, y)[grayscale_comp]; - err = lodepng::encode(out, (const uint8_t*)& g_pixels[0], img.get_width(), img.get_height(), LCT_GREY, 8); + err = lodepng::encode(out, (const uint8_t*)&g_pixels[0], img.get_width(), img.get_height(), LCT_GREY, 8); } else { bool has_alpha = img.has_alpha(); if ((!has_alpha) || ((image_save_flags & cImageSaveIgnoreAlpha) != 0)) { - uint8_vec rgb_pixels(img.get_width() * 3 * img.get_height()); + const uint64_t total_bytes = (uint64_t)img.get_width() * 3U * (uint64_t)img.get_height(); + if (total_bytes > INT_MAX) + return false; + uint8_vec rgb_pixels(static_cast<size_t>(total_bytes)); uint8_t *pDst = &rgb_pixels[0]; - + for (uint32_t y = 0; y < img.get_height(); y++) { for (uint32_t x = 0; x < img.get_width(); x++) @@ -302,7 +573,11 @@ namespace basisu } } - data.resize((size_t)filesize); + if (!data.try_resize((size_t)filesize)) + { + fclose(pFile); + return false; + } if (filesize) { @@ -525,7 +800,7 @@ namespace basisu if ((s >= num_syms) || (A[r].m_key < A[s].m_key)) { A[next].m_key = A[r].m_key; - A[r].m_key = static_cast<uint16_t>(next); + A[r].m_key = next; ++r; } else @@ -536,13 +811,13 @@ namespace basisu if ((s >= num_syms) || ((r < next) && A[r].m_key < A[s].m_key)) { - A[next].m_key = static_cast<uint16_t>(A[next].m_key + A[r].m_key); - A[r].m_key = static_cast<uint16_t>(next); + A[next].m_key = A[next].m_key + A[r].m_key; + A[r].m_key = next; ++r; } else { - A[next].m_key = 
static_cast<uint16_t>(A[next].m_key + A[s].m_key); + A[next].m_key = A[next].m_key + A[s].m_key; ++s; } } @@ -562,7 +837,7 @@ namespace basisu ; for ( ; num_avail > num_used; --next, --num_avail) - A[next].m_key = static_cast<uint16_t>(depth); + A[next].m_key = depth; num_avail = 2 * num_used; num_used = 0; @@ -610,6 +885,10 @@ namespace basisu for (i = 0; i < num_syms; i++) { uint32_t freq = pSyms0[i].m_key; + + // We scale all input frequencies to 16-bits. + assert(freq <= UINT16_MAX); + hist[freq & 0xFF]++; hist[256 + ((freq >> 8) & 0xFF)]++; } @@ -731,8 +1010,13 @@ namespace basisu else { for (uint32_t i = 0; i < num_syms; i++) + { if (pSym_freq[i]) - sym_freq[i] = static_cast<uint16_t>(maximum<uint32_t>((pSym_freq[i] * 65534U + (max_freq >> 1)) / max_freq, 1)); + { + uint32_t f = static_cast<uint32_t>((static_cast<uint64_t>(pSym_freq[i]) * 65534U + (max_freq >> 1)) / max_freq); + sym_freq[i] = static_cast<uint16_t>(clamp<uint32_t>(f, 1, 65534)); + } + } } return init(num_syms, &sym_freq[0], max_code_size); @@ -1125,10 +1409,10 @@ namespace basisu void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma) { assert((first_chan < 4U) && (first_chan + total_chans <= 4U)); - - const uint32_t width = std::min(a.get_width(), b.get_width()); - const uint32_t height = std::min(a.get_height(), b.get_height()); - + + const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); + const uint32_t height = basisu::minimum(a.get_height(), b.get_height()); + double hist[256]; clear_obj(hist); @@ -1159,7 +1443,7 @@ namespace basisu { if (hist[i]) { - m_max = std::max<float>(m_max, (float)i); + m_max = basisu::maximum<float>(m_max, (float)i); double v = i * hist[i]; sum += v; sum2 += i * v; @@ -1171,9 +1455,9 @@ namespace basisu total_values *= (double)clamp<uint32_t>(total_chans, 1, 4); m_mean = (float)clamp<double>(sum / total_values, 0.0f, 255.0); - m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0 * 255.0); + m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0f * 255.0f); m_rms = (float)sqrt(m_mean_squared); - m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0, 0.0f, 300.0f) : 1e+10f; + m_psnr = m_rms ? 
(float)clamp<double>(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f; } void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed) @@ -1253,8 +1537,8 @@ namespace basisu } job_pool::job_pool(uint32_t num_threads) : - m_kill_flag(false), - m_num_active_jobs(0) + m_num_active_jobs(0), + m_kill_flag(false) { assert(num_threads >= 1U); @@ -1302,13 +1586,15 @@ namespace basisu std::unique_lock<std::mutex> lock(m_mutex); m_queue.emplace_back(std::move(job)); - + const size_t queue_size = m_queue.size(); lock.unlock(); if (queue_size > 1) + { m_has_work.notify_one(); + } } void job_pool::wait_for_all() @@ -1373,4 +1659,481 @@ namespace basisu debug_printf("job_pool::job_thread: exiting\n"); } + // .TGA image loading + #pragma pack(push) + #pragma pack(1) + struct tga_header + { + uint8_t m_id_len; + uint8_t m_cmap; + uint8_t m_type; + packed_uint<2> m_cmap_first; + packed_uint<2> m_cmap_len; + uint8_t m_cmap_bpp; + packed_uint<2> m_x_org; + packed_uint<2> m_y_org; + packed_uint<2> m_width; + packed_uint<2> m_height; + uint8_t m_depth; + uint8_t m_desc; + }; + #pragma pack(pop) + + const uint32_t MAX_TGA_IMAGE_SIZE = 16384; + + enum tga_image_type + { + cITPalettized = 1, + cITRGB = 2, + cITGrayscale = 3 + }; + + uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans) + { + width = 0; + height = 0; + n_chans = 0; + + if (buf_size <= sizeof(tga_header)) + return nullptr; + + const tga_header &hdr = *reinterpret_cast<const tga_header *>(pBuf); + + if ((!hdr.m_width) || (!hdr.m_height) || (hdr.m_width > MAX_TGA_IMAGE_SIZE) || (hdr.m_height > MAX_TGA_IMAGE_SIZE)) + return nullptr; + + if (hdr.m_desc >> 6) + return nullptr; + + // Simple validation + if ((hdr.m_cmap != 0) && (hdr.m_cmap != 1)) + return nullptr; + + if (hdr.m_cmap) + { + if ((hdr.m_cmap_bpp == 0) || (hdr.m_cmap_bpp > 32)) + return nullptr; + + // Nobody implements CMapFirst correctly, so we're not supporting it. Never seen it used, either. + if (hdr.m_cmap_first != 0) + return nullptr; + } + + const bool x_flipped = (hdr.m_desc & 0x10) != 0; + const bool y_flipped = (hdr.m_desc & 0x20) == 0; + + bool rle_flag = false; + int file_image_type = hdr.m_type; + if (file_image_type > 8) + { + file_image_type -= 8; + rle_flag = true; + } + + const tga_image_type image_type = static_cast<tga_image_type>(file_image_type); + + switch (file_image_type) + { + case cITRGB: + if (hdr.m_depth == 8) + return nullptr; + break; + case cITPalettized: + if ((hdr.m_depth != 8) || (hdr.m_cmap != 1) || (hdr.m_cmap_len == 0)) + return nullptr; + break; + case cITGrayscale: + if ((hdr.m_cmap != 0) || (hdr.m_cmap_len != 0)) + return nullptr; + if ((hdr.m_depth != 8) && (hdr.m_depth != 16)) + return nullptr; + break; + default: + return nullptr; + } + + uint32_t tga_bytes_per_pixel = 0; + + switch (hdr.m_depth) + { + case 32: + tga_bytes_per_pixel = 4; + n_chans = 4; + break; + case 24: + tga_bytes_per_pixel = 3; + n_chans = 3; + break; + case 16: + case 15: + tga_bytes_per_pixel = 2; + // For compatibility with stb_image_write.h + n_chans = ((file_image_type == cITGrayscale) && (hdr.m_depth == 16)) ? 4 : 3; + break; + case 8: + tga_bytes_per_pixel = 1; + // For palettized RGBA support, which both FreeImage and stb_image support. + n_chans = ((file_image_type == cITPalettized) && (hdr.m_cmap_bpp == 32)) ? 
4 : 3; + break; + default: + return nullptr; + } + + const uint32_t bytes_per_line = hdr.m_width * tga_bytes_per_pixel; + + const uint8_t *pSrc = pBuf + sizeof(tga_header); + uint32_t bytes_remaining = buf_size - sizeof(tga_header); + + if (hdr.m_id_len) + { + if (bytes_remaining < hdr.m_id_len) + return nullptr; + pSrc += hdr.m_id_len; + bytes_remaining += hdr.m_id_len; + } + + color_rgba pal[256]; + for (uint32_t i = 0; i < 256; i++) + pal[i].set(0, 0, 0, 255); + + if ((hdr.m_cmap) && (hdr.m_cmap_len)) + { + if (image_type == cITPalettized) + { + // Note I cannot find any files using 32bpp palettes in the wild (never seen any in ~30 years). + if ( ((hdr.m_cmap_bpp != 32) && (hdr.m_cmap_bpp != 24) && (hdr.m_cmap_bpp != 15) && (hdr.m_cmap_bpp != 16)) || (hdr.m_cmap_len > 256) ) + return nullptr; + + if (hdr.m_cmap_bpp == 32) + { + const uint32_t pal_size = hdr.m_cmap_len * 4; + if (bytes_remaining < pal_size) + return nullptr; + + for (uint32_t i = 0; i < hdr.m_cmap_len; i++) + { + pal[i].r = pSrc[i * 4 + 2]; + pal[i].g = pSrc[i * 4 + 1]; + pal[i].b = pSrc[i * 4 + 0]; + pal[i].a = pSrc[i * 4 + 3]; + } + + bytes_remaining -= pal_size; + pSrc += pal_size; + } + else if (hdr.m_cmap_bpp == 24) + { + const uint32_t pal_size = hdr.m_cmap_len * 3; + if (bytes_remaining < pal_size) + return nullptr; + + for (uint32_t i = 0; i < hdr.m_cmap_len; i++) + { + pal[i].r = pSrc[i * 3 + 2]; + pal[i].g = pSrc[i * 3 + 1]; + pal[i].b = pSrc[i * 3 + 0]; + pal[i].a = 255; + } + + bytes_remaining -= pal_size; + pSrc += pal_size; + } + else + { + const uint32_t pal_size = hdr.m_cmap_len * 2; + if (bytes_remaining < pal_size) + return nullptr; + + for (uint32_t i = 0; i < hdr.m_cmap_len; i++) + { + const uint32_t v = pSrc[i * 2 + 0] | (pSrc[i * 2 + 1] << 8); + + pal[i].r = (((v >> 10) & 31) * 255 + 15) / 31; + pal[i].g = (((v >> 5) & 31) * 255 + 15) / 31; + pal[i].b = ((v & 31) * 255 + 15) / 31; + pal[i].a = 255; + } + + bytes_remaining -= pal_size; + pSrc += pal_size; + } + } + else + { + const uint32_t bytes_to_skip = (hdr.m_cmap_bpp >> 3) * hdr.m_cmap_len; + if (bytes_remaining < bytes_to_skip) + return nullptr; + pSrc += bytes_to_skip; + bytes_remaining += bytes_to_skip; + } + } + + width = hdr.m_width; + height = hdr.m_height; + + const uint32_t source_pitch = width * tga_bytes_per_pixel; + const uint32_t dest_pitch = width * n_chans; + + uint8_t *pImage = (uint8_t *)malloc(dest_pitch * height); + if (!pImage) + return nullptr; + + std::vector<uint8_t> input_line_buf; + if (rle_flag) + input_line_buf.resize(source_pitch); + + int run_type = 0, run_remaining = 0; + uint8_t run_pixel[4]; + memset(run_pixel, 0, sizeof(run_pixel)); + + for (int y = 0; y < height; y++) + { + const uint8_t *pLine_data; + + if (rle_flag) + { + int pixels_remaining = width; + uint8_t *pDst = &input_line_buf[0]; + + do + { + if (!run_remaining) + { + if (bytes_remaining < 1) + { + free(pImage); + return nullptr; + } + + int v = *pSrc++; + bytes_remaining--; + + run_type = v & 0x80; + run_remaining = (v & 0x7F) + 1; + + if (run_type) + { + if (bytes_remaining < tga_bytes_per_pixel) + { + free(pImage); + return nullptr; + } + + memcpy(run_pixel, pSrc, tga_bytes_per_pixel); + pSrc += tga_bytes_per_pixel; + bytes_remaining -= tga_bytes_per_pixel; + } + } + + const uint32_t n = basisu::minimum<uint32_t>(pixels_remaining, run_remaining); + pixels_remaining -= n; + run_remaining -= n; + + if (run_type) + { + for (uint32_t i = 0; i < n; i++) + for (uint32_t j = 0; j < tga_bytes_per_pixel; j++) + *pDst++ = run_pixel[j]; + } + else + { + const 
uint32_t bytes_wanted = n * tga_bytes_per_pixel; + + if (bytes_remaining < bytes_wanted) + { + free(pImage); + return nullptr; + } + + memcpy(pDst, pSrc, bytes_wanted); + pDst += bytes_wanted; + + pSrc += bytes_wanted; + bytes_remaining -= bytes_wanted; + } + + } while (pixels_remaining); + + assert((pDst - &input_line_buf[0]) == width * tga_bytes_per_pixel); + + pLine_data = &input_line_buf[0]; + } + else + { + if (bytes_remaining < source_pitch) + { + free(pImage); + return nullptr; + } + + pLine_data = pSrc; + bytes_remaining -= source_pitch; + pSrc += source_pitch; + } + + // Convert to 24bpp RGB or 32bpp RGBA. + uint8_t *pDst = pImage + (y_flipped ? (height - 1 - y) : y) * dest_pitch + (x_flipped ? (width - 1) * n_chans : 0); + const int dst_stride = x_flipped ? -((int)n_chans) : n_chans; + + switch (hdr.m_depth) + { + case 32: + assert(tga_bytes_per_pixel == 4 && n_chans == 4); + for (int i = 0; i < width; i++, pLine_data += 4, pDst += dst_stride) + { + pDst[0] = pLine_data[2]; + pDst[1] = pLine_data[1]; + pDst[2] = pLine_data[0]; + pDst[3] = pLine_data[3]; + } + break; + case 24: + assert(tga_bytes_per_pixel == 3 && n_chans == 3); + for (int i = 0; i < width; i++, pLine_data += 3, pDst += dst_stride) + { + pDst[0] = pLine_data[2]; + pDst[1] = pLine_data[1]; + pDst[2] = pLine_data[0]; + } + break; + case 16: + case 15: + if (image_type == cITRGB) + { + assert(tga_bytes_per_pixel == 2 && n_chans == 3); + for (int i = 0; i < width; i++, pLine_data += 2, pDst += dst_stride) + { + const uint32_t v = pLine_data[0] | (pLine_data[1] << 8); + pDst[0] = (((v >> 10) & 31) * 255 + 15) / 31; + pDst[1] = (((v >> 5) & 31) * 255 + 15) / 31; + pDst[2] = ((v & 31) * 255 + 15) / 31; + } + } + else + { + assert(image_type == cITGrayscale && tga_bytes_per_pixel == 2 && n_chans == 4); + for (int i = 0; i < width; i++, pLine_data += 2, pDst += dst_stride) + { + pDst[0] = pLine_data[0]; + pDst[1] = pLine_data[0]; + pDst[2] = pLine_data[0]; + pDst[3] = pLine_data[1]; + } + } + break; + case 8: + assert(tga_bytes_per_pixel == 1); + if (image_type == cITPalettized) + { + if (hdr.m_cmap_bpp == 32) + { + assert(n_chans == 4); + for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride) + { + const uint32_t c = *pLine_data; + pDst[0] = pal[c].r; + pDst[1] = pal[c].g; + pDst[2] = pal[c].b; + pDst[3] = pal[c].a; + } + } + else + { + assert(n_chans == 3); + for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride) + { + const uint32_t c = *pLine_data; + pDst[0] = pal[c].r; + pDst[1] = pal[c].g; + pDst[2] = pal[c].b; + } + } + } + else + { + assert(n_chans == 3); + for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride) + { + const uint8_t c = *pLine_data; + pDst[0] = c; + pDst[1] = c; + pDst[2] = c; + } + } + break; + default: + assert(0); + break; + } + } // y + + return pImage; + } + + uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans) + { + width = height = n_chans = 0; + + uint8_vec filedata; + if (!read_file_to_vec(pFilename, filedata)) + return nullptr; + + if (!filedata.size() || (filedata.size() > UINT32_MAX)) + return nullptr; + + return read_tga(&filedata[0], (uint32_t)filedata.size(), width, height, n_chans); + } + + void image::debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t scale_x, uint32_t scale_y, const color_rgba& fg, const color_rgba* pBG, bool alpha_only, const char* pFmt, ...) 
+ { + char buf[2048]; + + va_list args; + va_start(args, pFmt); +#ifdef _WIN32 + vsprintf_s(buf, sizeof(buf), pFmt, args); +#else + vsnprintf(buf, sizeof(buf), pFmt, args); +#endif + va_end(args); + + const char* p = buf; + + const uint32_t orig_x_ofs = x_ofs; + + while (*p) + { + uint8_t c = *p++; + if ((c < 32) || (c > 127)) + c = '.'; + + const uint8_t* pGlpyh = &g_debug_font8x8_basic[c - 32][0]; + + for (uint32_t y = 0; y < 8; y++) + { + uint32_t row_bits = pGlpyh[y]; + for (uint32_t x = 0; x < 8; x++) + { + const uint32_t q = row_bits & (1 << x); + + const color_rgba* pColor = q ? &fg : pBG; + if (!pColor) + continue; + + if (alpha_only) + fill_box_alpha(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor); + else + fill_box(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor); + } + } + + x_ofs += 8 * scale_x; + if ((x_ofs + 8 * scale_x) > m_width) + { + x_ofs = orig_x_ofs; + y_ofs += 8 * scale_y; + } + } + } + } // namespace basisu diff --git a/thirdparty/basis_universal/basisu_enc.h b/thirdparty/basis_universal/encoder/basisu_enc.h index 0a0c3c6fc0..05c95cbc3b 100644 --- a/thirdparty/basis_universal/basisu_enc.h +++ b/thirdparty/basis_universal/encoder/basisu_enc.h @@ -1,5 +1,5 @@ // basisu_enc.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include "transcoder/basisu.h" -#include "transcoder/basisu_transcoder_internal.h" +#include "../transcoder/basisu.h" +#include "../transcoder/basisu_transcoder_internal.h" #include <mutex> #include <atomic> @@ -28,13 +28,29 @@ #include <libgen.h> #endif +// This module is really just a huge grab bag of classes and helper functions needed by the encoder. + +// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time. +#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0) + namespace basisu { extern uint8_t g_hamming_dist[256]; + extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8]; - // Encoder library initialization + // Encoder library initialization. + // This function MUST be called before encoding anything! void basisu_encoder_init(); + // basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1 + extern void detect_sse41(); + +#if BASISU_SUPPORT_SSE + extern bool g_cpu_supports_sse41; +#else + const bool g_cpu_supports_sse41 = false; +#endif + void error_printf(const char *pFmt, ...); // Helpers @@ -43,7 +59,68 @@ namespace basisu { return (uint8_t)((i & 0xFFFFFF00U) ? 
(~(i >> 31)) : i); } - + + inline int32_t clampi(int32_t value, int32_t low, int32_t high) + { + if (value < low) + value = low; + else if (value > high) + value = high; + return value; + } + + inline uint8_t mul_8(uint32_t v, uint32_t a) + { + v = v * a + 128; + return (uint8_t)((v + (v >> 8)) >> 8); + } + + inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 64); + uint64_t bits = 0; + uint32_t total_bits = 0; + + while (total_bits < codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); + + uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; + byte_bits &= ((1 << bits_to_read) - 1); + + bits |= ((uint64_t)(byte_bits) << total_bits); + + total_bits += bits_to_read; + bit_offset += bits_to_read; + } + + return bits; + } + + inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 32); + uint32_t bits = 0; + uint32_t total_bits = 0; + + while (total_bits < codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); + + uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; + byte_bits &= ((1 << bits_to_read) - 1); + + bits |= (byte_bits << total_bits); + + total_bits += bits_to_read; + bit_offset += bits_to_read; + } + + return bits; + } + // Hashing inline uint32_t bitmix32c(uint32_t v) @@ -69,6 +146,16 @@ namespace basisu return v; } + inline uint32_t wang_hash(uint32_t seed) + { + seed = (seed ^ 61) ^ (seed >> 16); + seed *= 9; + seed = seed ^ (seed >> 4); + seed *= 0x27d4eb2d; + seed = seed ^ (seed >> 15); + return seed; + } + uint32_t hash_hsieh(const uint8_t* pBuf, size_t len); template <typename Key> @@ -80,6 +167,72 @@ namespace basisu } }; + class running_stat + { + public: + running_stat() : + m_n(0), + m_old_m(0), m_new_m(0), m_old_s(0), m_new_s(0) + { + } + void clear() + { + m_n = 0; + } + void push(double x) + { + m_n++; + if (m_n == 1) + { + m_old_m = m_new_m = x; + m_old_s = 0.0; + m_min = x; + m_max = x; + } + else + { + m_new_m = m_old_m + (x - m_old_m) / m_n; + m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m); + m_old_m = m_new_m; + m_old_s = m_new_s; + m_min = basisu::minimum(x, m_min); + m_max = basisu::maximum(x, m_max); + } + } + uint32_t get_num() const + { + return m_n; + } + double get_mean() const + { + return (m_n > 0) ? m_new_m : 0.0; + } + + double get_variance() const + { + return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0); + } + + double get_std_dev() const + { + return sqrt(get_variance()); + } + + double get_min() const + { + return m_min; + } + + double get_max() const + { + return m_max; + } + + private: + uint32_t m_n; + double m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max; + }; + // Linear algebra template <uint32_t N, typename T> @@ -118,7 +271,7 @@ namespace basisu inline vec &set(const vec<OtherN, OtherT> &other) { uint32_t i; - if (static_cast<void *>(&other) == static_cast<void *>(this)) + if ((const void *)(&other) == (const void *)(this)) return *this; const uint32_t m = minimum(OtherN, N); for (i = 0; i < m; i++) @@ -358,6 +511,7 @@ namespace basisu BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool); public: + // num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc. 
job_pool(uint32_t num_threads); ~job_pool(); @@ -370,7 +524,7 @@ namespace basisu private: std::vector<std::thread> m_threads; - std::vector<std::function<void()> > m_queue; + std::vector<std::function<void()> > m_queue; std::mutex m_mutex; std::condition_variable m_has_work; @@ -420,7 +574,7 @@ namespace basisu return *this; } }; - + class color_rgba { public: @@ -440,6 +594,25 @@ namespace basisu inline color_rgba() { static_assert(sizeof(*this) == 4, "sizeof(*this) != 4"); + static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)"); + } + + // Not too hot about this idea. + inline color_rgba(const basist::color32& other) : + r(other.r), + g(other.g), + b(other.b), + a(other.a) + { + } + + color_rgba& operator= (const basist::color32& rhs) + { + r = rhs.r; + g = rhs.g; + b = rhs.b; + a = rhs.a; + return *this; } inline color_rgba(int y) @@ -563,11 +736,20 @@ namespace basisu inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; } inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; } inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); } + + inline basist::color32 get_color32() const + { + return basist::color32(r, g, b, a); + } + + static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); } + static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); } }; - typedef std::vector<color_rgba> color_rgba_vec; + typedef basisu::vector<color_rgba> color_rgba_vec; const color_rgba g_black_color(0, 0, 0, 255); + const color_rgba g_black_trans_color(0, 0, 0, 0); const color_rgba g_white_color(255, 255, 255, 255); inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1) @@ -595,6 +777,7 @@ namespace basisu { if (perceptual) { +#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f; const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f; @@ -617,11 +800,61 @@ namespace basisu } return d; +#elif 1 + int dr = e1.r - e2.r; + int dg = e1.g - e2.g; + int db = e1.b - e2.b; + + int delta_l = dr * 27 + dg * 92 + db * 9; + int delta_cr = dr * 128 - delta_l; + int delta_cb = db * 128 - delta_l; + + uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) + + ((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) + + ((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U); + + if (alpha) + { + int da = (e1.a - e2.a) << 7; + id += ((uint32_t)(da * da) >> 7U); + } + + return id; +#else + int dr = e1.r - e2.r; + int dg = e1.g - e2.g; + int db = e1.b - e2.b; + + int64_t delta_l = dr * 27 + dg * 92 + db * 9; + int64_t delta_cr = dr * 128 - delta_l; + int64_t delta_cb = db * 128 - delta_l; + + int64_t id = ((delta_l * delta_l) * 128) + + ((delta_cr * delta_cr) * 26) + + ((delta_cb * delta_cb) * 3); + + if (alpha) + { + int64_t da = (e1.a - e2.a); + id += (da * da) * 128; + } + + int d = (id + 8192) >> 14; + + return d; +#endif } else return color_distance(e1, e2, alpha); } + static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b) + { + const int dl = a.r - b.r; + const int da = a.a - b.a; + return dl * dl + da * 
da; + } + // String helpers inline int string_find_right(const std::string& filename, char c) @@ -929,7 +1162,7 @@ namespace basisu float m_priority; }; - std::vector<entry> m_heap; + basisu::vector<entry> m_heap; uint32_t m_size; // Push down entry at index @@ -961,7 +1194,7 @@ namespace basisu public: typedef TrainingVectorType training_vec_type; typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight; - typedef std::vector< training_vec_with_weight > array_of_weighted_training_vecs; + typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs; tree_vector_quant() : m_next_codebook_index(0) @@ -981,7 +1214,7 @@ namespace basisu const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; } array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; } - void retrieve(std::vector< std::vector<uint32_t> > &codebook) const + void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const { for (uint32_t i = 0; i < m_nodes.size(); i++) { @@ -994,7 +1227,7 @@ namespace basisu } } - void retrieve(std::vector<TrainingVectorType> &codebook) const + void retrieve(basisu::vector<TrainingVectorType> &codebook) const { for (uint32_t i = 0; i < m_nodes.size(); i++) { @@ -1007,7 +1240,7 @@ namespace basisu } } - void retrieve(uint32_t max_clusters, std::vector<uint_vec> &codebook) const + void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const { uint_vec node_stack; node_stack.reserve(512); @@ -1054,7 +1287,7 @@ namespace basisu priority_queue var_heap; var_heap.init(max_size, 0, m_nodes[0].m_var); - std::vector<uint32_t> l_children, r_children; + basisu::vector<uint32_t> l_children, r_children; // Now split the worst nodes l_children.reserve(m_training_vecs.size() + 1); @@ -1092,7 +1325,7 @@ namespace basisu inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { } // vecs is erased - inline void set(const TrainingVectorType &org, uint64_t weight, float var, std::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); } + inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); } inline bool is_leaf() const { return m_left_index < 0; } @@ -1100,11 +1333,11 @@ namespace basisu uint64_t m_weight; TrainingVectorType m_origin; int32_t m_left_index, m_right_index; - std::vector<uint32_t> m_training_vecs; + basisu::vector<uint32_t> m_training_vecs; int m_codebook_index; }; - typedef std::vector<tsvq_node> tsvq_node_vec; + typedef basisu::vector<tsvq_node> tsvq_node_vec; tsvq_node_vec m_nodes; array_of_weighted_training_vecs m_training_vecs; @@ -1139,7 +1372,7 @@ namespace basisu return root; } - bool split_node(uint32_t node_index, priority_queue &var_heap, std::vector<uint32_t> &l_children, std::vector<uint32_t> &r_children) + bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children) { TrainingVectorType l_child_org, r_child_org; uint64_t l_weight = 0, r_weight = 0; @@ -1239,7 +1472,7 @@ namespace basisu bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const { - const uint32_t N = TrainingVectorType::num_elements; + //const uint32_t N = TrainingVectorType::num_elements; if (2 == node.m_training_vecs.size()) { @@ -1304,7 
+1537,7 @@ namespace basisu if (largest_axis_index < 0) return false; - std::vector<float> keys(node.m_training_vecs.size()); + basisu::vector<float> keys(node.m_training_vecs.size()); for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index]; @@ -1352,8 +1585,8 @@ namespace basisu } bool refine_split(const tsvq_node &node, - TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, std::vector<uint32_t> &l_children, - TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, std::vector<uint32_t> &r_children) const + TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children, + TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const { l_children.reserve(node.m_training_vecs.size()); r_children.reserve(node.m_training_vecs.size()); @@ -1466,8 +1699,8 @@ namespace basisu template<typename Quantizer> bool generate_hierarchical_codebook_threaded_internal(Quantizer& q, uint32_t max_codebook_size, uint32_t max_parent_codebook_size, - std::vector<uint_vec>& codebook, - std::vector<uint_vec>& parent_codebook, + basisu::vector<uint_vec>& codebook, + basisu::vector<uint_vec>& parent_codebook, uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool) { codebook.resize(0); @@ -1493,7 +1726,7 @@ namespace basisu if (!q.generate(max_threads)) return false; - std::vector<uint_vec> initial_codebook; + basisu::vector<uint_vec> initial_codebook; q.retrieve(initial_codebook); @@ -1512,12 +1745,14 @@ namespace basisu bool success_flags[cMaxThreads]; clear_obj(success_flags); - std::vector<uint_vec> local_clusters[cMaxThreads]; - std::vector<uint_vec> local_parent_clusters[cMaxThreads]; + basisu::vector<uint_vec> local_clusters[cMaxThreads]; + basisu::vector<uint_vec> local_parent_clusters[cMaxThreads]; for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++) { +#ifndef __EMSCRIPTEN__ pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] { +#endif Quantizer& lq = quantizers[thread_iter]; uint_vec& cluster_indices = initial_codebook[thread_iter]; @@ -1558,11 +1793,15 @@ namespace basisu } } +#ifndef __EMSCRIPTEN__ } ); +#endif } // thread_iter +#ifndef __EMSCRIPTEN__ pJob_pool->wait_for_all(); +#endif uint32_t total_clusters = 0, total_parent_clusters = 0; @@ -1598,8 +1837,8 @@ namespace basisu template<typename Quantizer> bool generate_hierarchical_codebook_threaded(Quantizer& q, uint32_t max_codebook_size, uint32_t max_parent_codebook_size, - std::vector<uint_vec>& codebook, - std::vector<uint_vec>& parent_codebook, + basisu::vector<uint_vec>& codebook, + basisu::vector<uint_vec>& parent_codebook, uint32_t max_threads, job_pool *pJob_pool) { typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher; @@ -1629,7 +1868,7 @@ namespace basisu Quantizer group_quant; typedef typename group_hash::const_iterator group_hash_const_iter; - std::vector<group_hash_const_iter> unique_vec_iters; + basisu::vector<group_hash_const_iter> unique_vec_iters; unique_vec_iters.reserve(unique_vecs.size()); for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter) @@ -1644,7 +1883,7 @@ namespace basisu debug_printf("Limit clusterizers: %u\n", limit_clusterizers); - std::vector<uint_vec> group_codebook, group_parent_codebook; + 
basisu::vector<uint_vec> group_codebook, group_parent_codebook; bool status = generate_hierarchical_codebook_threaded_internal(group_quant, max_codebook_size, max_parent_codebook_size, group_codebook, @@ -1693,7 +1932,7 @@ namespace basisu class histogram { - std::vector<uint32_t> m_hist; + basisu::vector<uint32_t> m_hist; public: histogram(uint32_t size = 0) { init(size); } @@ -1754,7 +1993,8 @@ namespace basisu struct sym_freq { - uint16_t m_key, m_sym_index; + uint32_t m_key; + uint16_t m_sym_index; }; sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1); @@ -1835,7 +2075,7 @@ namespace basisu { if (m_bit_buffer_size) { - m_total_bits += 8; + m_total_bits += 8 - (m_bit_buffer_size & 7); append_byte(static_cast<uint8_t>(m_bit_buffer)); m_bit_buffer = 0; @@ -2107,6 +2347,12 @@ namespace basisu resize(w, h, p); } + image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) : + m_width(0), m_height(0), m_pitch(0) + { + init(pImage, width, height, comps); + } + image(const image &other) : m_width(0), m_height(0), m_pitch(0) { @@ -2155,6 +2401,47 @@ namespace basisu return *this; } + void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) + { + assert(comps >= 1 && comps <= 4); + + resize(width, height); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const uint8_t *pSrc = &pImage[(x + y * width) * comps]; + color_rgba &dst = (*this)(x, y); + + if (comps == 1) + { + dst.r = pSrc[0]; + dst.g = pSrc[0]; + dst.b = pSrc[0]; + dst.a = 255; + } + else if (comps == 2) + { + dst.r = pSrc[0]; + dst.g = pSrc[0]; + dst.b = pSrc[0]; + dst.a = pSrc[1]; + } + else + { + dst.r = pSrc[0]; + dst.g = pSrc[1]; + dst.b = pSrc[2]; + if (comps == 4) + dst.a = pSrc[3]; + else + dst.a = 255; + } + } + } + } + image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c) { for (uint32_t iy = 0; iy < h; iy++) @@ -2163,6 +2450,14 @@ namespace basisu return *this; } + image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c) + { + for (uint32_t iy = 0; iy < h; iy++) + for (uint32_t ix = 0; ix < w; ix++) + set_clipped_alpha(x + ix, y + iy, c); + return *this; + } + image &crop_dup_borders(uint32_t w, uint32_t h) { const uint32_t orig_w = m_width, orig_h = m_height; @@ -2252,6 +2547,13 @@ namespace basisu return *this; } + inline image& set_clipped_alpha(int x, int y, const color_rgba& c) + { + if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height)) + (*this)(x, y).m_comps[3] = c.m_comps[3]; + return *this; + } + // Very straightforward blit with full clipping. Not fast, but it works. 
image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y) { @@ -2376,6 +2678,8 @@ namespace basisu } return *this; } + + void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...); private: uint32_t m_width, m_height, m_pitch; // all in pixels @@ -2384,7 +2688,7 @@ namespace basisu // Float images - typedef std::vector<vec4F> vec4F_vec; + typedef basisu::vector<vec4F> vec4F_vec; class imagef { @@ -2635,10 +2939,27 @@ namespace basisu }; // Image saving/loading/resampling - + + bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr); bool load_png(const char* pFilename, image& img); inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); } + bool load_bmp(const char* pFilename, image& img); + inline bool load_bmp(const std::string &filename, image &img) { return load_bmp(filename.c_str(), img); } + + bool load_tga(const char* pFilename, image& img); + inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); } + + bool load_jpg(const char *pFilename, image& img); + inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); } + + // Currently loads .BMP, .PNG, or .TGA. + bool load_image(const char* pFilename, image& img); + inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); } + + uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans); + uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans); + enum { cImageSaveGrayscale = 1, @@ -2697,7 +3018,7 @@ namespace basisu template<typename T> class vector2D { - typedef std::vector<T> TVec; + typedef basisu::vector<T> TVec; uint32_t m_width, m_height; TVec m_values; @@ -2800,7 +3121,7 @@ namespace basisu } void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1); - + } // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_etc.cpp b/thirdparty/basis_universal/encoder/basisu_etc.cpp new file mode 100644 index 0000000000..232e8965b0 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_etc.cpp @@ -0,0 +1,1593 @@ +// basis_etc.cpp +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
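For reference, the renamed `encoder/basisu_enc.h` above now declares a small family of image loaders (`load_png`, `load_bmp`, `load_tga`, `load_jpg`, the extension-dispatching `load_image`, and the raw `read_tga` helpers). A minimal usage sketch, not part of the upstream diff; the include path and `"input.tga"` file name are placeholders:

```
// Illustrative sketch only: exercising the image-loading helpers declared in
// encoder/basisu_enc.h (include path and input file are placeholders).
#include "encoder/basisu_enc.h"
#include <cstdio>
#include <cstdlib>

int main()
{
	basisu::basisu_encoder_init(); // must be called before using the encoder library

	// Low-level TGA path: returns a malloc()'d pixel buffer plus its dimensions.
	int width = 0, height = 0, n_chans = 0;
	uint8_t* pPixels = basisu::read_tga("input.tga", width, height, n_chans);
	if (!pPixels)
		return EXIT_FAILURE;
	printf("decoded %dx%d, %d channels\n", width, height, n_chans);
	free(pPixels); // read_tga() allocates the buffer with malloc()

	// Higher-level path: load_image() dispatches on the extension (.BMP, .PNG or .TGA).
	basisu::image img;
	if (!basisu::load_image("input.tga", img))
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}
```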
+#include "basisu_etc.h" + +#if BASISU_SUPPORT_SSE +#define CPPSPMD_NAME(a) a##_sse41 +#include "basisu_kernels_declares.h" +#endif + +#define BASISU_DEBUG_ETC_ENCODER 0 +#define BASISU_DEBUG_ETC_ENCODER_DEEPER 0 + +namespace basisu +{ + const int8_t g_etc2_eac_tables[16][8] = + { + { -3, -6, -9, -15, 2, 5, 8, 14 }, { -3, -7, -10, -13, 2, 6, 9, 12 }, { -2, -5, -8, -13, 1, 4, 7, 12 }, { -2, -4, -6, -13, 1, 3, 5, 12 }, + { -3, -6, -8, -12, 2, 5, 7, 11 }, { -3, -7, -9, -11, 2, 6, 8, 10 }, { -4, -7, -8, -11, 3, 6, 7, 10 }, { -3, -5, -8, -11, 2, 4, 7, 10 }, + { -2, -6, -8, -10, 1, 5, 7, 9 }, { -2, -5, -8, -10, 1, 4, 7, 9 }, { -2, -4, -8, -10, 1, 3, 7, 9 }, { -2, -5, -7, -10, 1, 4, 6, 9 }, + { -3, -4, -7, -10, 2, 3, 6, 9 }, { -1, -2, -3, -10, 0, 1, 2, 9 }, { -4, -6, -8, -9, 3, 5, 7, 8 }, { -3, -5, -7, -9, 2, 4, 6, 8 } + }; + + const int8_t g_etc2_eac_tables8[16][8] = + { + { -24, -48, -72, -120, 16, 40, 64, 112 }, { -24,-56,-80,-104,16,48,72,96 }, { -16,-40,-64,-104,8,32,56,96 }, { -16,-32,-48,-104,8,24,40,96 }, + { -24,-48,-64,-96,16,40,56,88 }, { -24,-56,-72,-88,16,48,64,80 }, { -32,-56,-64,-88,24,48,56,80 }, { -24,-40,-64,-88,16,32,56,80 }, + { -16,-48,-64,-80,8,40,56,72 }, { -16,-40,-64,-80,8,32,56,72 }, { -16,-32,-64,-80,8,24,56,72 }, { -16,-40,-56,-80,8,32,48,72 }, + { -24,-32,-56,-80,16,24,48,72 }, { -8,-16,-24,-80,0,8,16,72 }, { -32,-48,-64,-72,24,40,56,64 }, { -24,-40,-56,-72,16,32,48,64 } + }; + + // Given an ETC1 diff/inten_table/selector, and an 8-bit desired color, this table encodes the best packed_color in the low byte, and the abs error in the high byte. + static uint16_t g_etc1_inverse_lookup[2 * 8 * 4][256]; // [ diff/inten_table/selector][desired_color ] + + // g_color8_to_etc_block_config[color][table_index] = Supplies for each 8-bit color value a list of packed ETC1 diff/intensity table/selectors/packed_colors that map to that color. + // To pack: diff | (inten << 1) | (selector << 4) | (packed_c << 8) + static const uint16_t g_etc1_color8_to_etc_block_config_0_255[2][33] = + { + { 0x0000, 0x0010, 0x0002, 0x0012, 0x0004, 0x0014, 0x0006, 0x0016, 0x0008, 0x0018, 0x000A, 0x001A, 0x000C, 0x001C, 0x000E, 0x001E, 0x0001, 0x0011, 0x0003, 0x0013, 0x0005, 0x0015, 0x0007, 0x0017, 0x0009, 0x0019, 0x000B, 0x001B, 0x000D, 0x001D, 0x000F, 0x001F, 0xFFFF }, + { 0x0F20, 0x0F30, 0x0E32, 0x0F22, 0x0E34, 0x0F24, 0x0D36, 0x0F26, 0x0C38, 0x0E28, 0x0B3A, 0x0E2A, 0x093C, 0x0E2C, 0x053E, 0x0D2E, 0x1E31, 0x1F21, 0x1D33, 0x1F23, 0x1C35, 0x1E25, 0x1A37, 0x1E27, 0x1839, 0x1D29, 0x163B, 0x1C2B, 0x133D, 0x1B2D, 0x093F, 0x1A2F, 0xFFFF }, + }; + + // Really only [254][11]. 
+ static const uint16_t g_etc1_color8_to_etc_block_config_1_to_254[254][12] = + { + { 0x021C, 0x0D0D, 0xFFFF }, { 0x0020, 0x0021, 0x0A0B, 0x061F, 0xFFFF }, { 0x0113, 0x0217, 0xFFFF }, { 0x0116, 0x031E, 0x0B0E, 0x0405, 0xFFFF }, { 0x0022, 0x0204, 0x050A, 0x0023, 0xFFFF }, { 0x0111, 0x0319, 0x0809, 0x170F, 0xFFFF }, { + 0x0303, 0x0215, 0x0607, 0xFFFF }, { 0x0030, 0x0114, 0x0408, 0x0031, 0x0201, 0x051D, 0xFFFF }, { 0x0100, 0x0024, 0x0306, 0x0025, 0x041B, 0x0E0D, 0xFFFF }, { 0x021A, 0x0121, 0x0B0B, 0x071F, 0xFFFF }, { 0x0213, 0x0317, 0xFFFF }, { 0x0112, + 0x0505, 0xFFFF }, { 0x0026, 0x070C, 0x0123, 0x0027, 0xFFFF }, { 0x0211, 0x0909, 0xFFFF }, { 0x0110, 0x0315, 0x0707, 0x0419, 0x180F, 0xFFFF }, { 0x0218, 0x0131, 0x0301, 0x0403, 0x061D, 0xFFFF }, { 0x0032, 0x0202, 0x0033, 0x0125, 0x051B, + 0x0F0D, 0xFFFF }, { 0x0028, 0x031C, 0x0221, 0x0029, 0xFFFF }, { 0x0120, 0x0313, 0x0C0B, 0x081F, 0xFFFF }, { 0x0605, 0x0417, 0xFFFF }, { 0x0216, 0x041E, 0x0C0E, 0x0223, 0x0127, 0xFFFF }, { 0x0122, 0x0304, 0x060A, 0x0311, 0x0A09, 0xFFFF + }, { 0x0519, 0x190F, 0xFFFF }, { 0x002A, 0x0231, 0x0503, 0x0415, 0x0807, 0x002B, 0x071D, 0xFFFF }, { 0x0130, 0x0214, 0x0508, 0x0401, 0x0133, 0x0225, 0x061B, 0xFFFF }, { 0x0200, 0x0124, 0x0406, 0x0321, 0x0129, 0x100D, 0xFFFF }, { 0x031A, + 0x0D0B, 0x091F, 0xFFFF }, { 0x0413, 0x0705, 0x0517, 0xFFFF }, { 0x0212, 0x0034, 0x0323, 0x0035, 0x0227, 0xFFFF }, { 0x0126, 0x080C, 0x0B09, 0xFFFF }, { 0x0411, 0x0619, 0x1A0F, 0xFFFF }, { 0x0210, 0x0331, 0x0603, 0x0515, 0x0907, 0x012B, + 0xFFFF }, { 0x0318, 0x002C, 0x0501, 0x0233, 0x0325, 0x071B, 0x002D, 0x081D, 0xFFFF }, { 0x0132, 0x0302, 0x0229, 0x110D, 0xFFFF }, { 0x0128, 0x041C, 0x0421, 0x0E0B, 0x0A1F, 0xFFFF }, { 0x0220, 0x0513, 0x0617, 0xFFFF }, { 0x0135, 0x0805, + 0x0327, 0xFFFF }, { 0x0316, 0x051E, 0x0D0E, 0x0423, 0xFFFF }, { 0x0222, 0x0404, 0x070A, 0x0511, 0x0719, 0x0C09, 0x1B0F, 0xFFFF }, { 0x0703, 0x0615, 0x0A07, 0x022B, 0xFFFF }, { 0x012A, 0x0431, 0x0601, 0x0333, 0x012D, 0x091D, 0xFFFF }, { + 0x0230, 0x0314, 0x0036, 0x0608, 0x0425, 0x0037, 0x0329, 0x081B, 0x120D, 0xFFFF }, { 0x0300, 0x0224, 0x0506, 0x0521, 0x0F0B, 0x0B1F, 0xFFFF }, { 0x041A, 0x0613, 0x0717, 0xFFFF }, { 0x0235, 0x0905, 0xFFFF }, { 0x0312, 0x0134, 0x0523, + 0x0427, 0xFFFF }, { 0x0226, 0x090C, 0x002E, 0x0611, 0x0D09, 0x002F, 0xFFFF }, { 0x0715, 0x0B07, 0x0819, 0x032B, 0x1C0F, 0xFFFF }, { 0x0310, 0x0531, 0x0701, 0x0803, 0x022D, 0x0A1D, 0xFFFF }, { 0x0418, 0x012C, 0x0433, 0x0525, 0x0137, 0x091B, + 0x130D, 0xFFFF }, { 0x0232, 0x0402, 0x0621, 0x0429, 0xFFFF }, { 0x0228, 0x051C, 0x0713, 0x100B, 0x0C1F, 0xFFFF }, { 0x0320, 0x0335, 0x0A05, 0x0817, 0xFFFF }, { 0x0623, 0x0527, 0xFFFF }, { 0x0416, 0x061E, 0x0E0E, 0x0711, 0x0E09, 0x012F, + 0xFFFF }, { 0x0322, 0x0504, 0x080A, 0x0919, 0x1D0F, 0xFFFF }, { 0x0631, 0x0903, 0x0815, 0x0C07, 0x042B, 0x032D, 0x0B1D, 0xFFFF }, { 0x022A, 0x0801, 0x0533, 0x0625, 0x0237, 0x0A1B, 0xFFFF }, { 0x0330, 0x0414, 0x0136, 0x0708, 0x0721, 0x0529, + 0x140D, 0xFFFF }, { 0x0400, 0x0324, 0x0606, 0x0038, 0x0039, 0x110B, 0x0D1F, 0xFFFF }, { 0x051A, 0x0813, 0x0B05, 0x0917, 0xFFFF }, { 0x0723, 0x0435, 0x0627, 0xFFFF }, { 0x0412, 0x0234, 0x0F09, 0x022F, 0xFFFF }, { 0x0326, 0x0A0C, 0x012E, + 0x0811, 0x0A19, 0x1E0F, 0xFFFF }, { 0x0731, 0x0A03, 0x0915, 0x0D07, 0x052B, 0xFFFF }, { 0x0410, 0x0901, 0x0633, 0x0725, 0x0337, 0x0B1B, 0x042D, 0x0C1D, 0xFFFF }, { 0x0518, 0x022C, 0x0629, 0x150D, 0xFFFF }, { 0x0332, 0x0502, 0x0821, 0x0139, + 0x120B, 0x0E1F, 0xFFFF }, { 0x0328, 0x061C, 0x0913, 0x0A17, 0xFFFF }, { 0x0420, 0x0535, 0x0C05, 0x0727, 0xFFFF }, { 0x0823, 0x032F, 
0xFFFF }, { 0x0516, 0x071E, 0x0F0E, 0x0911, 0x0B19, 0x1009, 0x1F0F, 0xFFFF }, { 0x0422, 0x0604, 0x090A, + 0x0B03, 0x0A15, 0x0E07, 0x062B, 0xFFFF }, { 0x0831, 0x0A01, 0x0733, 0x052D, 0x0D1D, 0xFFFF }, { 0x032A, 0x0825, 0x0437, 0x0729, 0x0C1B, 0x160D, 0xFFFF }, { 0x0430, 0x0514, 0x0236, 0x0808, 0x0921, 0x0239, 0x130B, 0x0F1F, 0xFFFF }, { 0x0500, + 0x0424, 0x0706, 0x0138, 0x0A13, 0x0B17, 0xFFFF }, { 0x061A, 0x0635, 0x0D05, 0xFFFF }, { 0x0923, 0x0827, 0xFFFF }, { 0x0512, 0x0334, 0x003A, 0x0A11, 0x1109, 0x003B, 0x042F, 0xFFFF }, { 0x0426, 0x0B0C, 0x022E, 0x0B15, 0x0F07, 0x0C19, + 0x072B, 0xFFFF }, { 0x0931, 0x0B01, 0x0C03, 0x062D, 0x0E1D, 0xFFFF }, { 0x0510, 0x0833, 0x0925, 0x0537, 0x0D1B, 0x170D, 0xFFFF }, { 0x0618, 0x032C, 0x0A21, 0x0339, 0x0829, 0xFFFF }, { 0x0432, 0x0602, 0x0B13, 0x140B, 0x101F, 0xFFFF }, { + 0x0428, 0x071C, 0x0735, 0x0E05, 0x0C17, 0xFFFF }, { 0x0520, 0x0A23, 0x0927, 0xFFFF }, { 0x0B11, 0x1209, 0x013B, 0x052F, 0xFFFF }, { 0x0616, 0x081E, 0x0D19, 0xFFFF }, { 0x0522, 0x0704, 0x0A0A, 0x0A31, 0x0D03, 0x0C15, 0x1007, 0x082B, 0x072D, + 0x0F1D, 0xFFFF }, { 0x0C01, 0x0933, 0x0A25, 0x0637, 0x0E1B, 0xFFFF }, { 0x042A, 0x0B21, 0x0929, 0x180D, 0xFFFF }, { 0x0530, 0x0614, 0x0336, 0x0908, 0x0439, 0x150B, 0x111F, 0xFFFF }, { 0x0600, 0x0524, 0x0806, 0x0238, 0x0C13, 0x0F05, + 0x0D17, 0xFFFF }, { 0x071A, 0x0B23, 0x0835, 0x0A27, 0xFFFF }, { 0x1309, 0x023B, 0x062F, 0xFFFF }, { 0x0612, 0x0434, 0x013A, 0x0C11, 0x0E19, 0xFFFF }, { 0x0526, 0x0C0C, 0x032E, 0x0B31, 0x0E03, 0x0D15, 0x1107, 0x092B, 0xFFFF }, { 0x0D01, + 0x0A33, 0x0B25, 0x0737, 0x0F1B, 0x082D, 0x101D, 0xFFFF }, { 0x0610, 0x0A29, 0x190D, 0xFFFF }, { 0x0718, 0x042C, 0x0C21, 0x0539, 0x160B, 0x121F, 0xFFFF }, { 0x0532, 0x0702, 0x0D13, 0x0E17, 0xFFFF }, { 0x0528, 0x081C, 0x0935, 0x1005, 0x0B27, + 0xFFFF }, { 0x0620, 0x0C23, 0x033B, 0x072F, 0xFFFF }, { 0x0D11, 0x0F19, 0x1409, 0xFFFF }, { 0x0716, 0x003C, 0x091E, 0x0F03, 0x0E15, 0x1207, 0x0A2B, 0x003D, 0xFFFF }, { 0x0622, 0x0804, 0x0B0A, 0x0C31, 0x0E01, 0x0B33, 0x092D, 0x111D, + 0xFFFF }, { 0x0C25, 0x0837, 0x0B29, 0x101B, 0x1A0D, 0xFFFF }, { 0x052A, 0x0D21, 0x0639, 0x170B, 0x131F, 0xFFFF }, { 0x0630, 0x0714, 0x0436, 0x0A08, 0x0E13, 0x0F17, 0xFFFF }, { 0x0700, 0x0624, 0x0906, 0x0338, 0x0A35, 0x1105, 0xFFFF }, { + 0x081A, 0x0D23, 0x0C27, 0xFFFF }, { 0x0E11, 0x1509, 0x043B, 0x082F, 0xFFFF }, { 0x0712, 0x0534, 0x023A, 0x0F15, 0x1307, 0x1019, 0x0B2B, 0x013D, 0xFFFF }, { 0x0626, 0x0D0C, 0x042E, 0x0D31, 0x0F01, 0x1003, 0x0A2D, 0x121D, 0xFFFF }, { 0x0C33, + 0x0D25, 0x0937, 0x111B, 0x1B0D, 0xFFFF }, { 0x0710, 0x0E21, 0x0739, 0x0C29, 0xFFFF }, { 0x0818, 0x052C, 0x0F13, 0x180B, 0x141F, 0xFFFF }, { 0x0632, 0x0802, 0x0B35, 0x1205, 0x1017, 0xFFFF }, { 0x0628, 0x091C, 0x0E23, 0x0D27, 0xFFFF }, { + 0x0720, 0x0F11, 0x1609, 0x053B, 0x092F, 0xFFFF }, { 0x1119, 0x023D, 0xFFFF }, { 0x0816, 0x013C, 0x0A1E, 0x0E31, 0x1103, 0x1015, 0x1407, 0x0C2B, 0x0B2D, 0x131D, 0xFFFF }, { 0x0722, 0x0904, 0x0C0A, 0x1001, 0x0D33, 0x0E25, 0x0A37, 0x121B, + 0xFFFF }, { 0x0F21, 0x0D29, 0x1C0D, 0xFFFF }, { 0x062A, 0x0839, 0x190B, 0x151F, 0xFFFF }, { 0x0730, 0x0814, 0x0536, 0x0B08, 0x1013, 0x1305, 0x1117, 0xFFFF }, { 0x0800, 0x0724, 0x0A06, 0x0438, 0x0F23, 0x0C35, 0x0E27, 0xFFFF }, { 0x091A, + 0x1709, 0x063B, 0x0A2F, 0xFFFF }, { 0x1011, 0x1219, 0x033D, 0xFFFF }, { 0x0812, 0x0634, 0x033A, 0x0F31, 0x1203, 0x1115, 0x1507, 0x0D2B, 0xFFFF }, { 0x0726, 0x0E0C, 0x052E, 0x1101, 0x0E33, 0x0F25, 0x0B37, 0x131B, 0x0C2D, 0x141D, 0xFFFF }, { + 0x0E29, 0x1D0D, 0xFFFF }, { 0x0810, 0x1021, 0x0939, 0x1A0B, 0x161F, 0xFFFF }, { 0x0918, 0x062C, 0x1113, 
0x1217, 0xFFFF }, { 0x0732, 0x0902, 0x0D35, 0x1405, 0x0F27, 0xFFFF }, { 0x0728, 0x0A1C, 0x1023, 0x073B, 0x0B2F, 0xFFFF }, { 0x0820, + 0x1111, 0x1319, 0x1809, 0xFFFF }, { 0x1303, 0x1215, 0x1607, 0x0E2B, 0x043D, 0xFFFF }, { 0x0916, 0x023C, 0x0B1E, 0x1031, 0x1201, 0x0F33, 0x0D2D, 0x151D, 0xFFFF }, { 0x0822, 0x0A04, 0x0D0A, 0x1025, 0x0C37, 0x0F29, 0x141B, 0x1E0D, 0xFFFF }, { + 0x1121, 0x0A39, 0x1B0B, 0x171F, 0xFFFF }, { 0x072A, 0x1213, 0x1317, 0xFFFF }, { 0x0830, 0x0914, 0x0636, 0x0C08, 0x0E35, 0x1505, 0xFFFF }, { 0x0900, 0x0824, 0x0B06, 0x0538, 0x1123, 0x1027, 0xFFFF }, { 0x0A1A, 0x1211, 0x1909, 0x083B, 0x0C2F, + 0xFFFF }, { 0x1315, 0x1707, 0x1419, 0x0F2B, 0x053D, 0xFFFF }, { 0x0912, 0x0734, 0x043A, 0x1131, 0x1301, 0x1403, 0x0E2D, 0x161D, 0xFFFF }, { 0x0826, 0x0F0C, 0x062E, 0x1033, 0x1125, 0x0D37, 0x151B, 0x1F0D, 0xFFFF }, { 0x1221, 0x0B39, 0x1029, + 0xFFFF }, { 0x0910, 0x1313, 0x1C0B, 0x181F, 0xFFFF }, { 0x0A18, 0x072C, 0x0F35, 0x1605, 0x1417, 0xFFFF }, { 0x0832, 0x0A02, 0x1223, 0x1127, 0xFFFF }, { 0x0828, 0x0B1C, 0x1311, 0x1A09, 0x093B, 0x0D2F, 0xFFFF }, { 0x0920, 0x1519, 0x063D, + 0xFFFF }, { 0x1231, 0x1503, 0x1415, 0x1807, 0x102B, 0x0F2D, 0x171D, 0xFFFF }, { 0x0A16, 0x033C, 0x0C1E, 0x1401, 0x1133, 0x1225, 0x0E37, 0x161B, 0xFFFF }, { 0x0922, 0x0B04, 0x0E0A, 0x1321, 0x1129, 0xFFFF }, { 0x0C39, 0x1D0B, 0x191F, 0xFFFF + }, { 0x082A, 0x1413, 0x1705, 0x1517, 0xFFFF }, { 0x0930, 0x0A14, 0x0736, 0x0D08, 0x1323, 0x1035, 0x1227, 0xFFFF }, { 0x0A00, 0x0924, 0x0C06, 0x0638, 0x1B09, 0x0A3B, 0x0E2F, 0xFFFF }, { 0x0B1A, 0x1411, 0x1619, 0x073D, 0xFFFF }, { 0x1331, + 0x1603, 0x1515, 0x1907, 0x112B, 0xFFFF }, { 0x0A12, 0x0834, 0x053A, 0x1501, 0x1233, 0x1325, 0x0F37, 0x171B, 0x102D, 0x181D, 0xFFFF }, { 0x0926, 0x072E, 0x1229, 0xFFFF }, { 0x1421, 0x0D39, 0x1E0B, 0x1A1F, 0xFFFF }, { 0x0A10, 0x1513, + 0x1617, 0xFFFF }, { 0x0B18, 0x082C, 0x1135, 0x1805, 0x1327, 0xFFFF }, { 0x0932, 0x0B02, 0x1423, 0x0B3B, 0x0F2F, 0xFFFF }, { 0x0928, 0x0C1C, 0x1511, 0x1719, 0x1C09, 0xFFFF }, { 0x0A20, 0x1703, 0x1615, 0x1A07, 0x122B, 0x083D, 0xFFFF }, { + 0x1431, 0x1601, 0x1333, 0x112D, 0x191D, 0xFFFF }, { 0x0B16, 0x043C, 0x0D1E, 0x1425, 0x1037, 0x1329, 0x181B, 0xFFFF }, { 0x0A22, 0x0C04, 0x0F0A, 0x1521, 0x0E39, 0x1F0B, 0x1B1F, 0xFFFF }, { 0x1613, 0x1717, 0xFFFF }, { 0x092A, 0x1235, 0x1905, + 0xFFFF }, { 0x0A30, 0x0B14, 0x0836, 0x0E08, 0x1523, 0x1427, 0xFFFF }, { 0x0B00, 0x0A24, 0x0D06, 0x0738, 0x1611, 0x1D09, 0x0C3B, 0x102F, 0xFFFF }, { 0x0C1A, 0x1715, 0x1B07, 0x1819, 0x132B, 0x093D, 0xFFFF }, { 0x1531, 0x1701, 0x1803, 0x122D, + 0x1A1D, 0xFFFF }, { 0x0B12, 0x0934, 0x063A, 0x1433, 0x1525, 0x1137, 0x191B, 0xFFFF }, { 0x0A26, 0x003E, 0x082E, 0x1621, 0x0F39, 0x1429, 0x003F, 0xFFFF }, { 0x1713, 0x1C1F, 0xFFFF }, { 0x0B10, 0x1335, 0x1A05, 0x1817, 0xFFFF }, { 0x0C18, + 0x092C, 0x1623, 0x1527, 0xFFFF }, { 0x0A32, 0x0C02, 0x1711, 0x1E09, 0x0D3B, 0x112F, 0xFFFF }, { 0x0A28, 0x0D1C, 0x1919, 0x0A3D, 0xFFFF }, { 0x0B20, 0x1631, 0x1903, 0x1815, 0x1C07, 0x142B, 0x132D, 0x1B1D, 0xFFFF }, { 0x1801, 0x1533, 0x1625, + 0x1237, 0x1A1B, 0xFFFF }, { 0x0C16, 0x053C, 0x0E1E, 0x1721, 0x1529, 0x013F, 0xFFFF }, { 0x0B22, 0x0D04, 0x1039, 0x1D1F, 0xFFFF }, { 0x1813, 0x1B05, 0x1917, 0xFFFF }, { 0x0A2A, 0x1723, 0x1435, 0x1627, 0xFFFF }, { 0x0B30, 0x0C14, 0x0936, + 0x0F08, 0x1F09, 0x0E3B, 0x122F, 0xFFFF }, { 0x0C00, 0x0B24, 0x0E06, 0x0838, 0x1811, 0x1A19, 0x0B3D, 0xFFFF }, { 0x0D1A, 0x1731, 0x1A03, 0x1915, 0x1D07, 0x152B, 0xFFFF }, { 0x1901, 0x1633, 0x1725, 0x1337, 0x1B1B, 0x142D, 0x1C1D, 0xFFFF }, { + 0x0C12, 0x0A34, 0x073A, 0x1629, 0x023F, 0xFFFF }, { 0x0B26, 
0x013E, 0x092E, 0x1821, 0x1139, 0x1E1F, 0xFFFF }, { 0x1913, 0x1A17, 0xFFFF }, { 0x0C10, 0x1535, 0x1C05, 0x1727, 0xFFFF }, { 0x0D18, 0x0A2C, 0x1823, 0x0F3B, 0x132F, 0xFFFF }, { + 0x0B32, 0x0D02, 0x1911, 0x1B19, 0xFFFF }, { 0x0B28, 0x0E1C, 0x1B03, 0x1A15, 0x1E07, 0x162B, 0x0C3D, 0xFFFF }, { 0x0C20, 0x1831, 0x1A01, 0x1733, 0x152D, 0x1D1D, 0xFFFF }, { 0x1825, 0x1437, 0x1729, 0x1C1B, 0x033F, 0xFFFF }, { 0x0D16, 0x063C, + 0x0F1E, 0x1921, 0x1239, 0x1F1F, 0xFFFF }, { 0x0C22, 0x0E04, 0x1A13, 0x1B17, 0xFFFF }, { 0x1635, 0x1D05, 0xFFFF }, { 0x0B2A, 0x1923, 0x1827, 0xFFFF }, { 0x0C30, 0x0D14, 0x0A36, 0x1A11, 0x103B, 0x142F, 0xFFFF }, { 0x0D00, 0x0C24, 0x0F06, + 0x0938, 0x1B15, 0x1F07, 0x1C19, 0x172B, 0x0D3D, 0xFFFF }, { 0x0E1A, 0x1931, 0x1B01, 0x1C03, 0x162D, 0x1E1D, 0xFFFF }, { 0x1833, 0x1925, 0x1537, 0x1D1B, 0xFFFF }, { 0x0D12, 0x0B34, 0x083A, 0x1A21, 0x1339, 0x1829, 0x043F, 0xFFFF }, { 0x0C26, + 0x023E, 0x0A2E, 0x1B13, 0xFFFF }, { 0x1735, 0x1E05, 0x1C17, 0xFFFF }, { 0x0D10, 0x1A23, 0x1927, 0xFFFF }, { 0x0E18, 0x0B2C, 0x1B11, 0x113B, 0x152F, 0xFFFF }, { 0x0C32, 0x0E02, 0x1D19, 0x0E3D, 0xFFFF }, { 0x0C28, 0x0F1C, 0x1A31, 0x1D03, + 0x1C15, 0x182B, 0x172D, 0x1F1D, 0xFFFF }, { 0x0D20, 0x1C01, 0x1933, 0x1A25, 0x1637, 0x1E1B, 0xFFFF }, { 0x1B21, 0x1929, 0x053F, 0xFFFF }, { 0x0E16, 0x073C, 0x1439, 0xFFFF }, { 0x0D22, 0x0F04, 0x1C13, 0x1F05, 0x1D17, 0xFFFF }, { 0x1B23, + 0x1835, 0x1A27, 0xFFFF }, { 0x0C2A, 0x123B, 0x162F, 0xFFFF }, { 0x0D30, 0x0E14, 0x0B36, 0x1C11, 0x1E19, 0x0F3D, 0xFFFF }, { 0x0E00, 0x0D24, 0x0A38, 0x1B31, 0x1E03, 0x1D15, 0x192B, 0xFFFF }, { 0x0F1A, 0x1D01, 0x1A33, 0x1B25, 0x1737, 0x1F1B, + 0x182D, 0xFFFF }, { 0x1A29, 0x063F, 0xFFFF }, { 0x0E12, 0x0C34, 0x093A, 0x1C21, 0x1539, 0xFFFF }, { 0x0D26, 0x033E, 0x0B2E, 0x1D13, 0x1E17, 0xFFFF }, { 0x1935, 0x1B27, 0xFFFF }, { 0x0E10, 0x1C23, 0x133B, 0x172F, 0xFFFF }, { 0x0F18, + 0x0C2C, 0x1D11, 0x1F19, 0xFFFF }, { 0x0D32, 0x0F02, 0x1F03, 0x1E15, 0x1A2B, 0x103D, 0xFFFF }, { 0x0D28, 0x1C31, 0x1E01, 0x1B33, 0x192D, 0xFFFF }, { 0x0E20, 0x1C25, 0x1837, 0x1B29, 0x073F, 0xFFFF }, { 0x1D21, 0x1639, 0xFFFF }, { 0x0F16, + 0x083C, 0x1E13, 0x1F17, 0xFFFF }, { 0x0E22, 0x1A35, 0xFFFF }, { 0x1D23, 0x1C27, 0xFFFF }, { 0x0D2A, 0x1E11, 0x143B, 0x182F, 0xFFFF }, { 0x0E30, 0x0F14, 0x0C36, 0x1F15, 0x1B2B, 0x113D, 0xFFFF }, { 0x0F00, 0x0E24, 0x0B38, 0x1D31, 0x1F01, + 0x1A2D, 0xFFFF }, { 0x1C33, 0x1D25, 0x1937, 0xFFFF }, { 0x1E21, 0x1739, 0x1C29, 0x083F, 0xFFFF }, { 0x0F12, 0x0D34, 0x0A3A, 0x1F13, 0xFFFF }, { 0x0E26, 0x043E, 0x0C2E, 0x1B35, 0xFFFF }, { 0x1E23, 0x1D27, 0xFFFF }, { 0x0F10, 0x1F11, 0x153B, 0x192F, 0xFFFF }, { 0x0D2C, 0x123D, 0xFFFF }, + }; + + static uint32_t etc1_decode_value(uint32_t diff, uint32_t inten, uint32_t selector, uint32_t packed_c) + { + const uint32_t limit = diff ? 32 : 16; + BASISU_NOTE_UNUSED(limit); + assert((diff < 2) && (inten < 8) && (selector < 4) && (packed_c < limit)); + int c; + if (diff) + c = (packed_c >> 2) | (packed_c << 3); + else + c = packed_c | (packed_c << 4); + c += g_etc1_inten_tables[inten][selector]; + c = clamp<int>(c, 0, 255); + return c; + } + + void pack_etc1_solid_color_init() + { + for (uint32_t diff = 0; diff < 2; diff++) + { + const uint32_t limit = diff ? 
32 : 16; + + for (uint32_t inten = 0; inten < 8; inten++) + { + for (uint32_t selector = 0; selector < 4; selector++) + { + const uint32_t inverse_table_index = diff + (inten << 1) + (selector << 4); + for (uint32_t color = 0; color < 256; color++) + { + uint32_t best_error = UINT32_MAX, best_packed_c = 0; + for (uint32_t packed_c = 0; packed_c < limit; packed_c++) + { + int v = etc1_decode_value(diff, inten, selector, packed_c); + uint32_t err = (uint32_t)labs(v - static_cast<int>(color)); + if (err < best_error) + { + best_error = err; + best_packed_c = packed_c; + if (!best_error) + break; + } + } + assert(best_error <= 255); + g_etc1_inverse_lookup[inverse_table_index][color] = static_cast<uint16_t>(best_packed_c | (best_error << 8)); + } + } + } + } + } + + // Packs solid color blocks efficiently using a set of small precomputed tables. + // For random 888 inputs, MSE results are better than Erricson's ETC1 packer in "slow" mode ~9.5% of the time, is slightly worse only ~.01% of the time, and is equal the rest of the time. + uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor) + { + assert(g_etc1_inverse_lookup[0][255]); + + static uint32_t s_next_comp[4] = { 1, 2, 0, 1 }; + + uint32_t best_error = UINT32_MAX, best_i = 0; + int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0; + + // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error. + for (uint32_t i = 0; i < 3; i++) + { + const uint32_t c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]]; + + const int delta_range = 1; + for (int delta = -delta_range; delta <= delta_range; delta++) + { + const int c_plus_delta = clamp<int>(pColor[i] + delta, 0, 255); + + const uint16_t* pTable; + if (!c_plus_delta) + pTable = g_etc1_color8_to_etc_block_config_0_255[0]; + else if (c_plus_delta == 255) + pTable = g_etc1_color8_to_etc_block_config_0_255[1]; + else + pTable = g_etc1_color8_to_etc_block_config_1_to_254[c_plus_delta - 1]; + + do + { + const uint32_t x = *pTable++; + +#ifdef _DEBUG + const uint32_t diff = x & 1; + const uint32_t inten = (x >> 1) & 7; + const uint32_t selector = (x >> 4) & 3; + const uint32_t p0 = (x >> 8) & 255; + assert(etc1_decode_value(diff, inten, selector, p0) == (uint32_t)c_plus_delta); +#endif + + const uint16_t* pInverse_table = g_etc1_inverse_lookup[x & 0xFF]; + uint16_t p1 = pInverse_table[c1]; + uint16_t p2 = pInverse_table[c2]; + const uint32_t trial_error = square(c_plus_delta - pColor[i]) + square(p1 >> 8) + square(p2 >> 8); + if (trial_error < best_error) + { + best_error = trial_error; + best_x = x; + best_packed_c1 = p1 & 0xFF; + best_packed_c2 = p2 & 0xFF; + best_i = i; + if (!best_error) + goto found_perfect_match; + } + } while (*pTable != 0xFFFF); + } + } + found_perfect_match: + + const uint32_t diff = best_x & 1; + const uint32_t inten = (best_x >> 1) & 7; + + block.m_bytes[3] = static_cast<uint8_t>(((inten | (inten << 3)) << 2) | (diff << 1)); + + const uint32_t etc1_selector = g_selector_index_to_etc1[(best_x >> 4) & 3]; + *reinterpret_cast<uint16_t*>(&block.m_bytes[4]) = (etc1_selector & 2) ? 0xFFFF : 0; + *reinterpret_cast<uint16_t*>(&block.m_bytes[6]) = (etc1_selector & 1) ? 
0xFFFF : 0; + + const uint32_t best_packed_c0 = (best_x >> 8) & 255; + if (diff) + { + block.m_bytes[best_i] = static_cast<uint8_t>(best_packed_c0 << 3); + block.m_bytes[s_next_comp[best_i]] = static_cast<uint8_t>(best_packed_c1 << 3); + block.m_bytes[s_next_comp[best_i + 1]] = static_cast<uint8_t>(best_packed_c2 << 3); + } + else + { + block.m_bytes[best_i] = static_cast<uint8_t>(best_packed_c0 | (best_packed_c0 << 4)); + block.m_bytes[s_next_comp[best_i]] = static_cast<uint8_t>(best_packed_c1 | (best_packed_c1 << 4)); + block.m_bytes[s_next_comp[best_i + 1]] = static_cast<uint8_t>(best_packed_c2 | (best_packed_c2 << 4)); + } + + return best_error; + } + + const uint32_t BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE = 165; + + static const struct { uint8_t m_v[4]; } g_cluster_fit_order_tab[BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE] = + { + { { 0, 0, 0, 8 } },{ { 0, 5, 2, 1 } },{ { 0, 6, 1, 1 } },{ { 0, 7, 0, 1 } },{ { 0, 7, 1, 0 } }, + { { 0, 0, 8, 0 } },{ { 0, 0, 3, 5 } },{ { 0, 1, 7, 0 } },{ { 0, 0, 4, 4 } },{ { 0, 0, 2, 6 } }, + { { 0, 0, 7, 1 } },{ { 0, 0, 1, 7 } },{ { 0, 0, 5, 3 } },{ { 1, 6, 0, 1 } },{ { 0, 0, 6, 2 } }, + { { 0, 2, 6, 0 } },{ { 2, 4, 2, 0 } },{ { 0, 3, 5, 0 } },{ { 3, 3, 1, 1 } },{ { 4, 2, 0, 2 } }, + { { 1, 5, 2, 0 } },{ { 0, 5, 3, 0 } },{ { 0, 6, 2, 0 } },{ { 2, 4, 1, 1 } },{ { 5, 1, 0, 2 } }, + { { 6, 1, 1, 0 } },{ { 3, 3, 0, 2 } },{ { 6, 0, 0, 2 } },{ { 0, 8, 0, 0 } },{ { 6, 1, 0, 1 } }, + { { 0, 1, 6, 1 } },{ { 1, 6, 1, 0 } },{ { 4, 1, 3, 0 } },{ { 0, 2, 5, 1 } },{ { 5, 0, 3, 0 } }, + { { 5, 3, 0, 0 } },{ { 0, 1, 5, 2 } },{ { 0, 3, 4, 1 } },{ { 2, 5, 1, 0 } },{ { 1, 7, 0, 0 } }, + { { 0, 1, 4, 3 } },{ { 6, 0, 2, 0 } },{ { 0, 4, 4, 0 } },{ { 2, 6, 0, 0 } },{ { 0, 2, 4, 2 } }, + { { 0, 5, 1, 2 } },{ { 0, 6, 0, 2 } },{ { 3, 5, 0, 0 } },{ { 0, 4, 3, 1 } },{ { 3, 4, 1, 0 } }, + { { 4, 3, 1, 0 } },{ { 1, 5, 0, 2 } },{ { 0, 3, 3, 2 } },{ { 1, 4, 1, 2 } },{ { 0, 4, 2, 2 } }, + { { 2, 3, 3, 0 } },{ { 4, 4, 0, 0 } },{ { 1, 2, 4, 1 } },{ { 0, 5, 0, 3 } },{ { 0, 1, 3, 4 } }, + { { 1, 5, 1, 1 } },{ { 1, 4, 2, 1 } },{ { 1, 3, 2, 2 } },{ { 5, 2, 1, 0 } },{ { 1, 3, 3, 1 } }, + { { 0, 1, 2, 5 } },{ { 1, 1, 5, 1 } },{ { 0, 3, 2, 3 } },{ { 2, 5, 0, 1 } },{ { 3, 2, 2, 1 } }, + { { 2, 3, 0, 3 } },{ { 1, 4, 3, 0 } },{ { 2, 2, 1, 3 } },{ { 6, 2, 0, 0 } },{ { 1, 0, 6, 1 } }, + { { 3, 3, 2, 0 } },{ { 7, 1, 0, 0 } },{ { 3, 1, 4, 0 } },{ { 0, 2, 3, 3 } },{ { 0, 4, 1, 3 } }, + { { 0, 4, 0, 4 } },{ { 0, 1, 0, 7 } },{ { 2, 0, 5, 1 } },{ { 2, 0, 4, 2 } },{ { 3, 0, 2, 3 } }, + { { 2, 2, 4, 0 } },{ { 2, 2, 3, 1 } },{ { 4, 0, 3, 1 } },{ { 3, 2, 3, 0 } },{ { 2, 3, 2, 1 } }, + { { 1, 3, 4, 0 } },{ { 7, 0, 1, 0 } },{ { 3, 0, 4, 1 } },{ { 1, 0, 5, 2 } },{ { 8, 0, 0, 0 } }, + { { 3, 0, 1, 4 } },{ { 4, 1, 1, 2 } },{ { 4, 0, 2, 2 } },{ { 1, 2, 5, 0 } },{ { 4, 2, 1, 1 } }, + { { 3, 4, 0, 1 } },{ { 2, 0, 3, 3 } },{ { 5, 0, 1, 2 } },{ { 5, 0, 0, 3 } },{ { 2, 4, 0, 2 } }, + { { 2, 1, 4, 1 } },{ { 4, 0, 1, 3 } },{ { 2, 1, 5, 0 } },{ { 4, 2, 2, 0 } },{ { 4, 0, 4, 0 } }, + { { 1, 0, 4, 3 } },{ { 1, 4, 0, 3 } },{ { 3, 0, 3, 2 } },{ { 4, 3, 0, 1 } },{ { 0, 1, 1, 6 } }, + { { 1, 3, 1, 3 } },{ { 0, 2, 2, 4 } },{ { 2, 0, 2, 4 } },{ { 5, 1, 1, 1 } },{ { 3, 0, 5, 0 } }, + { { 2, 3, 1, 2 } },{ { 3, 0, 0, 5 } },{ { 0, 3, 1, 4 } },{ { 5, 0, 2, 1 } },{ { 2, 1, 3, 2 } }, + { { 2, 0, 6, 0 } },{ { 3, 1, 3, 1 } },{ { 5, 1, 2, 0 } },{ { 1, 0, 3, 4 } },{ { 1, 1, 6, 0 } }, + { { 4, 0, 0, 4 } },{ { 2, 0, 1, 5 } },{ { 0, 3, 0, 5 } },{ { 1, 3, 0, 4 } },{ { 4, 1, 2, 1 } }, + { { 1, 2, 3, 2 } },{ { 3, 1, 0, 4 } },{ { 5, 2, 0, 1 } },{ { 
1, 2, 2, 3 } },{ { 3, 2, 1, 2 } }, + { { 2, 2, 2, 2 } },{ { 6, 0, 1, 1 } },{ { 1, 2, 1, 4 } },{ { 1, 1, 4, 2 } },{ { 3, 2, 0, 3 } }, + { { 1, 2, 0, 5 } },{ { 1, 0, 7, 0 } },{ { 3, 1, 2, 2 } },{ { 1, 0, 2, 5 } },{ { 2, 0, 0, 6 } }, + { { 2, 1, 1, 4 } },{ { 2, 2, 0, 4 } },{ { 1, 1, 3, 3 } },{ { 7, 0, 0, 1 } },{ { 1, 0, 0, 7 } }, + { { 2, 1, 2, 3 } },{ { 4, 1, 0, 3 } },{ { 3, 1, 1, 3 } },{ { 1, 1, 2, 4 } },{ { 2, 1, 0, 5 } }, + { { 1, 0, 1, 6 } },{ { 0, 2, 1, 5 } },{ { 0, 2, 0, 6 } },{ { 1, 1, 1, 5 } },{ { 1, 1, 0, 6 } } + }; + + const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = + { + { -8, -2, 2, 8 }, { -17, -5, 5, 17 }, { -29, -9, 9, 29 }, { -42, -13, 13, 42 }, + { -60, -18, 18, 60 }, { -80, -24, 24, 80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 } + }; + + const uint8_t g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 }; + const uint8_t g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 }; + + // [flip][subblock][pixel_index] + const etc_coord2 g_etc1_pixel_coords[2][2][8] = + { + { + { + { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, + { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 } + }, + { + { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, + { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 } + } + }, + { + { + { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, + { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } + }, + { + { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 }, + { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 } + }, + } + }; + + // [flip][subblock][pixel_index] + const uint32_t g_etc1_pixel_indices[2][2][8] = + { + { + { + 0 + 4 * 0, 0 + 4 * 1, 0 + 4 * 2, 0 + 4 * 3, + 1 + 4 * 0, 1 + 4 * 1, 1 + 4 * 2, 1 + 4 * 3 + }, + { + 2 + 4 * 0, 2 + 4 * 1, 2 + 4 * 2, 2 + 4 * 3, + 3 + 4 * 0, 3 + 4 * 1, 3 + 4 * 2, 3 + 4 * 3 + } + }, + { + { + 0 + 4 * 0, 1 + 4 * 0, 2 + 4 * 0, 3 + 4 * 0, + 0 + 4 * 1, 1 + 4 * 1, 2 + 4 * 1, 3 + 4 * 1 + }, + { + 0 + 4 * 2, 1 + 4 * 2, 2 + 4 * 2, 3 + 4 * 2, + 0 + 4 * 3, 1 + 4 * 3, 2 + 4 * 3, 3 + 4 * 3 + }, + } + }; + + uint16_t etc_block::pack_color5(const color_rgba& color, bool scaled, uint32_t bias) + { + return pack_color5(color.r, color.g, color.b, scaled, bias); + } + + uint16_t etc_block::pack_color5(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias) + { + if (scaled) + { + r = (r * 31U + bias) / 255U; + g = (g * 31U + bias) / 255U; + b = (b * 31U + bias) / 255U; + } + + r = minimum(r, 31U); + g = minimum(g, 31U); + b = minimum(b, 31U); + + return static_cast<uint16_t>(b | (g << 5U) | (r << 10U)); + } + + color_rgba etc_block::unpack_color5(uint16_t packed_color5, bool scaled, uint32_t alpha) + { + uint32_t b = packed_color5 & 31U; + uint32_t g = (packed_color5 >> 5U) & 31U; + uint32_t r = (packed_color5 >> 10U) & 31U; + + if (scaled) + { + b = (b << 3U) | (b >> 2U); + g = (g << 3U) | (g >> 2U); + r = (r << 3U) | (r >> 2U); + } + + return color_rgba(cNoClamp, r, g, b, minimum(alpha, 255U)); + } + + void etc_block::unpack_color5(color_rgba& result, uint16_t packed_color5, bool scaled) + { + result = unpack_color5(packed_color5, scaled, 255); + } + + void etc_block::unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, bool scaled) + { + color_rgba c(unpack_color5(packed_color5, scaled, 0)); + r = c.r; + g = c.g; + b = c.b; + } + + bool etc_block::unpack_color5(color_rgba& result, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha) + { + color_rgba_i16 dc(unpack_delta3(packed_delta3)); + + int b = (packed_color5 & 31U) + dc.b; + int g = ((packed_color5 >> 5U) & 31U) + dc.g; + int r = ((packed_color5 >> 10U) & 31U) + dc.r; + 
+ bool success = true; + if (static_cast<uint32_t>(r | g | b) > 31U) + { + success = false; + r = clamp<int>(r, 0, 31); + g = clamp<int>(g, 0, 31); + b = clamp<int>(b, 0, 31); + } + + if (scaled) + { + b = (b << 3U) | (b >> 2U); + g = (g << 3U) | (g >> 2U); + r = (r << 3U) | (r >> 2U); + } + + result.set_noclamp_rgba(r, g, b, minimum(alpha, 255U)); + return success; + } + + bool etc_block::unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha) + { + color_rgba result; + const bool success = unpack_color5(result, packed_color5, packed_delta3, scaled, alpha); + r = result.r; + g = result.g; + b = result.b; + return success; + } + + uint16_t etc_block::pack_delta3(const color_rgba_i16& color) + { + return pack_delta3(color.r, color.g, color.b); + } + + uint16_t etc_block::pack_delta3(int r, int g, int b) + { + assert((r >= cETC1ColorDeltaMin) && (r <= cETC1ColorDeltaMax)); + assert((g >= cETC1ColorDeltaMin) && (g <= cETC1ColorDeltaMax)); + assert((b >= cETC1ColorDeltaMin) && (b <= cETC1ColorDeltaMax)); + if (r < 0) r += 8; + if (g < 0) g += 8; + if (b < 0) b += 8; + return static_cast<uint16_t>(b | (g << 3) | (r << 6)); + } + + color_rgba_i16 etc_block::unpack_delta3(uint16_t packed_delta3) + { + int r = (packed_delta3 >> 6) & 7; + int g = (packed_delta3 >> 3) & 7; + int b = packed_delta3 & 7; + if (r >= 4) r -= 8; + if (g >= 4) g -= 8; + if (b >= 4) b -= 8; + return color_rgba_i16(r, g, b, 255); + } + + void etc_block::unpack_delta3(int& r, int& g, int& b, uint16_t packed_delta3) + { + r = (packed_delta3 >> 6) & 7; + g = (packed_delta3 >> 3) & 7; + b = packed_delta3 & 7; + if (r >= 4) r -= 8; + if (g >= 4) g -= 8; + if (b >= 4) b -= 8; + } + + uint16_t etc_block::pack_color4(const color_rgba& color, bool scaled, uint32_t bias) + { + return pack_color4(color.r, color.g, color.b, scaled, bias); + } + + uint16_t etc_block::pack_color4(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias) + { + if (scaled) + { + r = (r * 15U + bias) / 255U; + g = (g * 15U + bias) / 255U; + b = (b * 15U + bias) / 255U; + } + + r = minimum(r, 15U); + g = minimum(g, 15U); + b = minimum(b, 15U); + + return static_cast<uint16_t>(b | (g << 4U) | (r << 8U)); + } + + color_rgba etc_block::unpack_color4(uint16_t packed_color4, bool scaled, uint32_t alpha) + { + uint32_t b = packed_color4 & 15U; + uint32_t g = (packed_color4 >> 4U) & 15U; + uint32_t r = (packed_color4 >> 8U) & 15U; + + if (scaled) + { + b = (b << 4U) | b; + g = (g << 4U) | g; + r = (r << 4U) | r; + } + + return color_rgba(cNoClamp, r, g, b, minimum(alpha, 255U)); + } + + void etc_block::unpack_color4(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color4, bool scaled) + { + color_rgba c(unpack_color4(packed_color4, scaled, 0)); + r = c.r; + g = c.g; + b = c.b; + } + + void etc_block::get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint32_t table_idx) + { + assert(table_idx < cETC1IntenModifierValues); + const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0]; + + uint32_t r, g, b; + unpack_color5(r, g, b, packed_color5, true); + + const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b); + + const int y0 = pInten_modifer_table[0]; + pDst[0].set(ir + y0, ig + y0, ib + y0, 255); + + const int y1 = pInten_modifer_table[1]; + pDst[1].set(ir + y1, ig + y1, ib + y1, 255); + + const int y2 = pInten_modifer_table[2]; + pDst[2].set(ir + y2, ig + y2, ib + y2, 255); + + const int y3 = pInten_modifer_table[3]; + 
pDst[3].set(ir + y3, ig + y3, ib + y3, 255); + } + + bool etc_block::get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint16_t packed_delta3, uint32_t table_idx) + { + assert(table_idx < cETC1IntenModifierValues); + const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0]; + + uint32_t r, g, b; + bool success = unpack_color5(r, g, b, packed_color5, packed_delta3, true); + + const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b); + + const int y0 = pInten_modifer_table[0]; + pDst[0].set(ir + y0, ig + y0, ib + y0, 255); + + const int y1 = pInten_modifer_table[1]; + pDst[1].set(ir + y1, ig + y1, ib + y1, 255); + + const int y2 = pInten_modifer_table[2]; + pDst[2].set(ir + y2, ig + y2, ib + y2, 255); + + const int y3 = pInten_modifer_table[3]; + pDst[3].set(ir + y3, ig + y3, ib + y3, 255); + + return success; + } + + void etc_block::get_abs_subblock_colors(color_rgba* pDst, uint16_t packed_color4, uint32_t table_idx) + { + assert(table_idx < cETC1IntenModifierValues); + const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0]; + + uint32_t r, g, b; + unpack_color4(r, g, b, packed_color4, true); + + const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b); + + const int y0 = pInten_modifer_table[0]; + pDst[0].set(ir + y0, ig + y0, ib + y0, 255); + + const int y1 = pInten_modifer_table[1]; + pDst[1].set(ir + y1, ig + y1, ib + y1, 255); + + const int y2 = pInten_modifer_table[2]; + pDst[2].set(ir + y2, ig + y2, ib + y2, 255); + + const int y3 = pInten_modifer_table[3]; + pDst[3].set(ir + y3, ig + y3, ib + y3, 255); + } + + bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha) + { + const bool diff_flag = block.get_diff_bit(); + const bool flip_flag = block.get_flip_bit(); + const uint32_t table_index0 = block.get_inten_table(0); + const uint32_t table_index1 = block.get_inten_table(1); + + color_rgba subblock_colors0[4]; + color_rgba subblock_colors1[4]; + + if (diff_flag) + { + const uint16_t base_color5 = block.get_base5_color(); + const uint16_t delta_color3 = block.get_delta3_color(); + etc_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0); + + if (!etc_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1)) + return false; + } + else + { + const uint16_t base_color4_0 = block.get_base4_color(0); + etc_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0); + + const uint16_t base_color4_1 = block.get_base4_color(1); + etc_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1); + } + + if (preserve_alpha) + { + if (flip_flag) + { + for (uint32_t y = 0; y < 2; y++) + { + pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]); + pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]); + pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]); + pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]); + pDst += 4; + } + + for (uint32_t y = 2; y < 4; y++) + { + pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]); + pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]); + pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]); + pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]); + pDst += 4; + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]); + pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]); + 
pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]); + pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]); + pDst += 4; + } + } + } + else + { + if (flip_flag) + { + // 0000 + // 0000 + // 1111 + // 1111 + for (uint32_t y = 0; y < 2; y++) + { + pDst[0] = subblock_colors0[block.get_selector(0, y)]; + pDst[1] = subblock_colors0[block.get_selector(1, y)]; + pDst[2] = subblock_colors0[block.get_selector(2, y)]; + pDst[3] = subblock_colors0[block.get_selector(3, y)]; + pDst += 4; + } + + for (uint32_t y = 2; y < 4; y++) + { + pDst[0] = subblock_colors1[block.get_selector(0, y)]; + pDst[1] = subblock_colors1[block.get_selector(1, y)]; + pDst[2] = subblock_colors1[block.get_selector(2, y)]; + pDst[3] = subblock_colors1[block.get_selector(3, y)]; + pDst += 4; + } + } + else + { + // 0011 + // 0011 + // 0011 + // 0011 + for (uint32_t y = 0; y < 4; y++) + { + pDst[0] = subblock_colors0[block.get_selector(0, y)]; + pDst[1] = subblock_colors0[block.get_selector(1, y)]; + pDst[2] = subblock_colors1[block.get_selector(2, y)]; + pDst[3] = subblock_colors1[block.get_selector(3, y)]; + pDst += 4; + } + } + } + + return true; + } + + inline int extend_6_to_8(uint32_t n) + { + return (n << 2) | (n >> 4); + } + + inline int extend_7_to_8(uint32_t n) + { + return (n << 1) | (n >> 6); + } + + inline int extend_4_to_8(uint32_t n) + { + return (n << 4) | n; + } + + uint64_t etc_block::evaluate_etc1_error(const color_rgba* pBlock_pixels, bool perceptual, int subblock_index) const + { + color_rgba unpacked_block[16]; + + unpack_etc1(*this, unpacked_block); + + uint64_t total_error = 0; + + if (subblock_index < 0) + { + for (uint32_t i = 0; i < 16; i++) + total_error += color_distance(perceptual, pBlock_pixels[i], unpacked_block[i], false); + } + else + { + const bool flip_bit = get_flip_bit(); + + for (uint32_t i = 0; i < 8; i++) + { + const uint32_t idx = g_etc1_pixel_indices[flip_bit][subblock_index][i]; + + total_error += color_distance(perceptual, pBlock_pixels[idx], unpacked_block[idx], false); + } + } + + return total_error; + } + + void etc_block::get_subblock_pixels(color_rgba* pPixels, int subblock_index) const + { + if (subblock_index < 0) + unpack_etc1(*this, pPixels); + else + { + color_rgba unpacked_block[16]; + + unpack_etc1(*this, unpacked_block); + + const bool flip_bit = get_flip_bit(); + + for (uint32_t i = 0; i < 8; i++) + { + const uint32_t idx = g_etc1_pixel_indices[flip_bit][subblock_index][i]; + + pPixels[i] = unpacked_block[idx]; + } + } + } + + bool etc1_optimizer::compute() + { + assert(m_pResult->m_pSelectors); + + if (m_pParams->m_pForce_selectors) + { + assert(m_pParams->m_quality >= cETCQualitySlow); + if (m_pParams->m_quality < cETCQualitySlow) + return false; + } + + const uint32_t n = m_pParams->m_num_src_pixels; + + if (m_pParams->m_cluster_fit) + { + if (m_pParams->m_quality == cETCQualityFast) + compute_internal_cluster_fit(4); + else if (m_pParams->m_quality == cETCQualityMedium) + compute_internal_cluster_fit(16); + else if (m_pParams->m_quality == cETCQualitySlow) + compute_internal_cluster_fit(64); + else + compute_internal_cluster_fit(BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE); + } + else + compute_internal_neighborhood(m_br, m_bg, m_bb); + + if (!m_best_solution.m_valid) + { + m_pResult->m_error = UINT32_MAX; + return false; + } + + //const uint8_t* pSelectors = &m_best_solution.m_selectors[0]; + const uint8_t* pSelectors = m_pParams->m_pForce_selectors ? 
m_pParams->m_pForce_selectors : &m_best_solution.m_selectors[0]; + +#if defined(DEBUG) || defined(_DEBUG) + { + // sanity check the returned error + color_rgba block_colors[4]; + m_best_solution.m_coords.get_block_colors(block_colors); + + const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels; + uint64_t actual_error = 0; + + bool perceptual; + if (m_pParams->m_quality >= cETCQualityMedium) + perceptual = m_pParams->m_perceptual; + else + perceptual = (m_pParams->m_quality == cETCQualityFast) ? false : m_pParams->m_perceptual; + + for (uint32_t i = 0; i < n; i++) + actual_error += color_distance(perceptual, pSrc_pixels[i], block_colors[pSelectors[i]], false); + + assert(actual_error == m_best_solution.m_error); + } +#endif + + m_pResult->m_error = m_best_solution.m_error; + + m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color; + m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4; + + m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table; + memcpy(m_pResult->m_pSelectors, pSelectors, n); + m_pResult->m_n = n; + + return true; + } + + void etc1_optimizer::refine_solution(uint32_t max_refinement_trials) + { + // Now we have the input block, the avg. color of the input pixels, a set of trial selector indices, and the block color+intensity index. + // Now, for each component, attempt to refine the current solution by solving a simple linear equation. For example, for 4 colors: + // The goal is: + // pixel0 - (block_color+inten_table[selector0]) + pixel1 - (block_color+inten_table[selector1]) + pixel2 - (block_color+inten_table[selector2]) + pixel3 - (block_color+inten_table[selector3]) = 0 + // Rearranging this: + // (pixel0 + pixel1 + pixel2 + pixel3) - (block_color+inten_table[selector0]) - (block_color+inten_table[selector1]) - (block_color+inten_table[selector2]) - (block_color+inten_table[selector3]) = 0 + // (pixel0 + pixel1 + pixel2 + pixel3) - block_color - inten_table[selector0] - block_color-inten_table[selector1] - block_color-inten_table[selector2] - block_color-inten_table[selector3] = 0 + // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - inten_table[selector0] - inten_table[selector1] - inten_table[selector2] - inten_table[selector3] = 0 + // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3]) = 0 + // (pixel0 + pixel1 + pixel2 + pixel3)/4 - block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 = 0 + // block_color = (pixel0 + pixel1 + pixel2 + pixel3)/4 - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 + // So what this means: + // optimal_block_color = avg_input - avg_inten_delta + // So the optimal block color can be computed by taking the average block color and subtracting the current average of the intensity delta. + // Unfortunately, optimal_block_color must then be quantized to 555 or 444 so it's not always possible to improve matters using this formula. + // Also, the above formula is for unclamped intensity deltas. The actual implementation takes into account clamping. 
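A compact restatement of the derivation in the comment above (illustrative only; the loop that follows is the actual upstream implementation):

```latex
% Setting the summed per-pixel residual to zero and solving for the block color:
\sum_{i=1}^{n}\left(\mathrm{pixel}_i-\left(\mathrm{block\_color}+\mathrm{inten}[s_i]\right)\right)=0
\quad\Longrightarrow\quad
\mathrm{block\_color}=\frac{1}{n}\sum_{i=1}^{n}\mathrm{pixel}_i-\frac{1}{n}\sum_{i=1}^{n}\mathrm{inten}[s_i]
```

In the code below, each inten[s_i] term is replaced by the clamped delta actually applied to the pixel, and the resulting color is re-quantized to the 4- or 5-bit lattice (m_limit), which is why an improvement is not guaranteed on every refinement trial.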
+ + const uint32_t n = m_pParams->m_num_src_pixels; + + for (uint32_t refinement_trial = 0; refinement_trial < max_refinement_trials; refinement_trial++) + { + const uint8_t* pSelectors = &m_best_solution.m_selectors[0]; + const int* pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table]; + + int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0; + const color_rgba base_color(m_best_solution.m_coords.get_scaled_color()); + for (uint32_t r = 0; r < n; r++) + { + const uint32_t s = *pSelectors++; + const int yd_temp = pInten_table[s]; + // Compute actual delta being applied to each pixel, taking into account clamping. + delta_sum_r += clamp<int>(base_color.r + yd_temp, 0, 255) - base_color.r; + delta_sum_g += clamp<int>(base_color.g + yd_temp, 0, 255) - base_color.g; + delta_sum_b += clamp<int>(base_color.b + yd_temp, 0, 255) - base_color.b; + } + + if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b)) + break; + + const float avg_delta_r_f = static_cast<float>(delta_sum_r) / n; + const float avg_delta_g_f = static_cast<float>(delta_sum_g) / n; + const float avg_delta_b_f = static_cast<float>(delta_sum_b) / n; + const int br1 = clamp<int>(static_cast<int32_t>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit); + const int bg1 = clamp<int>(static_cast<int32_t>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit); + const int bb1 = clamp<int>(static_cast<int32_t>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit); + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Refinement trial %u, avg_delta %f %f %f\n", refinement_trial, avg_delta_r_f, avg_delta_g_f, avg_delta_b_f); +#endif + + if (!evaluate_solution(etc1_solution_coordinates(br1, bg1, bb1, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution)) + break; + + } // refinement_trial + } + + void etc1_optimizer::compute_internal_neighborhood(int scan_r, int scan_g, int scan_b) + { + if (m_best_solution.m_error == 0) + return; + + //const uint32_t n = m_pParams->m_num_src_pixels; + const int scan_delta_size = m_pParams->m_scan_delta_size; + + // Scan through a subset of the 3D lattice centered around the avg block color trying each 3D (555 or 444) lattice point as a potential block color. + // Each time a better solution is found try to refine the current solution's block color based of the current selectors and intensity table index. + for (int zdi = 0; zdi < scan_delta_size; zdi++) + { + const int zd = m_pParams->m_pScan_deltas[zdi]; + const int mbb = scan_b + zd; + if (mbb < 0) continue; else if (mbb > m_limit) break; + + for (int ydi = 0; ydi < scan_delta_size; ydi++) + { + const int yd = m_pParams->m_pScan_deltas[ydi]; + const int mbg = scan_g + yd; + if (mbg < 0) continue; else if (mbg > m_limit) break; + + for (int xdi = 0; xdi < scan_delta_size; xdi++) + { + const int xd = m_pParams->m_pScan_deltas[xdi]; + const int mbr = scan_r + xd; + if (mbr < 0) continue; else if (mbr > m_limit) break; + + etc1_solution_coordinates coords(mbr, mbg, mbb, 0, m_pParams->m_use_color4); + + if (!evaluate_solution(coords, m_trial_solution, &m_best_solution)) + continue; + + if (m_pParams->m_refinement) + { + refine_solution((m_pParams->m_quality == cETCQualityFast) ? 2 : (((xd | yd | zd) == 0) ? 
4 : 2)); + } + + } // xdi + } // ydi + } // zdi + } + + void etc1_optimizer::compute_internal_cluster_fit(uint32_t total_perms_to_try) + { + if ((!m_best_solution.m_valid) || ((m_br != m_best_solution.m_coords.m_unscaled_color.r) || (m_bg != m_best_solution.m_coords.m_unscaled_color.g) || (m_bb != m_best_solution.m_coords.m_unscaled_color.b))) + { + evaluate_solution(etc1_solution_coordinates(m_br, m_bg, m_bb, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution); + } + + if ((m_best_solution.m_error == 0) || (!m_best_solution.m_valid)) + return; + + for (uint32_t i = 0; i < total_perms_to_try; i++) + { + int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0; + + const int *pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table]; + const color_rgba base_color(m_best_solution.m_coords.get_scaled_color()); + + const uint8_t *pNum_selectors = g_cluster_fit_order_tab[i].m_v; + + for (uint32_t q = 0; q < 4; q++) + { + const int yd_temp = pInten_table[q]; + + delta_sum_r += pNum_selectors[q] * (clamp<int>(base_color.r + yd_temp, 0, 255) - base_color.r); + delta_sum_g += pNum_selectors[q] * (clamp<int>(base_color.g + yd_temp, 0, 255) - base_color.g); + delta_sum_b += pNum_selectors[q] * (clamp<int>(base_color.b + yd_temp, 0, 255) - base_color.b); + } + + if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b)) + continue; + + const float avg_delta_r_f = static_cast<float>(delta_sum_r) / 8; + const float avg_delta_g_f = static_cast<float>(delta_sum_g) / 8; + const float avg_delta_b_f = static_cast<float>(delta_sum_b) / 8; + + const int br1 = clamp<int>(static_cast<int32_t>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit); + const int bg1 = clamp<int>(static_cast<int32_t>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit); + const int bb1 = clamp<int>(static_cast<int32_t>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit); + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Second refinement trial %u, avg_delta %f %f %f\n", i, avg_delta_r_f, avg_delta_g_f, avg_delta_b_f); +#endif + + evaluate_solution(etc1_solution_coordinates(br1, bg1, bb1, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution); + + if (m_best_solution.m_error == 0) + break; + } + } + + void etc1_optimizer::init(const params& params, results& result) + { + m_pParams = ¶ms; + m_pResult = &result; + + const uint32_t n = m_pParams->m_num_src_pixels; + + m_selectors.resize(n); + m_best_selectors.resize(n); + m_temp_selectors.resize(n); + m_trial_solution.m_selectors.resize(n); + m_best_solution.m_selectors.resize(n); + + m_limit = m_pParams->m_use_color4 ? 
15 : 31; + + vec3F avg_color(0.0f); + + m_luma.resize(n); + m_sorted_luma_indices.resize(n); + m_sorted_luma.resize(n); + + int min_r = 255, min_g = 255, min_b = 255; + int max_r = 0, max_g = 0, max_b = 0; + + for (uint32_t i = 0; i < n; i++) + { + const color_rgba& c = m_pParams->m_pSrc_pixels[i]; + + min_r = basisu::minimum<int>(min_r, c.r); + min_g = basisu::minimum<int>(min_g, c.g); + min_b = basisu::minimum<int>(min_b, c.b); + + max_r = basisu::maximum<int>(max_r, c.r); + max_g = basisu::maximum<int>(max_g, c.g); + max_b = basisu::maximum<int>(max_b, c.b); + + const vec3F fc(c.r, c.g, c.b); + + avg_color += fc; + + m_luma[i] = static_cast<uint16_t>(c.r + c.g + c.b); + m_sorted_luma_indices[i] = i; + } + avg_color /= static_cast<float>(n); + m_avg_color = avg_color; + m_max_comp_spread = basisu::maximum(basisu::maximum(max_r - min_r, max_g - min_g), max_b - min_b); + + m_br = clamp<int>(static_cast<uint32_t>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit); + m_bg = clamp<int>(static_cast<uint32_t>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit); + m_bb = clamp<int>(static_cast<uint32_t>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit); + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Avg block color: %u %u %u\n", m_br, m_bg, m_bb); +#endif + + if (m_pParams->m_quality == cETCQualityFast) + { + indirect_sort(n, &m_sorted_luma_indices[0], &m_luma[0]); + + m_pSorted_luma = &m_sorted_luma[0]; + m_pSorted_luma_indices = &m_sorted_luma_indices[0]; + + for (uint32_t i = 0; i < n; i++) + m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]]; + } + + m_best_solution.m_coords.clear(); + m_best_solution.m_valid = false; + m_best_solution.m_error = UINT64_MAX; + + clear_obj(m_solutions_tried); + } + + // Return false if we've probably already tried this solution, true if we have definitely not. 
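The redundancy check that follows uses a small two-hash Bloom filter over a 1024-bit table (cSolutionsTriedHashBits = 10 in the header), so candidate block colors that were very likely evaluated already can be skipped cheaply. A standalone sketch of the same idea, with hypothetical names and assuming any 32-bit hash of the candidate color (not upstream code):

```cpp
// Minimal sketch: a k=2 Bloom filter over 1024 bits.
// test_and_set() returns false if the key was probably seen before,
// true if it was definitely not (false positives possible, no false negatives).
#include <cstdint>

struct solution_filter
{
    uint8_t m_bits[1024 / 8] = {}; // 128 bytes, cleared per block

    bool test_and_set(uint32_t hash)
    {
        const uint32_t h0 = hash & 1023;         // low 10 bits
        const uint32_t h1 = (hash >> 10) & 1023; // next 10 bits

        const bool seen = ((m_bits[h0 >> 3] & (1u << (h0 & 7))) != 0) &&
                          ((m_bits[h1 >> 3] & (1u << (h1 & 7))) != 0);

        m_bits[h0 >> 3] |= uint8_t(1u << (h0 & 7));
        m_bits[h1 >> 3] |= uint8_t(1u << (h1 & 7));

        return !seen;
    }
};
```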
+ bool etc1_optimizer::check_for_redundant_solution(const etc1_solution_coordinates& coords) + { + // Hash first 3 bytes of color (RGB) + uint32_t kh = hash_hsieh((uint8_t*)&coords.m_unscaled_color.r, 3); + + uint32_t h0 = kh & cSolutionsTriedHashMask; + uint32_t h1 = (kh >> cSolutionsTriedHashBits) & cSolutionsTriedHashMask; + + // Simple Bloom filter lookup with k=2 + if ( ((m_solutions_tried[h0 >> 3] & (1 << (h0 & 7))) != 0) && + ((m_solutions_tried[h1 >> 3] & (1 << (h1 & 7))) != 0) ) + return false; + + m_solutions_tried[h0 >> 3] |= (1 << (h0 & 7)); + m_solutions_tried[h1 >> 3] |= (1 << (h1 & 7)); + + return true; + } + + static uint8_t g_eval_dist_tables[8][256] = + { + // 99% threshold + { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,}, + { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,}, + { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,}, + { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,}, + { 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,}, + { 
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,}, + { 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,}, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,} + }; + + bool etc1_optimizer::evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution) + { + if (!check_for_redundant_solution(coords)) + return false; + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Eval solution: %u %u %u\n", coords.m_unscaled_color.r, coords.m_unscaled_color.g, coords.m_unscaled_color.b); +#endif + + trial_solution.m_valid = false; + + if (m_pParams->m_constrain_against_base_color5) + { + const int dr = (int)coords.m_unscaled_color.r - (int)m_pParams->m_base_color5.r; + const int dg = (int)coords.m_unscaled_color.g - (int)m_pParams->m_base_color5.g; + const int db = (int)coords.m_unscaled_color.b - (int)m_pParams->m_base_color5.b; + + if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax)) + { +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Eval failed due to constraint from %u %u %u\n", m_pParams->m_base_color5.r, m_pParams->m_base_color5.g, m_pParams->m_base_color5.b); +#endif + return false; + } + } + + const color_rgba base_color(coords.get_scaled_color()); + + const uint32_t n = m_pParams->m_num_src_pixels; + assert(trial_solution.m_selectors.size() == n); + + trial_solution.m_error = INT64_MAX; + + const uint8_t *pSelectors_to_use = m_pParams->m_pForce_selectors; + + for (uint32_t inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++) + { + if (m_pParams->m_quality <= cETCQualityMedium) + { + if (!g_eval_dist_tables[inten_table][m_max_comp_spread]) + continue; + } +#if 0 + if (m_pParams->m_quality <= cETCQualityMedium) + { + // For tables 5-7, if the max component spread falls within certain ranges, skip the inten table. Statistically they are extremely unlikely to result in lower error. 
+ if (inten_table == 7) + { + if (m_max_comp_spread < 42) + continue; + } + else if (inten_table == 6) + { + if ((m_max_comp_spread >= 12) && (m_max_comp_spread <= 31)) + continue; + } + else if (inten_table == 5) + { + if ((m_max_comp_spread >= 13) && (m_max_comp_spread <= 21)) + continue; + } + } +#endif + + const int* pInten_table = g_etc1_inten_tables[inten_table]; + + color_rgba block_colors[4]; + for (uint32_t s = 0; s < 4; s++) + { + const int yd = pInten_table[s]; + block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 255); + } + + uint64_t total_error = 0; + + const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels; + + if (!g_cpu_supports_sse41) + { + for (uint32_t c = 0; c < n; c++) + { + const color_rgba& src_pixel = *pSrc_pixels++; + + uint32_t best_selector_index = 0; + uint32_t best_error = 0; + + if (pSelectors_to_use) + { + best_selector_index = pSelectors_to_use[c]; + best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[best_selector_index], false); + } + else + { + best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[0], false); + + uint32_t trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[1], false); + if (trial_error < best_error) + { + best_error = trial_error; + best_selector_index = 1; + } + + trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[2], false); + if (trial_error < best_error) + { + best_error = trial_error; + best_selector_index = 2; + } + + trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[3], false); + if (trial_error < best_error) + { + best_error = trial_error; + best_selector_index = 3; + } + } + + m_temp_selectors[c] = static_cast<uint8_t>(best_selector_index); + + total_error += best_error; + if (total_error >= trial_solution.m_error) + break; + } + } + else + { +#if BASISU_SUPPORT_SSE + if (pSelectors_to_use) + { + if (m_pParams->m_perceptual) + perceptual_distance_rgb_4_N_sse41((int64_t*)&total_error, pSelectors_to_use, block_colors, pSrc_pixels, n, trial_solution.m_error); + else + linear_distance_rgb_4_N_sse41((int64_t*)&total_error, pSelectors_to_use, block_colors, pSrc_pixels, n, trial_solution.m_error); + } + else + { + if (m_pParams->m_perceptual) + find_selectors_perceptual_rgb_4_N_sse41((int64_t*)&total_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, trial_solution.m_error); + else + find_selectors_linear_rgb_4_N_sse41((int64_t*)&total_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, trial_solution.m_error); + } +#endif + } + + if (total_error < trial_solution.m_error) + { + trial_solution.m_error = total_error; + trial_solution.m_coords.m_inten_table = inten_table; + trial_solution.m_selectors.swap(m_temp_selectors); + trial_solution.m_valid = true; + } + } + trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color; + trial_solution.m_coords.m_color4 = m_pParams->m_use_color4; + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Eval done: %u error: %I64u best error so far: %I64u\n", (trial_solution.m_error < pBest_solution->m_error), trial_solution.m_error, pBest_solution->m_error); +#endif + + bool success = false; + if (pBest_solution) + { + if (trial_solution.m_error < pBest_solution->m_error) + { + *pBest_solution = trial_solution; + success = true; + } + } + + return success; + } + + bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution) + { + if 
(!check_for_redundant_solution(coords)) + return false; + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Eval solution fast: %u %u %u\n", coords.m_unscaled_color.r, coords.m_unscaled_color.g, coords.m_unscaled_color.b); +#endif + + if (m_pParams->m_constrain_against_base_color5) + { + const int dr = (int)coords.m_unscaled_color.r - (int)m_pParams->m_base_color5.r; + const int dg = (int)coords.m_unscaled_color.g - (int)m_pParams->m_base_color5.g; + const int db = (int)coords.m_unscaled_color.b - (int)m_pParams->m_base_color5.b; + + if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax)) + { + trial_solution.m_valid = false; + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Eval failed due to constraint from %u %u %u\n", m_pParams->m_base_color5.r, m_pParams->m_base_color5.g, m_pParams->m_base_color5.b); +#endif + return false; + } + } + + const color_rgba base_color(coords.get_scaled_color()); + + const uint32_t n = m_pParams->m_num_src_pixels; + assert(trial_solution.m_selectors.size() == n); + + trial_solution.m_error = UINT64_MAX; + + const bool perceptual = (m_pParams->m_quality == cETCQualityFast) ? false : m_pParams->m_perceptual; + + for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table) + { + const int* pInten_table = g_etc1_inten_tables[inten_table]; + + uint32_t block_inten[4]; + color_rgba block_colors[4]; + for (uint32_t s = 0; s < 4; s++) + { + const int yd = pInten_table[s]; + color_rgba block_color(base_color.r + yd, base_color.g + yd, base_color.b + yd, 255); + block_colors[s] = block_color; + block_inten[s] = block_color.r + block_color.g + block_color.b; + } + + // evaluate_solution_fast() enforces/assumes a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors. + // The inputs colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast. 
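As a standalone illustration of the classification the comment above describes (hypothetical helper, not upstream code): with the source luma values presorted and the four block intensities monotonically ordered, each pixel's selector is found by walking the three midpoints between adjacent block intensities; comparing 2*y against the sum of two intensities avoids a divide by two. The real code additionally maps each result back to the pixel's original position via the sorted-index table.

```cpp
// Illustrative sketch of selector classification along the intensity axis.
#include <cstdint>
#include <cstddef>

static void classify_sorted_luma(const uint16_t* pSorted_luma, size_t n,
                                 const uint32_t block_inten[4], uint8_t* pSelectors)
{
    // Midpoints between adjacent block intensities, kept doubled to stay in integers.
    const uint32_t mid[3] = {
        block_inten[0] + block_inten[1],
        block_inten[1] + block_inten[2],
        block_inten[2] + block_inten[3]
    };

    uint32_t cur = 0;
    for (size_t i = 0; i < n; i++)
    {
        // Luma is sorted ascending, so the selector index can only increase.
        while ((cur < 3) && (uint32_t(pSorted_luma[i]) * 2 >= mid[cur]))
            cur++;
        pSelectors[i] = uint8_t(cur);
    }
}
```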
+ // 0 1 2 3 + // 01 12 23 + const uint32_t block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] }; + + uint64_t total_error = 0; + const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels; + + if (perceptual) + { + if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0]) + { + if (block_inten[0] > m_pSorted_luma[n - 1]) + { + const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]); + if (min_error >= trial_solution.m_error) + continue; + } + + memset(&m_temp_selectors[0], 0, n); + + for (uint32_t c = 0; c < n; c++) + total_error += color_distance(true, block_colors[0], pSrc_pixels[c], false); + } + else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2]) + { + if (m_pSorted_luma[0] > block_inten[3]) + { + const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]); + if (min_error >= trial_solution.m_error) + continue; + } + + memset(&m_temp_selectors[0], 3, n); + + for (uint32_t c = 0; c < n; c++) + total_error += color_distance(true, block_colors[3], pSrc_pixels[c], false); + } + else + { + if (!g_cpu_supports_sse41) + { + uint32_t cur_selector = 0, c; + for (c = 0; c < n; c++) + { + const uint32_t y = m_pSorted_luma[c]; + while ((y * 2) >= block_inten_midpoints[cur_selector]) + if (++cur_selector > 2) + goto done; + const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; + m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector); + total_error += color_distance(true, block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false); + } + done: + while (c < n) + { + const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; + m_temp_selectors[sorted_pixel_index] = 3; + total_error += color_distance(true, block_colors[3], pSrc_pixels[sorted_pixel_index], false); + ++c; + } + } + else + { +#if BASISU_SUPPORT_SSE + uint32_t cur_selector = 0, c; + + for (c = 0; c < n; c++) + { + const uint32_t y = m_pSorted_luma[c]; + while ((y * 2) >= block_inten_midpoints[cur_selector]) + { + if (++cur_selector > 2) + goto done3; + } + const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; + m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector); + } + done3: + + while (c < n) + { + const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; + m_temp_selectors[sorted_pixel_index] = 3; + ++c; + } + + int64_t block_error; + perceptual_distance_rgb_4_N_sse41(&block_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, INT64_MAX); + total_error += block_error; +#endif + } + } + } + else + { + if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0]) + { + if (block_inten[0] > m_pSorted_luma[n - 1]) + { + const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]); + if (min_error >= trial_solution.m_error) + continue; + } + + memset(&m_temp_selectors[0], 0, n); + + for (uint32_t c = 0; c < n; c++) + total_error += color_distance(block_colors[0], pSrc_pixels[c], false); + } + else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2]) + { + if (m_pSorted_luma[0] > block_inten[3]) + { + const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]); + if (min_error >= trial_solution.m_error) + continue; + } + + memset(&m_temp_selectors[0], 3, n); + + for (uint32_t c = 0; c < n; c++) + total_error += color_distance(block_colors[3], pSrc_pixels[c], false); + } + else + { + uint32_t cur_selector = 0, c; + for (c = 0; c < n; c++) + { + const uint32_t y = m_pSorted_luma[c]; + while ((y * 
2) >= block_inten_midpoints[cur_selector]) + if (++cur_selector > 2) + goto done2; + const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; + m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector); + total_error += color_distance(block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false); + } + done2: + while (c < n) + { + const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c]; + m_temp_selectors[sorted_pixel_index] = 3; + total_error += color_distance(block_colors[3], pSrc_pixels[sorted_pixel_index], false); + ++c; + } + } + } + + if (total_error < trial_solution.m_error) + { + trial_solution.m_error = total_error; + trial_solution.m_coords.m_inten_table = inten_table; + trial_solution.m_selectors.swap(m_temp_selectors); + trial_solution.m_valid = true; + if (!total_error) + break; + } + } + trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color; + trial_solution.m_coords.m_color4 = m_pParams->m_use_color4; + +#if BASISU_DEBUG_ETC_ENCODER_DEEPER + printf("Eval done: %u error: %I64u best error so far: %I64u\n", (trial_solution.m_error < pBest_solution->m_error), trial_solution.m_error, pBest_solution->m_error); +#endif + + bool success = false; + if (pBest_solution) + { + if (trial_solution.m_error < pBest_solution->m_error) + { + *pBest_solution = trial_solution; + success = true; + } + } + + return success; + } + + uint64_t pack_eac_a8(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask) + { + results.m_selectors.resize(num_pixels); + results.m_selectors_temp.resize(num_pixels); + + uint32_t min_alpha = 255, max_alpha = 0; + for (uint32_t i = 0; i < num_pixels; i++) + { + const uint32_t a = pPixels[i]; + if (a < min_alpha) min_alpha = a; + if (a > max_alpha) max_alpha = a; + } + + if (min_alpha == max_alpha) + { + results.m_base = min_alpha; + results.m_table = 13; + results.m_multiplier = 1; + for (uint32_t i = 0; i < num_pixels; i++) + results.m_selectors[i] = 4; + return 0; + } + + const uint32_t alpha_range = max_alpha - min_alpha; + + uint64_t best_err = UINT64_MAX; + + for (uint32_t table = 0; table < 16; table++) + { + if ((table_mask & (1U << table)) == 0) + continue; + + const float range = (float)(g_etc2_eac_tables[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_etc2_eac_tables[table][ETC2_EAC_MIN_VALUE_SELECTOR]); + const int center = (int)roundf(lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_etc2_eac_tables[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range)); + + const int base_min = clamp255(center - base_search_rad); + const int base_max = clamp255(center + base_search_rad); + + const int mul = (int)roundf(alpha_range / range); + const int mul_low = clamp<int>(mul - mul_search_rad, 1, 15); + const int mul_high = clamp<int>(mul + mul_search_rad, 1, 15); + + for (int base = base_min; base <= base_max; base++) + { + for (int multiplier = mul_low; multiplier <= mul_high; multiplier++) + { + uint64_t total_err = 0; + + for (uint32_t i = 0; i < num_pixels; i++) + { + const int a = pPixels[i]; + + uint32_t best_s_err = UINT32_MAX; + uint32_t best_s = 0; + for (uint32_t s = 0; s < 8; s++) + { + const int v = clamp255((int)multiplier * g_etc2_eac_tables[table][s] + (int)base); + + uint32_t err = iabs(a - v); + if (err < best_s_err) + { + best_s_err = err; + best_s = s; + } + } + + results.m_selectors_temp[i] = static_cast<uint8_t>(best_s); + + total_err += best_s_err * best_s_err; + if (total_err >= best_err) + break; + } + + if (total_err < 
best_err) + { + best_err = total_err; + results.m_base = base; + results.m_multiplier = multiplier; + results.m_table = table; + results.m_selectors.swap(results.m_selectors_temp); + if (!best_err) + return best_err; + } + + } // table + + } // multiplier + + } // base + + return best_err; + } + + void pack_eac_a8(eac_a8_block* pBlock, const uint8_t* pPixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask) + { + pack_eac_a8_results results; + pack_eac_a8(results, pPixels, 16, base_search_rad, mul_search_rad, table_mask); + + pBlock->m_base = results.m_base; + pBlock->m_multiplier = results.m_multiplier; + pBlock->m_table = results.m_table; + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + pBlock->set_selector(x, y, results.m_selectors[x + y * 4]); + } + +} // namespace basisu diff --git a/thirdparty/basis_universal/basisu_etc.h b/thirdparty/basis_universal/encoder/basisu_etc.h index a202d01f6e..1e3ece43b8 100644 --- a/thirdparty/basis_universal/basisu_etc.h +++ b/thirdparty/basis_universal/encoder/basisu_etc.h @@ -1,5 +1,5 @@ // basis_etc.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" #include "basisu_enc.h" -#include <set> namespace basisu { @@ -116,7 +115,7 @@ namespace basisu { assert((ofs + num) <= 64U); assert(num && (num < 32U)); - return (read_be64(&m_uint64) >> ofs) & ((1UL << num) - 1UL); + return (uint32_t)(read_be64(&m_uint64) >> ofs) & ((1UL << num) - 1UL); } inline void set_general_bits(uint32_t ofs, uint32_t num, uint32_t bits) @@ -266,6 +265,27 @@ namespace basisu p[-2] |= (msb << byte_bit_ofs); } + // Selector "etc1_val" ranges from 0-3 and is a direct (raw) ETC1 selector. 
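For reference, the alpha value an EAC A8 decoder reconstructs from the parameters chosen by pack_eac_a8() above is simply base + multiplier * table[selector], clamped to 8 bits; the search brute-forces base, multiplier and table around a seed derived from the block's min/max alpha. A minimal sketch of that reconstruction (illustrative only, not upstream code):

```cpp
// Illustrative EAC A8 reconstruction, given one row of the signed 16x8 modifier table.
#include <cstdint>

static inline int eac_a8_decode(int base, int multiplier, const int8_t* pTable_row, int selector)
{
    const int v = base + multiplier * pTable_row[selector];
    return (v < 0) ? 0 : ((v > 255) ? 255 : v); // clamp to [0, 255]
}
```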
+ inline void set_raw_selector(uint32_t x, uint32_t y, uint32_t etc1_val) + { + assert((x | y | etc1_val) < 4); + const uint32_t bit_index = x * 4 + y; + + uint8_t* p = &m_bytes[7 - (bit_index >> 3)]; + + const uint32_t byte_bit_ofs = bit_index & 7; + const uint32_t mask = 1 << byte_bit_ofs; + + const uint32_t lsb = etc1_val & 1; + const uint32_t msb = etc1_val >> 1; + + p[0] &= ~mask; + p[0] |= (lsb << byte_bit_ofs); + + p[-2] &= ~mask; + p[-2] |= (msb << byte_bit_ofs); + } + inline uint32_t get_raw_selector_bits() const { return m_bytes[4] | (m_bytes[5] << 8) | (m_bytes[6] << 16) | (m_bytes[7] << 24); @@ -622,6 +642,23 @@ namespace basisu return true; } + bool set_block_color5_clamp(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled) + { + set_diff_bit(true); + set_base5_color(pack_color5(c0_unscaled, false)); + + int dr = c1_unscaled.r - c0_unscaled.r; + int dg = c1_unscaled.g - c0_unscaled.g; + int db = c1_unscaled.b - c0_unscaled.b; + + dr = clamp<int>(dr, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + dg = clamp<int>(dg, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + db = clamp<int>(db, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + + set_delta3_color(pack_delta3(dr, dg, db)); + + return true; + } color_rgba get_selector_color(uint32_t x, uint32_t y, uint32_t s) const { color_rgba block_colors[4]; @@ -720,7 +757,7 @@ namespace basisu } }; - typedef std::vector<etc_block> etc_block_vec; + typedef basisu::vector<etc_block> etc_block_vec; // Returns false if the unpack fails (could be bogus data or ETC2) bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha = false); @@ -844,10 +881,10 @@ namespace basisu bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3); } const int* pInten_table = g_etc1_inten_tables[m_inten_table]; - pBlock_colors[0].set((uint8_t)(br + pInten_table[0]), (uint8_t)(bg + pInten_table[0]), (uint8_t)(bb + pInten_table[0]), 255); - pBlock_colors[1].set((uint8_t)(br + pInten_table[1]), (uint8_t)(bg + pInten_table[1]), (uint8_t)(bb + pInten_table[1]), 255); - pBlock_colors[2].set((uint8_t)(br + pInten_table[2]), (uint8_t)(bg + pInten_table[2]), (uint8_t)(bb + pInten_table[2]), 255); - pBlock_colors[3].set((uint8_t)(br + pInten_table[3]), (uint8_t)(bg + pInten_table[3]), (uint8_t)(bb + pInten_table[3]), 255); + pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0], 255); + pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1], 255); + pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2], 255); + pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3], 255); } color_rgba m_unscaled_color; @@ -914,9 +951,6 @@ namespace basisu m_refinement = true; m_pForce_selectors = nullptr; - - m_pEval_solution_override = nullptr; - m_pEval_solution_override_data = nullptr; } uint32_t m_num_src_pixels; @@ -932,9 +966,6 @@ namespace basisu bool m_refinement; const uint8_t* m_pForce_selectors; - - evaluate_solution_override_func m_pEval_solution_override; - void *m_pEval_solution_override_data; }; struct results @@ -970,7 +1001,7 @@ namespace basisu } etc1_solution_coordinates m_coords; - std::vector<uint8_t> m_selectors; + basisu::vector<uint8_t> m_selectors; uint64_t m_error; bool m_valid; @@ -1001,33 +1032,36 @@ namespace basisu vec3F m_avg_color; int m_br, m_bg, m_bb; - std::vector<uint16_t> m_luma; - std::vector<uint32_t> m_sorted_luma; - std::vector<uint32_t> m_sorted_luma_indices; + int m_max_comp_spread; + basisu::vector<uint16_t> 
m_luma; + basisu::vector<uint32_t> m_sorted_luma; + basisu::vector<uint32_t> m_sorted_luma_indices; const uint32_t* m_pSorted_luma_indices; uint32_t* m_pSorted_luma; - std::vector<uint8_t> m_selectors; - std::vector<uint8_t> m_best_selectors; + basisu::vector<uint8_t> m_selectors; + basisu::vector<uint8_t> m_best_selectors; potential_solution m_best_solution; potential_solution m_trial_solution; - std::vector<uint8_t> m_temp_selectors; - - std::set<uint32_t> m_solutions_tried; + basisu::vector<uint8_t> m_temp_selectors; + enum { cSolutionsTriedHashBits = 10, cTotalSolutionsTriedHashSize = 1 << cSolutionsTriedHashBits, cSolutionsTriedHashMask = cTotalSolutionsTriedHashSize - 1 }; + uint8_t m_solutions_tried[cTotalSolutionsTriedHashSize / 8]; + void get_nearby_inten_tables(uint32_t idx, int &first_inten_table, int &last_inten_table) { first_inten_table = maximum<int>(idx - 1, 0); last_inten_table = minimum<int>(cETC1IntenModifierValues, idx + 1); } - + + bool check_for_redundant_solution(const etc1_solution_coordinates& coords); bool evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution); bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution); inline bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution) { - if (m_pParams->m_quality >= cETCQualitySlow) + if (m_pParams->m_quality >= cETCQualityMedium) return evaluate_solution_slow(coords, trial_solution, pBest_solution); else return evaluate_solution_fast(coords, trial_solution, pBest_solution); @@ -1042,5 +1076,77 @@ namespace basisu { etc1_optimizer m_optimizer; }; + + void pack_etc1_solid_color_init(); + uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor); + + // ETC EAC + extern const int8_t g_etc2_eac_tables[16][8]; + extern const int8_t g_etc2_eac_tables8[16][8]; + + const uint32_t ETC2_EAC_MIN_VALUE_SELECTOR = 3, ETC2_EAC_MAX_VALUE_SELECTOR = 7; + + struct eac_a8_block + { + uint16_t m_base : 8; + uint16_t m_table : 4; + uint16_t m_multiplier : 4; + + uint8_t m_selectors[6]; + + inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const + { + assert((x < 4) && (y < 4)); + return static_cast<uint32_t>((selector_bits >> (45 - (y + x * 4) * 3)) & 7); + } + + inline uint64_t get_selector_bits() const + { + uint64_t pixels = ((uint64_t)m_selectors[0] << 40) | ((uint64_t)m_selectors[1] << 32) | ((uint64_t)m_selectors[2] << 24) | ((uint64_t)m_selectors[3] << 16) | ((uint64_t)m_selectors[4] << 8) | m_selectors[5]; + return pixels; + } + + inline void set_selector_bits(uint64_t pixels) + { + m_selectors[0] = (uint8_t)(pixels >> 40); + m_selectors[1] = (uint8_t)(pixels >> 32); + m_selectors[2] = (uint8_t)(pixels >> 24); + m_selectors[3] = (uint8_t)(pixels >> 16); + m_selectors[4] = (uint8_t)(pixels >> 8); + m_selectors[5] = (uint8_t)(pixels); + } + + void set_selector(uint32_t x, uint32_t y, uint32_t s) + { + assert((x < 4) && (y < 4) && (s < 8)); + + const uint32_t ofs = 45 - (y + x * 4) * 3; + + uint64_t pixels = get_selector_bits(); + + pixels &= ~(7ULL << ofs); + pixels |= (static_cast<uint64_t>(s) << ofs); + + set_selector_bits(pixels); + } + }; + + struct etc2_rgba_block + { + eac_a8_block m_alpha; + etc_block m_rgb; + }; + struct pack_eac_a8_results + { + uint32_t m_base; + uint32_t m_table; + uint32_t m_multiplier; + uint8_vec m_selectors; + 
uint8_vec m_selectors_temp; + }; + + uint64_t pack_eac_a8(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask = UINT32_MAX); + void pack_eac_a8(eac_a8_block* pBlock, const uint8_t* pPixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask = UINT32_MAX); + } // namespace basisu diff --git a/thirdparty/basis_universal/basisu_frontend.cpp b/thirdparty/basis_universal/encoder/basisu_frontend.cpp index 6f7a9bf889..324fc8e447 100644 --- a/thirdparty/basis_universal/basisu_frontend.cpp +++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp @@ -1,5 +1,5 @@ // basisu_frontend.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,11 +17,16 @@ // This code originally supported full ETC1 and ETC1S, so there's some legacy stuff to be cleaned up in here. // Add endpoint tiling support (where we force adjacent blocks to use the same endpoints during quantization), for a ~10% or more increase in bitrate at same SSIM. The backend already supports this. // -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" #include "basisu_frontend.h" #include <unordered_set> #include <unordered_map> +#if BASISU_SUPPORT_SSE +#define CPPSPMD_NAME(a) a##_sse41 +#include "basisu_kernels_declares.h" +#endif + #define BASISU_FRONTEND_VERIFY(c) do { if (!(c)) handle_verify_failure(__LINE__); } while(0) namespace basisu @@ -29,10 +34,11 @@ namespace basisu const uint32_t cMaxCodebookCreationThreads = 8; const uint32_t BASISU_MAX_ENDPOINT_REFINEMENT_STEPS = 3; - const uint32_t BASISU_MAX_SELECTOR_REFINEMENT_STEPS = 3; + //const uint32_t BASISU_MAX_SELECTOR_REFINEMENT_STEPS = 3; const uint32_t BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE = 16; - const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE = 16; + const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 = 32; + const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT = 16; // TODO - How to handle internal verifies in the basisu lib static inline void handle_verify_failure(int line) @@ -57,14 +63,14 @@ namespace basisu uint32_t tv = size / sizeof(vec6F_quantizer::training_vec_with_weight); - std::vector<vec6F_quantizer::training_vec_with_weight> v(tv); + basisu::vector<vec6F_quantizer::training_vec_with_weight> v(tv); fread(&v[0], 1, sizeof(v[0]) * tv, pFile); for (uint32_t i = 0; i < tv; i++) m_endpoint_clusterizer.add_training_vec(v[i].first, v[i].second); m_endpoint_clusterizer.generate(16128); - std::vector<uint_vec> codebook; + basisu::vector<uint_vec> codebook; m_endpoint_clusterizer.retrieve(codebook); printf("Generated %u entries\n", (uint32_t)codebook.size()); @@ -78,6 +84,7 @@ namespace basisu { if (!p.m_pGlobal_sel_codebook) { + debug_printf("basisu_frontend::init: No global sel codebook!\n"); assert(0); return false; } @@ -128,11 +135,19 @@ namespace basisu case 2: { m_endpoint_refinement = true; + m_use_hierarchical_endpoint_codebooks = true; + m_use_hierarchical_selector_codebooks = true; + + break; + } + case 3: + { + m_endpoint_refinement = true; m_use_hierarchical_endpoint_codebooks = false; m_use_hierarchical_selector_codebooks = false; break; } - case 3: + case 4: { m_endpoint_refinement = true; m_use_hierarchical_endpoint_codebooks = true; @@ -141,7 +156,7 @@ namespace basisu 
m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS; break; } - case 4: + case 5: { m_endpoint_refinement = true; m_use_hierarchical_endpoint_codebooks = false; @@ -150,7 +165,8 @@ namespace basisu m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS; break; } - case 5: + case 6: + default: { m_endpoint_refinement = true; m_use_hierarchical_endpoint_codebooks = false; @@ -180,106 +196,113 @@ namespace basisu init_etc1_images(); - init_endpoint_training_vectors(); - - generate_endpoint_clusters(); - - for (uint32_t refine_endpoint_step = 0; refine_endpoint_step < m_num_endpoint_codebook_iterations; refine_endpoint_step++) + if (m_params.m_pGlobal_codebooks) { - BASISU_FRONTEND_VERIFY(check_etc1s_constraints()); - - if (refine_endpoint_step) - { - introduce_new_endpoint_clusters(); - } - - generate_endpoint_codebook(refine_endpoint_step); - - if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization)) - { - char buf[256]; - snprintf(buf, sizeof(buf), "endpoint_cluster_vis_pre_%u.png", refine_endpoint_step); - dump_endpoint_clusterization_visualization(buf, false); - } - - bool early_out = false; + init_global_codebooks(); + } + else + { + init_endpoint_training_vectors(); - if (m_endpoint_refinement) + generate_endpoint_clusters(); + + for (uint32_t refine_endpoint_step = 0; refine_endpoint_step < m_num_endpoint_codebook_iterations; refine_endpoint_step++) { - //dump_endpoint_clusterization_visualization("endpoint_clusters_before_refinement.png"); - - if (!refine_endpoint_clusterization()) - early_out = true; + BASISU_FRONTEND_VERIFY(check_etc1s_constraints()); - if ((m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) && (!refine_endpoint_step) && (m_num_endpoint_codebook_iterations == 1)) + if (refine_endpoint_step) { - eliminate_redundant_or_empty_endpoint_clusters(); - generate_endpoint_codebook(refine_endpoint_step); + introduce_new_endpoint_clusters(); } + generate_endpoint_codebook(refine_endpoint_step); + if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization)) { char buf[256]; - snprintf(buf, sizeof(buf), "endpoint_cluster_vis_post_%u.png", refine_endpoint_step); - + snprintf(buf, sizeof(buf), "endpoint_cluster_vis_pre_%u.png", refine_endpoint_step); dump_endpoint_clusterization_visualization(buf, false); - snprintf(buf, sizeof(buf), "endpoint_cluster_colors_vis_post_%u.png", refine_endpoint_step); + } + + bool early_out = false; - dump_endpoint_clusterization_visualization(buf, true); + if (m_endpoint_refinement) + { + //dump_endpoint_clusterization_visualization("endpoint_clusters_before_refinement.png"); + + if (!refine_endpoint_clusterization()) + early_out = true; + + if ((m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) && (!refine_endpoint_step) && (m_num_endpoint_codebook_iterations == 1)) + { + eliminate_redundant_or_empty_endpoint_clusters(); + generate_endpoint_codebook(refine_endpoint_step); + } + + if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization)) + { + char buf[256]; + snprintf(buf, sizeof(buf), "endpoint_cluster_vis_post_%u.png", refine_endpoint_step); + + dump_endpoint_clusterization_visualization(buf, false); + snprintf(buf, sizeof(buf), "endpoint_cluster_colors_vis_post_%u.png", refine_endpoint_step); + + dump_endpoint_clusterization_visualization(buf, true); + } } - } - eliminate_redundant_or_empty_endpoint_clusters(); + eliminate_redundant_or_empty_endpoint_clusters(); - if (m_params.m_debug_stats) - debug_printf("Total endpoint clusters: %u\n", 
(uint32_t)m_endpoint_clusters.size()); + if (m_params.m_debug_stats) + debug_printf("Total endpoint clusters: %u\n", (uint32_t)m_endpoint_clusters.size()); - if (early_out) - break; - } + if (early_out) + break; + } - BASISU_FRONTEND_VERIFY(check_etc1s_constraints()); + BASISU_FRONTEND_VERIFY(check_etc1s_constraints()); - generate_block_endpoint_clusters(); + generate_block_endpoint_clusters(); - create_initial_packed_texture(); + create_initial_packed_texture(); - generate_selector_clusters(); + generate_selector_clusters(); - if (m_use_hierarchical_selector_codebooks) - compute_selector_clusters_within_each_parent_cluster(); + if (m_use_hierarchical_selector_codebooks) + compute_selector_clusters_within_each_parent_cluster(); - if (m_params.m_compression_level == 0) - { - create_optimized_selector_codebook(0); + if (m_params.m_compression_level == 0) + { + create_optimized_selector_codebook(0); - find_optimal_selector_clusters_for_each_block(); + find_optimal_selector_clusters_for_each_block(); - introduce_special_selector_clusters(); - } - else - { - const uint32_t num_refine_selector_steps = m_params.m_pGlobal_sel_codebook ? 1 : m_num_selector_codebook_iterations; - for (uint32_t refine_selector_steps = 0; refine_selector_steps < num_refine_selector_steps; refine_selector_steps++) + introduce_special_selector_clusters(); + } + else { - create_optimized_selector_codebook(refine_selector_steps); + const uint32_t num_refine_selector_steps = m_params.m_pGlobal_sel_codebook ? 1 : m_num_selector_codebook_iterations; + for (uint32_t refine_selector_steps = 0; refine_selector_steps < num_refine_selector_steps; refine_selector_steps++) + { + create_optimized_selector_codebook(refine_selector_steps); - find_optimal_selector_clusters_for_each_block(); + find_optimal_selector_clusters_for_each_block(); - introduce_special_selector_clusters(); + introduce_special_selector_clusters(); - if ((m_params.m_compression_level >= 3) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)) - { - if (!refine_block_endpoints_given_selectors()) - break; + if ((m_params.m_compression_level >= 4) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)) + { + if (!refine_block_endpoints_given_selectors()) + break; + } } } - } - - optimize_selector_codebook(); + + optimize_selector_codebook(); - if (m_params.m_debug_stats) - debug_printf("Total selector clusters: %u\n", (uint32_t)m_selector_cluster_indices.size()); + if (m_params.m_debug_stats) + debug_printf("Total selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size()); + } finalize(); @@ -294,6 +317,259 @@ namespace basisu return true; } + bool basisu_frontend::init_global_codebooks() + { + const basist::basisu_lowlevel_etc1s_transcoder* pTranscoder = m_params.m_pGlobal_codebooks; + + const basist::basisu_lowlevel_etc1s_transcoder::endpoint_vec& endpoints = pTranscoder->get_endpoints(); + const basist::basisu_lowlevel_etc1s_transcoder::selector_vec& selectors = pTranscoder->get_selectors(); + + m_endpoint_cluster_etc_params.resize(endpoints.size()); + for (uint32_t i = 0; i < endpoints.size(); i++) + { + m_endpoint_cluster_etc_params[i].m_inten_table[0] = endpoints[i].m_inten5; + m_endpoint_cluster_etc_params[i].m_inten_table[1] = endpoints[i].m_inten5; + + m_endpoint_cluster_etc_params[i].m_color_unscaled[0].set(endpoints[i].m_color5.r, endpoints[i].m_color5.g, endpoints[i].m_color5.b, 255); + m_endpoint_cluster_etc_params[i].m_color_used[0] = true; + m_endpoint_cluster_etc_params[i].m_valid = true; + } + + 
m_optimized_cluster_selectors.resize(selectors.size()); + for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++) + { + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + m_optimized_cluster_selectors[i].set_selector(x, y, selectors[i].get_selector(x, y)); + } + + m_block_endpoint_clusters_indices.resize(m_total_blocks); + + m_orig_encoded_blocks.resize(m_total_blocks); + + m_block_selector_cluster_index.resize(m_total_blocks); + +#if 0 + for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N) + { + const uint32_t first_index = block_index_iter; + const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index] { +#endif + + for (uint32_t block_index = first_index; block_index < last_index; block_index++) + { + const etc_block& blk = m_etc1_blocks_etc1s[block_index]; + + const uint32_t block_endpoint_index = m_block_endpoint_clusters_indices[block_index][0]; + + etc_block trial_blk; + trial_blk.set_block_color5_etc1s(blk.m_color_unscaled[0]); + trial_blk.set_flip_bit(true); + + uint64_t best_err = UINT64_MAX; + uint32_t best_index = 0; + + for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++) + { + trial_blk.set_raw_selector_bits(m_optimized_cluster_selectors[i].get_raw_selector_bits()); + + const uint64_t cur_err = trial_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual); + if (cur_err < best_err) + { + best_err = cur_err; + best_index = i; + if (!cur_err) + break; + } + + } // block_index + + m_block_selector_cluster_index[block_index] = best_index; + } + +#ifndef __EMSCRIPTEN__ + }); +#endif + + } + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->wait_for_all(); +#endif + + m_encoded_blocks.resize(m_total_blocks); + for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) + { + const uint32_t endpoint_index = m_block_endpoint_clusters_indices[block_index][0]; + const uint32_t selector_index = m_block_selector_cluster_index[block_index]; + + etc_block& blk = m_encoded_blocks[block_index]; + + blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_color_unscaled[0]); + blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_inten_table[0]); + blk.set_flip_bit(true); + blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_index].get_raw_selector_bits()); + } +#endif + + // HACK HACK + const uint32_t NUM_PASSES = 3; + for (uint32_t pass = 0; pass < NUM_PASSES; pass++) + { + debug_printf("init_global_codebooks: pass %u\n", pass); + + const uint32_t N = 128; + for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N) + { + const uint32_t first_index = block_index_iter; + const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index, pass] { +#endif + + for (uint32_t block_index = first_index; block_index < last_index; block_index++) + { + const etc_block& blk = pass ? 
m_encoded_blocks[block_index] : m_etc1_blocks_etc1s[block_index]; + const uint32_t blk_raw_selector_bits = blk.get_raw_selector_bits(); + + etc_block trial_blk(blk); + trial_blk.set_raw_selector_bits(blk_raw_selector_bits); + trial_blk.set_flip_bit(true); + + uint64_t best_err = UINT64_MAX; + uint32_t best_index = 0; + etc_block best_block(trial_blk); + + for (uint32_t i = 0; i < m_endpoint_cluster_etc_params.size(); i++) + { + if (m_endpoint_cluster_etc_params[i].m_inten_table[0] > blk.get_inten_table(0)) + continue; + + trial_blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[i].m_color_unscaled[0]); + trial_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[i].m_inten_table[0]); + + const color_rgba* pSource_pixels = get_source_pixel_block(block_index).get_ptr(); + uint64_t cur_err; + if (!pass) + cur_err = trial_blk.determine_selectors(pSource_pixels, m_params.m_perceptual); + else + cur_err = trial_blk.evaluate_etc1_error(pSource_pixels, m_params.m_perceptual); + + if (cur_err < best_err) + { + best_err = cur_err; + best_index = i; + best_block = trial_blk; + + if (!cur_err) + break; + } + } + + m_block_endpoint_clusters_indices[block_index][0] = best_index; + m_block_endpoint_clusters_indices[block_index][1] = best_index; + + m_orig_encoded_blocks[block_index] = best_block; + + } // block_index + +#ifndef __EMSCRIPTEN__ + }); +#endif + + } + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->wait_for_all(); +#endif + + m_endpoint_clusters.resize(0); + m_endpoint_clusters.resize(endpoints.size()); + for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) + { + const uint32_t endpoint_cluster_index = m_block_endpoint_clusters_indices[block_index][0]; + m_endpoint_clusters[endpoint_cluster_index].push_back(block_index * 2); + m_endpoint_clusters[endpoint_cluster_index].push_back(block_index * 2 + 1); + } + + m_block_selector_cluster_index.resize(m_total_blocks); + + for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N) + { + const uint32_t first_index = block_index_iter; + const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index] { +#endif + + for (uint32_t block_index = first_index; block_index < last_index; block_index++) + { + const uint32_t block_endpoint_index = m_block_endpoint_clusters_indices[block_index][0]; + + etc_block trial_blk; + trial_blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[block_endpoint_index].m_color_unscaled[0]); + trial_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[block_endpoint_index].m_inten_table[0]); + trial_blk.set_flip_bit(true); + + uint64_t best_err = UINT64_MAX; + uint32_t best_index = 0; + + for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++) + { + trial_blk.set_raw_selector_bits(m_optimized_cluster_selectors[i].get_raw_selector_bits()); + + const uint64_t cur_err = trial_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual); + if (cur_err < best_err) + { + best_err = cur_err; + best_index = i; + if (!cur_err) + break; + } + + } // block_index + + m_block_selector_cluster_index[block_index] = best_index; + } + +#ifndef __EMSCRIPTEN__ + }); +#endif + + } + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->wait_for_all(); +#endif + + m_encoded_blocks.resize(m_total_blocks); + for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) + { + const uint32_t endpoint_index = 
m_block_endpoint_clusters_indices[block_index][0]; + const uint32_t selector_index = m_block_selector_cluster_index[block_index]; + + etc_block& blk = m_encoded_blocks[block_index]; + + blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_color_unscaled[0]); + blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_inten_table[0]); + blk.set_flip_bit(true); + blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_index].get_raw_selector_bits()); + } + + } // pass + + m_selector_cluster_block_indices.resize(selectors.size()); + for (uint32_t block_index = 0; block_index < m_etc1_blocks_etc1s.size(); block_index++) + m_selector_cluster_block_indices[m_block_selector_cluster_index[block_index]].push_back(block_index); + + return true; + } + void basisu_frontend::introduce_special_selector_clusters() { debug_printf("introduce_special_selector_clusters\n"); @@ -302,7 +578,7 @@ namespace basisu return; uint32_t total_blocks_relocated = 0; - const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_indices.size(); + const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size(); bool_vec block_relocated_flags(m_total_blocks); @@ -328,7 +604,7 @@ namespace basisu m_optimized_cluster_selectors.push_back(blk); - vector_ensure_element_is_valid(m_selector_cluster_indices, new_selector_cluster_index); + vector_ensure_element_is_valid(m_selector_cluster_block_indices, new_selector_cluster_index); for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) { @@ -357,14 +633,14 @@ namespace basisu // Change the block to use the new cluster m_block_selector_cluster_index[block_index] = new_selector_cluster_index; - m_selector_cluster_indices[new_selector_cluster_index].push_back(block_index); + m_selector_cluster_block_indices[new_selector_cluster_index].push_back(block_index); block_relocated_flags[block_index] = true; #if 0 - int j = vector_find(m_selector_cluster_indices[old_selector_cluster_index], block_index); + int j = vector_find(m_selector_cluster_block_indices[old_selector_cluster_index], block_index); if (j >= 0) - m_selector_cluster_indices[old_selector_cluster_index].erase(m_selector_cluster_indices[old_selector_cluster_index].begin() + j); + m_selector_cluster_block_indices[old_selector_cluster_index].erase(m_selector_cluster_block_indices[old_selector_cluster_index].begin() + j); #endif total_blocks_relocated++; @@ -381,7 +657,7 @@ namespace basisu for (int selector_cluster_index = 0; selector_cluster_index < (int)initial_selector_clusters; selector_cluster_index++) { - uint_vec& block_indices = m_selector_cluster_indices[selector_cluster_index]; + uint_vec& block_indices = m_selector_cluster_block_indices[selector_cluster_index]; uint32_t dst_ofs = 0; @@ -399,6 +675,7 @@ namespace basisu debug_printf("Total blocks relocated to new flat selector clusters: %u\n", total_blocks_relocated); } + // This method will change the number and ordering of the selector codebook clusters. 
void basisu_frontend::optimize_selector_codebook() { debug_printf("optimize_selector_codebook\n"); @@ -436,15 +713,17 @@ namespace basisu new_to_old.push_back(i); } + debug_printf("Original selector clusters: %u, new cluster selectors: %u\n", orig_total_selector_clusters, total_new_entries); + for (uint32_t i = 0; i < m_block_selector_cluster_index.size(); i++) { BASISU_FRONTEND_VERIFY((old_to_new[m_block_selector_cluster_index[i]] >= 0) && (old_to_new[m_block_selector_cluster_index[i]] < (int)total_new_entries)); m_block_selector_cluster_index[i] = old_to_new[m_block_selector_cluster_index[i]]; } - std::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? total_new_entries : 0); + basisu::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? total_new_entries : 0); basist::etc1_global_selector_codebook_entry_id_vec new_optimized_cluster_selector_global_cb_ids(m_optimized_cluster_selector_global_cb_ids.size() ? total_new_entries : 0); - std::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_indices.size() ? total_new_entries : 0); + basisu::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_block_indices.size() ? total_new_entries : 0); bool_vec new_selector_cluster_uses_global_cb(m_selector_cluster_uses_global_cb.size() ? total_new_entries : 0); for (uint32_t i = 0; i < total_new_entries; i++) @@ -455,24 +734,40 @@ namespace basisu if (m_optimized_cluster_selector_global_cb_ids.size()) new_optimized_cluster_selector_global_cb_ids[i] = m_optimized_cluster_selector_global_cb_ids[new_to_old[i]]; - if (m_selector_cluster_indices.size()) - new_selector_cluster_indices[i] = m_selector_cluster_indices[new_to_old[i]]; + //if (m_selector_cluster_block_indices.size()) + // new_selector_cluster_indices[i] = m_selector_cluster_block_indices[new_to_old[i]]; if (m_selector_cluster_uses_global_cb.size()) new_selector_cluster_uses_global_cb[i] = m_selector_cluster_uses_global_cb[new_to_old[i]]; } + for (uint32_t i = 0; i < m_block_selector_cluster_index.size(); i++) + { + new_selector_cluster_indices[m_block_selector_cluster_index[i]].push_back(i); + } + m_optimized_cluster_selectors.swap(new_optimized_cluster_selectors); m_optimized_cluster_selector_global_cb_ids.swap(new_optimized_cluster_selector_global_cb_ids); - m_selector_cluster_indices.swap(new_selector_cluster_indices); + m_selector_cluster_block_indices.swap(new_selector_cluster_indices); m_selector_cluster_uses_global_cb.swap(new_selector_cluster_uses_global_cb); - + + // This isn't strictly necessary - doing it for completeness/future sanity. 
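The remap just below pushes the same old→new mapping through `m_selector_clusters_within_each_parent_cluster`. The general pattern in `optimize_selector_codebook()` above is: build `old_to_new`/`new_to_old` tables, rewrite every per-block cluster index through them, and rebuild the per-cluster block lists directly from the remapped per-block indices instead of copying the old lists. A minimal sketch of that remap, using hypothetical flat containers rather than the frontend's actual members:

```
// Sketch of the old->new codebook remap used by optimize_selector_codebook()
// (hypothetical inputs: per-block cluster indices plus an old_to_new table).
#include <cassert>
#include <cstdint>
#include <vector>

struct remapped_codebook
{
    std::vector<uint32_t> block_cluster_index;          // per block, new cluster index
    std::vector<std::vector<uint32_t>> cluster_blocks;  // per new cluster, block indices
};

static remapped_codebook remap_selector_codebook(
    const std::vector<uint32_t>& old_block_cluster_index, // per block, old cluster index
    const std::vector<int>& old_to_new,                   // -1 would mean the old cluster was dropped
    uint32_t total_new_clusters)
{
    remapped_codebook out;
    out.block_cluster_index.resize(old_block_cluster_index.size());
    out.cluster_blocks.resize(total_new_clusters);

    for (uint32_t block = 0; block < old_block_cluster_index.size(); block++)
    {
        const int new_index = old_to_new[old_block_cluster_index[block]];
        assert(new_index >= 0); // every block must land in a surviving cluster

        out.block_cluster_index[block] = (uint32_t)new_index;

        // Rebuild the per-cluster block lists from the per-block indices,
        // mirroring the loop in the diff above.
        out.cluster_blocks[new_index].push_back(block);
    }
    return out;
}
```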
+ if (m_selector_clusters_within_each_parent_cluster.size()) + { + for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++) + for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++) + m_selector_clusters_within_each_parent_cluster[i][j] = old_to_new[m_selector_clusters_within_each_parent_cluster[i][j]]; + } + debug_printf("optimize_selector_codebook: Before: %u After: %u\n", orig_total_selector_clusters, total_new_entries); } void basisu_frontend::init_etc1_images() { debug_printf("basisu_frontend::init_etc1_images\n"); + + interval_timer tm; + tm.start(); m_etc1_blocks_etc1s.resize(m_total_blocks); @@ -481,8 +776,10 @@ namespace basisu { const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index] { +#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -494,6 +791,8 @@ namespace basisu if (m_params.m_compression_level == 0) optimizer_params.m_quality = cETCQualityFast; + else if (m_params.m_compression_level == 1) + optimizer_params.m_quality = cETCQualityMedium; else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL) optimizer_params.m_quality = cETCQualityUber; @@ -506,8 +805,9 @@ namespace basisu optimizer_results.m_n = 16; optimizer.init(optimizer_params, optimizer_results); - optimizer.compute(); - + if (!optimizer.compute()) + BASISU_FRONTEND_VERIFY(false); + etc_block &blk = m_etc1_blocks_etc1s[block_index]; memset(&blk, 0, sizeof(blk)); @@ -520,10 +820,17 @@ namespace basisu blk.set_selector(x, y, selectors[x + y * 4]); } +#ifndef __EMSCRIPTEN__ } ); +#endif + } - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif + + debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs()); } void basisu_frontend::init_endpoint_training_vectors() @@ -540,7 +847,9 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] { +#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -562,11 +871,15 @@ namespace basisu } // block_index; +#ifndef __EMSCRIPTEN__ } ); +#endif } // block_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif } void basisu_frontend::generate_endpoint_clusters() @@ -649,7 +962,7 @@ namespace basisu for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++) { - const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; + const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++) { @@ -707,17 +1020,19 @@ namespace basisu const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N); +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index] { +#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { - const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; + const basisu::vector<uint32_t>& cluster_indices = 
m_endpoint_clusters[cluster_index]; assert(cluster_indices.size()); for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++) { - std::vector<color_rgba> cluster_pixels(8); + basisu::vector<color_rgba> cluster_pixels(8); const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1; const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1; @@ -775,10 +1090,15 @@ namespace basisu } } // cluster_index +#ifndef __EMSCRIPTEN__ } ); +#endif + } // cluster_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif vector_sort(m_subblock_endpoint_quant_err_vec); } @@ -837,7 +1157,7 @@ namespace basisu continue; #endif - const uint32_t new_endpoint_cluster_index = (uint32_t)m_endpoint_clusters.size(); + //const uint32_t new_endpoint_cluster_index = (uint32_t)m_endpoint_clusters.size(); enlarge_vector(m_endpoint_clusters, 1)->push_back(training_vector_index); enlarge_vector(m_endpoint_cluster_etc_params, 1); @@ -893,18 +1213,20 @@ namespace basisu { const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N); - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, step ] { +#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { - const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; + const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; BASISU_FRONTEND_VERIFY(cluster_indices.size()); const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8; - std::vector<color_rgba> cluster_pixels(total_pixels); + basisu::vector<color_rgba> cluster_pixels(total_pixels); for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++) { @@ -935,20 +1257,21 @@ namespace basisu cluster_optimizer_params.m_use_color4 = false; cluster_optimizer_params.m_perceptual = m_params.m_perceptual; - if (m_params.m_compression_level == 0) + if (m_params.m_compression_level <= 1) cluster_optimizer_params.m_quality = cETCQualityMedium; else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL) cluster_optimizer_params.m_quality = cETCQualityUber; etc1_optimizer::results cluster_optimizer_results; - std::vector<uint8_t> cluster_selectors(total_pixels); + basisu::vector<uint8_t> cluster_selectors(total_pixels); cluster_optimizer_results.m_n = total_pixels; cluster_optimizer_results.m_pSelectors = &cluster_selectors[0]; optimizer.init(cluster_optimizer_params, cluster_optimizer_results); - optimizer.compute(); + if (!optimizer.compute()) + BASISU_FRONTEND_VERIFY(false); new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled; new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table; @@ -1012,20 +1335,24 @@ namespace basisu } // cluster_index +#ifndef __EMSCRIPTEN__ } ); +#endif } // cluster_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif } bool basisu_frontend::check_etc1s_constraints() const { - std::vector<vec2U> block_clusters(m_total_blocks); + basisu::vector<vec2U> block_clusters(m_total_blocks); for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++) { - const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; + const basisu::vector<uint32_t>& cluster_indices = 
m_endpoint_clusters[cluster_index]; for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++) { @@ -1053,11 +1380,11 @@ namespace basisu if (m_use_hierarchical_endpoint_codebooks) compute_endpoint_clusters_within_each_parent_cluster(); - std::vector<vec2U> block_clusters(m_total_blocks); + basisu::vector<vec2U> block_clusters(m_total_blocks); for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++) { - const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; + const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index]; for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++) { @@ -1081,19 +1408,19 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &best_cluster_indices, &block_clusters] { +#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { - const bool is_flipped = true; - const uint32_t cluster_index = block_clusters[block_index][0]; BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]); - const color_rgba *subblock_pixels = get_source_pixel_block(block_index).get_ptr(); + const color_rgba *pSubblock_pixels = get_source_pixel_block(block_index).get_ptr(); const uint32_t num_subblock_pixels = 16; - uint64_t best_cluster_err = UINT64_MAX; + uint64_t best_cluster_err = INT64_MAX; uint32_t best_cluster_index = 0; const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0; @@ -1116,19 +1443,20 @@ namespace basisu // Can't assign it here - may result in too much error when selector quant occurs if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0]) { - total_err = UINT64_MAX; + total_err = INT64_MAX; goto skip_cluster; } etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten); - + +#if 0 for (uint32_t p = 0; p < num_subblock_pixels; p++) { uint64_t best_err = UINT64_MAX; for (uint32_t r = low_selector; r <= high_selector; r++) { - uint64_t err = color_distance(m_params.m_perceptual, subblock_pixels[p], subblock_colors[r], false); + uint64_t err = color_distance(m_params.m_perceptual, pSubblock_pixels[p], subblock_colors[r], false); best_err = minimum(best_err, err); if (!best_err) break; @@ -1138,6 +1466,64 @@ namespace basisu if (total_err > best_cluster_err) break; } // p +#else + if (m_params.m_perceptual) + { + if (!g_cpu_supports_sse41) + { + for (uint32_t p = 0; p < num_subblock_pixels; p++) + { + uint64_t best_err = UINT64_MAX; + + for (uint32_t r = low_selector; r <= high_selector; r++) + { + uint64_t err = color_distance(true, pSubblock_pixels[p], subblock_colors[r], false); + best_err = minimum(best_err, err); + if (!best_err) + break; + } + + total_err += best_err; + if (total_err > best_cluster_err) + break; + } // p + } + else + { +#if BASISU_SUPPORT_SSE + find_lowest_error_perceptual_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err); +#endif + } + } + else + { + if (!g_cpu_supports_sse41) + { + for (uint32_t p = 0; p < num_subblock_pixels; p++) + { + uint64_t best_err = UINT64_MAX; + + for (uint32_t r = low_selector; r <= 
high_selector; r++) + { + uint64_t err = color_distance(false, pSubblock_pixels[p], subblock_colors[r], false); + best_err = minimum(best_err, err); + if (!best_err) + break; + } + + total_err += best_err; + if (total_err > best_cluster_err) + break; + } // p + } + else + { +#if BASISU_SUPPORT_SSE + find_lowest_error_linear_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err); +#endif + } + } +#endif skip_cluster: if ((total_err < best_cluster_err) || @@ -1154,14 +1540,18 @@ namespace basisu best_cluster_indices[block_index] = best_cluster_index; } // block_index - + +#ifndef __EMSCRIPTEN__ } ); +#endif } // block_index_iter - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif - std::vector<typename std::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size()); + basisu::vector<typename basisu::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size()); uint32_t total_subblocks_reassigned = 0; for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) @@ -1199,8 +1589,8 @@ namespace basisu indirect_sort((uint32_t)m_endpoint_clusters.size(), &sorted_endpoint_cluster_indices[0], &m_endpoint_cluster_etc_params[0]); - std::vector<std::vector<uint32_t> > new_endpoint_clusters(m_endpoint_clusters.size()); - std::vector<endpoint_cluster_etc_params> new_subblock_etc_params(m_endpoint_clusters.size()); + basisu::vector<basisu::vector<uint32_t> > new_endpoint_clusters(m_endpoint_clusters.size()); + basisu::vector<endpoint_cluster_etc_params> new_subblock_etc_params(m_endpoint_clusters.size()); for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++) { @@ -1264,7 +1654,9 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index] { +#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -1288,12 +1680,16 @@ namespace basisu blk.determine_selectors(pSource_pixels, m_params.m_perceptual); } // block_index - + +#ifndef __EMSCRIPTEN__ } ); +#endif } // block_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif m_orig_encoded_blocks = m_encoded_blocks; } @@ -1302,9 +1698,9 @@ namespace basisu { uint_vec block_selector_cluster_indices(m_total_blocks); - for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_indices.size()); cluster_index++) + for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_block_indices.size()); cluster_index++) { - const std::vector<uint32_t>& cluster_indices = m_selector_cluster_indices[cluster_index]; + const basisu::vector<uint32_t>& cluster_indices = m_selector_cluster_block_indices[cluster_index]; for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++) { @@ -1317,7 +1713,7 @@ namespace basisu } // cluster_index m_selector_clusters_within_each_parent_cluster.resize(0); - m_selector_clusters_within_each_parent_cluster.resize(m_selector_parent_cluster_indices.size()); + m_selector_clusters_within_each_parent_cluster.resize(m_selector_parent_cluster_block_indices.size()); for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) { @@ -1355,7 +1751,9 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); 
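The hunk continues below with the `#ifndef __EMSCRIPTEN__`-guarded job dispatch; this chunked pattern is applied throughout the frontend in this update: work is sliced into fixed-size ranges of blocks, each range is submitted as a lambda to the shared job pool, and on Emscripten (single-threaded WebAssembly builds) the same lambda body runs inline. A minimal standalone sketch of the idea, where a hypothetical `std::async`-based pool stands in for basisu's `job_pool` (not shown here):

```
// Minimal sketch of the chunked job dispatch pattern used in the hunks above/below.
#include <algorithm>
#include <cstdint>
#include <future>
#include <vector>

static void process_blocks(std::vector<uint32_t>& results)
{
    const uint32_t total = (uint32_t)results.size();
    const uint32_t N = 1024; // blocks per job, as in the frontend

#ifndef __EMSCRIPTEN__
    std::vector<std::future<void>> jobs;
#endif

    for (uint32_t first = 0; first < total; first += N)
    {
        const uint32_t last = std::min(total, first + N);

        auto body = [&results, first, last]() {
            for (uint32_t i = first; i < last; i++)
                results[i] = i * 2; // placeholder per-block work
        };

#ifndef __EMSCRIPTEN__
        jobs.push_back(std::async(std::launch::async, body)); // threaded path
#else
        body(); // single-threaded wasm builds run the same body inline
#endif
    }

#ifndef __EMSCRIPTEN__
    for (auto& j : jobs)
        j.wait(); // equivalent of m_params.m_pJob_pool->wait_for_all()
#endif
}
```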
+#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] { +#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -1382,47 +1780,54 @@ namespace basisu } // block_index +#ifndef __EMSCRIPTEN__ } ); +#endif } // block_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif vec16F_clusterizer selector_clusterizer; for (uint32_t i = 0; i < m_total_blocks; i++) selector_clusterizer.add_training_vec(training_vecs[i].first, training_vecs[i].second); - const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE : 0; + const int selector_parent_codebook_size = (m_params.m_compression_level <= 1) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 : BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT; + const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? selector_parent_codebook_size : 0; + debug_printf("Using selector parent codebook size %u\n", parent_codebook_size); uint32_t max_threads = 0; max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0; bool status = generate_hierarchical_codebook_threaded(selector_clusterizer, m_params.m_max_selector_clusters, m_use_hierarchical_selector_codebooks ? parent_codebook_size : 0, - m_selector_cluster_indices, - m_selector_parent_cluster_indices, + m_selector_cluster_block_indices, + m_selector_parent_cluster_block_indices, max_threads, m_params.m_pJob_pool); BASISU_FRONTEND_VERIFY(status); if (m_use_hierarchical_selector_codebooks) { - if (!m_selector_parent_cluster_indices.size()) + if (!m_selector_parent_cluster_block_indices.size()) { - m_selector_parent_cluster_indices.resize(0); - m_selector_parent_cluster_indices.resize(1); + m_selector_parent_cluster_block_indices.resize(0); + m_selector_parent_cluster_block_indices.resize(1); for (uint32_t i = 0; i < m_total_blocks; i++) - m_selector_parent_cluster_indices[0].push_back(i); + m_selector_parent_cluster_block_indices[0].push_back(i); } - BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE <= UINT8_MAX); + BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 <= UINT8_MAX); + BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT <= UINT8_MAX); m_block_parent_selector_cluster.resize(0); m_block_parent_selector_cluster.resize(m_total_blocks); vector_set_all(m_block_parent_selector_cluster, 0xFF); - for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_selector_parent_cluster_indices.size(); parent_cluster_index++) + for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_selector_parent_cluster_block_indices.size(); parent_cluster_index++) { - const uint_vec &cluster = m_selector_parent_cluster_indices[parent_cluster_index]; + const uint_vec &cluster = m_selector_parent_cluster_block_indices[parent_cluster_index]; for (uint32_t j = 0; j < cluster.size(); j++) m_block_parent_selector_cluster[cluster[j]] = static_cast<uint8_t>(parent_cluster_index); } @@ -1432,9 +1837,9 @@ namespace basisu } // Ensure that all the blocks within each cluster are all in the same parent cluster, or something is very wrong. 
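The verification loop that follows walks every selector cluster and confirms that all of its blocks share one parent cluster, because the hierarchical (two-level) codebook later restricts each block's search to the child clusters of its parent. A minimal sketch of that invariant check, with hypothetical flat containers standing in for `m_selector_cluster_block_indices` and `m_block_parent_selector_cluster`:

```
// Sketch of the parent-cluster consistency check performed in the loop below
// (hypothetical containers; returns false if any cluster spans two parents).
#include <cstdint>
#include <vector>

static bool selector_clusters_are_consistent(
    const std::vector<std::vector<uint32_t>>& cluster_block_indices, // per selector cluster
    const std::vector<uint8_t>& block_parent_cluster)                // per block
{
    for (const auto& cluster : cluster_block_indices)
    {
        if (cluster.empty())
            continue;

        const uint8_t parent = block_parent_cluster[cluster[0]];
        for (uint32_t block_index : cluster)
        {
            if (block_parent_cluster[block_index] != parent)
                return false; // blocks of one cluster must share a parent
        }
    }
    return true;
}
```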
- for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_indices.size(); cluster_index++) + for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_block_indices.size(); cluster_index++) { - const uint_vec &cluster = m_selector_cluster_indices[cluster_index]; + const uint_vec &cluster = m_selector_cluster_block_indices[cluster_index]; uint32_t parent_cluster_index = 0; for (uint32_t j = 0; j < cluster.size(); j++) @@ -1452,14 +1857,16 @@ namespace basisu } } - debug_printf("Total selector clusters: %u, total parent selector clusters: %u\n", (uint32_t)m_selector_cluster_indices.size(), (uint32_t)m_selector_parent_cluster_indices.size()); + debug_printf("Total selector clusters: %u, total parent selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size(), (uint32_t)m_selector_parent_cluster_block_indices.size()); } void basisu_frontend::create_optimized_selector_codebook(uint32_t iter) { debug_printf("create_optimized_selector_codebook\n"); - const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_indices.size(); + const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size(); + + debug_printf("Total selector clusters (from m_selector_cluster_block_indices.size()): %u\n", (uint32_t)m_selector_cluster_block_indices.size()); m_optimized_cluster_selectors.resize(total_selector_clusters); @@ -1474,12 +1881,14 @@ namespace basisu { const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N); - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &total_clusters_processed, &total_selector_clusters] { +#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { - const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index]; + const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index]; if (!cluster_block_indices.size()) continue; @@ -1528,11 +1937,15 @@ namespace basisu } // cluster_index +#ifndef __EMSCRIPTEN__ } ); +#endif } // cluster_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif } else { @@ -1552,12 +1965,14 @@ namespace basisu { const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N); - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &uses_hybrid_sel_codebook, &total_clusters_processed, &total_selector_clusters] { +#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { - const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index]; + const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index]; if (!cluster_block_indices.size()) continue; @@ -1667,29 +2082,33 @@ namespace basisu } // cluster_index +#ifndef __EMSCRIPTEN__ } ); +#endif } // cluster_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif } // if (m_params.m_pGlobal_sel_codebook) - + if (m_params.m_debug_images) { uint32_t max_selector_cluster_size = 0; - for (uint32_t i = 0; i < m_selector_cluster_indices.size(); i++) - max_selector_cluster_size = maximum<uint32_t>(max_selector_cluster_size, (uint32_t)m_selector_cluster_indices[i].size()); + for (uint32_t i = 0; i < 
m_selector_cluster_block_indices.size(); i++) + max_selector_cluster_size = maximum<uint32_t>(max_selector_cluster_size, (uint32_t)m_selector_cluster_block_indices[i].size()); if ((max_selector_cluster_size * 5) < 32768) { const uint32_t x_spacer_len = 16; - image selector_cluster_vis(x_spacer_len + max_selector_cluster_size * 5, (uint32_t)m_selector_cluster_indices.size() * 5); + image selector_cluster_vis(x_spacer_len + max_selector_cluster_size * 5, (uint32_t)m_selector_cluster_block_indices.size() * 5); - for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_indices.size(); selector_cluster_index++) + for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_block_indices.size(); selector_cluster_index++) { - const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[selector_cluster_index]; + const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[selector_cluster_index]; for (uint32_t y = 0; y < 4; y++) for (uint32_t x = 0; x < 4; x++) @@ -1717,31 +2136,56 @@ namespace basisu void basisu_frontend::find_optimal_selector_clusters_for_each_block() { debug_printf("find_optimal_selector_clusters_for_each_block\n"); - + + // Sanity checks + BASISU_FRONTEND_VERIFY(m_selector_cluster_block_indices.size() == m_optimized_cluster_selectors.size()); + for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++) + { + for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++) + { + BASISU_FRONTEND_VERIFY(m_selector_clusters_within_each_parent_cluster[i][j] < m_optimized_cluster_selectors.size()); + } + } + m_block_selector_cluster_index.resize(m_total_blocks); - + if (m_params.m_compression_level == 0) { // Don't do anything, just leave the blocks in their original selector clusters. - for (uint32_t i = 0; i < m_selector_cluster_indices.size(); i++) + for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++) { - for (uint32_t j = 0; j < m_selector_cluster_indices[i].size(); j++) - m_block_selector_cluster_index[m_selector_cluster_indices[i][j]] = i; + for (uint32_t j = 0; j < m_selector_cluster_block_indices[i].size(); j++) + m_block_selector_cluster_index[m_selector_cluster_block_indices[i][j]] = i; } } else { - std::vector< std::vector<uint32_t> > new_cluster_indices; - + // Note that this method may leave some empty clusters (i.e. arrays with no block indices), including at the end. + basisu::vector< basisu::vector<uint32_t> > new_cluster_indices(m_optimized_cluster_selectors.size()); + // For each block: Determine which quantized selectors best encode that block, given its quantized endpoints. 
+ basisu::vector<uint8_t> unpacked_optimized_cluster_selectors(16 * m_optimized_cluster_selectors.size()); + for (uint32_t cluster_index = 0; cluster_index < m_optimized_cluster_selectors.size(); cluster_index++) + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + unpacked_optimized_cluster_selectors[cluster_index * 16 + y * 4 + x] = (uint8_t)m_optimized_cluster_selectors[cluster_index].get_selector(x, y); + } + } + } + const uint32_t N = 1024; for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N) { const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N); - m_params.m_pJob_pool->add_job( [this, first_index, last_index, &new_cluster_indices] { +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job( [this, first_index, last_index, &new_cluster_indices, &unpacked_optimized_cluster_selectors] { +#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -1752,20 +2196,32 @@ namespace basisu color_rgba trial_block_colors[4]; blk.get_block_colors(trial_block_colors, 0); - uint64_t best_cluster_err = UINT64_MAX; + // precompute errors for the i-th block pixel and selector sel: [sel][i] + uint32_t trial_errors[4][16]; + + for (int sel = 0; sel < 4; ++sel) + { + for (int i = 0; i < 16; ++i) + { + trial_errors[sel][i] = color_distance(m_params.m_perceptual, pBlock_pixels[i], trial_block_colors[sel], false); + } + } + + uint64_t best_cluster_err = INT64_MAX; uint32_t best_cluster_index = 0; const uint32_t parent_selector_cluster = m_block_parent_selector_cluster.size() ? m_block_parent_selector_cluster[block_index] : 0; const uint_vec *pCluster_indices = m_selector_clusters_within_each_parent_cluster.size() ? &m_selector_clusters_within_each_parent_cluster[parent_selector_cluster] : nullptr; - const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_indices.size(); + const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_block_indices.size(); +#if 0 for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++) { const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter; const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index]; - + uint64_t trial_err = 0; for (int y = 0; y < 4; y++) { @@ -1778,18 +2234,82 @@ namespace basisu goto early_out; } } - + if (trial_err < best_cluster_err) { best_cluster_err = trial_err; best_cluster_index = cluster_index; - if (!best_cluster_err) + if (!best_cluster_err) break; } early_out: ; } +#else + if (m_params.m_perceptual) + { + for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++) + { + const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? 
(*pCluster_indices)[cluster_iter] : cluster_iter; + //const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index]; + + uint64_t trial_err = 0; + + for (int i = 0; i < 16; i++) + { + const uint32_t sel = unpacked_optimized_cluster_selectors[cluster_index * 16 + i]; + + trial_err += trial_errors[sel][i]; + if (trial_err > best_cluster_err) + goto early_out; + } + + if (trial_err < best_cluster_err) + { + best_cluster_err = trial_err; + best_cluster_index = cluster_index; + if (!best_cluster_err) + break; + } + + early_out: + ; + + } // cluster_iter + } + else + { + for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++) + { + const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter; + //const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index]; + + uint64_t trial_err = 0; + + for (int i = 0; i < 16; i++) + { + const uint32_t sel = unpacked_optimized_cluster_selectors[cluster_index * 16 + i]; + + trial_err += trial_errors[sel][i]; + if (trial_err > best_cluster_err) + goto early_out2; + } + + if (trial_err < best_cluster_err) + { + best_cluster_err = trial_err; + best_cluster_index = cluster_index; + if (!best_cluster_err) + break; + } + + early_out2: + ; + + } // cluster_iter + } +#endif blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits()); @@ -1804,17 +2324,21 @@ namespace basisu } // block_index +#ifndef __EMSCRIPTEN__ } ); +#endif } // block_index_iter - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); - - m_selector_cluster_indices.swap(new_cluster_indices); +#endif + + m_selector_cluster_block_indices.swap(new_cluster_indices); } - for (uint32_t i = 0; i < m_selector_cluster_indices.size(); i++) - vector_sort(m_selector_cluster_indices[i]); + for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++) + vector_sort(m_selector_cluster_block_indices[i]); } // TODO: Remove old ETC1 specific stuff, and thread this. 
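The rewritten search in `find_optimal_selector_clusters_for_each_block()` above avoids calling `color_distance()` for every candidate cluster: it unpacks each cluster's 2-bit selectors into bytes once, precomputes a 4x16 table of per-pixel errors against the block's four ETC1S colors, and then scores each candidate with 16 table lookups plus an early-out against the best error found so far. A self-contained sketch of the same idea, with a toy error metric standing in for `color_distance()` and hypothetical inputs:

```
// Sketch of the precomputed-error selector search used above (toy error metric;
// candidate selector sets are assumed already unpacked to one byte per pixel, 0..3).
#include <array>
#include <cstdint>
#include <vector>

struct rgba { uint8_t r, g, b, a; };

static uint32_t toy_error(const rgba& a, const rgba& b) // stand-in for color_distance()
{
    const int dr = a.r - b.r, dg = a.g - b.g, db = a.b - b.b;
    return (uint32_t)(dr * dr + dg * dg + db * db);
}

static uint32_t find_best_selector_set(
    const rgba (&pixels)[16],
    const rgba (&block_colors)[4],
    const std::vector<std::array<uint8_t, 16>>& candidate_selectors)
{
    // Precompute errors[sel][i]: cost of assigning selector sel to pixel i.
    uint32_t errors[4][16];
    for (int sel = 0; sel < 4; sel++)
        for (int i = 0; i < 16; i++)
            errors[sel][i] = toy_error(pixels[i], block_colors[sel]);

    uint64_t best_err = UINT64_MAX;
    uint32_t best_index = 0;

    for (uint32_t c = 0; c < candidate_selectors.size(); c++)
    {
        uint64_t err = 0;
        for (int i = 0; i < 16; i++)
        {
            err += errors[candidate_selectors[c][i]][i];
            if (err >= best_err)
                break; // early out, as in the frontend
        }
        if (err < best_err)
        {
            best_err = err;
            best_index = c;
            if (!best_err)
                break;
        }
    }
    return best_index;
}
```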
@@ -1842,7 +2366,7 @@ namespace basisu const uint_vec &subblocks = subblock_params.m_subblocks; //uint32_t total_pixels = subblock.m_subblocks.size() * 8; - std::vector<color_rgba> subblock_colors[2]; // [use_individual_mode] + basisu::vector<color_rgba> subblock_colors[2]; // [use_individual_mode] uint8_vec subblock_selectors[2]; uint64_t cur_subblock_err[2] = { 0, 0 }; @@ -1882,7 +2406,7 @@ namespace basisu clear_obj(cluster_optimizer_results); - std::vector<uint8_t> cluster_selectors[2]; + basisu::vector<uint8_t> cluster_selectors[2]; for (uint32_t use_individual_mode = 0; use_individual_mode < 2; use_individual_mode++) { @@ -1938,7 +2462,7 @@ namespace basisu const uint32_t block_index = training_vector_index >> 1; const uint32_t subblock_index = training_vector_index & 1; - const bool is_flipped = true; + //const bool is_flipped = true; etc_block &blk = m_encoded_blocks[block_index]; @@ -2002,7 +2526,7 @@ namespace basisu if (m_params.m_debug_stats) debug_printf("Total subblock endpoints refined: %u (%3.1f%%)\n", total_subblocks_refined, total_subblocks_refined * 100.0f / total_subblocks_examined); - + return total_subblocks_refined; } @@ -2012,8 +2536,8 @@ namespace basisu uint32_t max_endpoint_cluster_size = 0; - std::vector<uint32_t> cluster_sizes(m_endpoint_clusters.size()); - std::vector<uint32_t> sorted_cluster_indices(m_endpoint_clusters.size()); + basisu::vector<uint32_t> cluster_sizes(m_endpoint_clusters.size()); + basisu::vector<uint32_t> sorted_cluster_indices(m_endpoint_clusters.size()); for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++) { max_endpoint_cluster_size = maximum<uint32_t>(max_endpoint_cluster_size, (uint32_t)m_endpoint_clusters[i].size()); @@ -2100,30 +2624,33 @@ namespace basisu { debug_printf("reoptimize_remapped_endpoints\n"); - std::vector<uint_vec> new_endpoint_cluster_block_indices(m_endpoint_clusters.size()); + basisu::vector<uint_vec> new_endpoint_cluster_block_indices(m_endpoint_clusters.size()); for (uint32_t i = 0; i < new_block_endpoints.size(); i++) new_endpoint_cluster_block_indices[new_block_endpoints[i]].push_back(i); - std::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size()); - std::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size()); + basisu::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size()); + basisu::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size()); const uint32_t N = 256; for (uint32_t cluster_index_iter = 0; cluster_index_iter < new_endpoint_cluster_block_indices.size(); cluster_index_iter += N) { const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum<uint32_t>((uint32_t)new_endpoint_cluster_block_indices.size(), cluster_index_iter + N); - + +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &cluster_improved, &cluster_valid, &new_endpoint_cluster_block_indices, &pBlock_selector_indices ] { +#endif + for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { - const std::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index]; + const basisu::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index]; if (!cluster_block_indices.size()) continue; const uint32_t total_pixels = (uint32_t)cluster_block_indices.size() * 16; - std::vector<color_rgba> cluster_pixels(total_pixels); + basisu::vector<color_rgba> cluster_pixels(total_pixels); uint8_vec force_selectors(total_pixels); 
etc_block blk; @@ -2170,16 +2697,19 @@ namespace basisu if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL) cluster_optimizer_params.m_quality = cETCQualityUber; + else + cluster_optimizer_params.m_quality = cETCQualitySlow; etc1_optimizer::results cluster_optimizer_results; - std::vector<uint8_t> cluster_selectors(total_pixels); + basisu::vector<uint8_t> cluster_selectors(total_pixels); cluster_optimizer_results.m_n = total_pixels; cluster_optimizer_results.m_pSelectors = &cluster_selectors[0]; optimizer.init(cluster_optimizer_params, cluster_optimizer_results); - optimizer.compute(); + if (!optimizer.compute()) + BASISU_FRONTEND_VERIFY(false); new_endpoint_cluster_etc_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled; new_endpoint_cluster_etc_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table; @@ -2198,11 +2728,16 @@ namespace basisu cluster_valid[cluster_index] = true; } // cluster_index + +#ifndef __EMSCRIPTEN__ } ); +#endif } // cluster_index_iter +#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); +#endif uint32_t total_unused_clusters = 0; uint32_t total_improved_clusters = 0; @@ -2239,7 +2774,7 @@ namespace basisu debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 1\n"); - std::vector<uint_vec> new_endpoint_clusters(total_new_endpoint_clusters); + basisu::vector<uint_vec> new_endpoint_clusters(total_new_endpoint_clusters); for (uint32_t block_index = 0; block_index < new_block_endpoints.size(); block_index++) { diff --git a/thirdparty/basis_universal/basisu_frontend.h b/thirdparty/basis_universal/encoder/basisu_frontend.h index c3f5d23c71..4ff6d40466 100644 --- a/thirdparty/basis_universal/basisu_frontend.h +++ b/thirdparty/basis_universal/encoder/basisu_frontend.h @@ -1,5 +1,5 @@ // basisu_frontend.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -17,7 +17,8 @@ #include "basisu_etc.h" #include "basisu_gpu_texture.h" #include "basisu_global_selector_palette_helpers.h" -#include "transcoder/basisu_file_headers.h" +#include "../transcoder/basisu_file_headers.h" +#include "../transcoder/basisu_transcoder.h" namespace basisu { @@ -34,8 +35,8 @@ namespace basisu uint32_t &operator[] (uint32_t i) { assert(i < 2); return m_comps[i]; } }; - const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 1; - const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 5; + const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 2; + const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 6; class basisu_frontend { @@ -72,16 +73,19 @@ namespace basisu m_perceptual(true), m_debug_stats(false), m_debug_images(false), + m_dump_endpoint_clusterization(true), + m_validate(false), + m_multithreaded(false), + m_disable_hierarchical_endpoint_codebooks(false), m_pGlobal_sel_codebook(NULL), m_num_global_sel_codebook_pal_bits(0), m_num_global_sel_codebook_mod_bits(0), m_use_hybrid_selector_codebooks(false), m_hybrid_codebook_quality_thresh(0.0f), - m_validate(false), m_tex_type(basist::cBASISTexType2D), - m_multithreaded(false), - m_disable_hierarchical_endpoint_codebooks(false), + m_pGlobal_codebooks(nullptr), + m_pJob_pool(nullptr) { } @@ -108,6 +112,7 @@ namespace basisu bool m_use_hybrid_selector_codebooks; float m_hybrid_codebook_quality_thresh; basist::basis_texture_type m_tex_type; + const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks; job_pool *m_pJob_pool; }; @@ -142,7 +147,7 @@ namespace basisu bool get_endpoint_cluster_color_is_used(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_used[individual_mode]; } // Selector clusters - uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_indices.size()); } + uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_block_indices.size()); } uint32_t get_block_selector_cluster_index(uint32_t block_index) const { return m_block_selector_cluster_index[block_index]; } const etc_block &get_selector_cluster_selector_bits(uint32_t cluster_index) const { return m_optimized_cluster_selectors[cluster_index]; } @@ -150,7 +155,7 @@ namespace basisu const bool_vec &get_selector_cluster_uses_global_cb_vec() const { return m_selector_cluster_uses_global_cb; } // Returns block indices using each selector cluster - const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_indices[selector_cluster_index]; } + const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_block_indices[selector_cluster_index]; } void dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks); @@ -188,16 +193,16 @@ namespace basisu // For each endpoint cluster: An array of which subblock indices (block_index*2+subblock) are located in that cluster. 
// Array of block indices for each endpoint cluster - std::vector<uint_vec> m_endpoint_clusters; + basisu::vector<uint_vec> m_endpoint_clusters; // Array of block indices for each parent endpoint cluster - std::vector<uint_vec> m_endpoint_parent_clusters; + basisu::vector<uint_vec> m_endpoint_parent_clusters; // Each block's parent cluster index uint8_vec m_block_parent_endpoint_cluster; // Array of endpoint cluster indices for each parent endpoint cluster - std::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster; + basisu::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster; struct endpoint_cluster_etc_params { @@ -267,35 +272,35 @@ namespace basisu } }; - typedef std::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec; + typedef basisu::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec; // Each endpoint cluster's ETC1S parameters cluster_subblock_etc_params_vec m_endpoint_cluster_etc_params; // The endpoint cluster index used by each ETC1 subblock. - std::vector<vec2U> m_block_endpoint_clusters_indices; + basisu::vector<vec2U> m_block_endpoint_clusters_indices; // The block(s) within each selector cluster // Note: If you add anything here that uses selector cluster indicies, be sure to update optimize_selector_codebook()! - std::vector<uint_vec> m_selector_cluster_indices; + basisu::vector<uint_vec> m_selector_cluster_block_indices; // The selector bits for each selector cluster. - std::vector<etc_block> m_optimized_cluster_selectors; + basisu::vector<etc_block> m_optimized_cluster_selectors; // The block(s) within each parent selector cluster. - std::vector<uint_vec> m_selector_parent_cluster_indices; + basisu::vector<uint_vec> m_selector_parent_cluster_block_indices; // Each block's parent selector cluster uint8_vec m_block_parent_selector_cluster; // Array of selector cluster indices for each parent selector cluster - std::vector<uint_vec> m_selector_clusters_within_each_parent_cluster; + basisu::vector<uint_vec> m_selector_clusters_within_each_parent_cluster; basist::etc1_global_selector_codebook_entry_id_vec m_optimized_cluster_selector_global_cb_ids; bool_vec m_selector_cluster_uses_global_cb; // Each block's selector cluster index - std::vector<uint32_t> m_block_selector_cluster_index; + basisu::vector<uint32_t> m_block_selector_cluster_index; struct subblock_endpoint_quant_err { @@ -321,13 +326,14 @@ namespace basisu }; // The sorted subblock endpoint quant error for each endpoint cluster - std::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec; + basisu::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec; std::mutex m_lock; //----------------------------------------------------------------------------- void init_etc1_images(); + bool init_global_codebooks(); void init_endpoint_training_vectors(); void dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors); void generate_endpoint_clusters(); diff --git a/thirdparty/basis_universal/basisu_global_selector_palette_helpers.cpp b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp index 102fc24980..102fc24980 100644 --- a/thirdparty/basis_universal/basisu_global_selector_palette_helpers.cpp +++ b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp diff --git a/thirdparty/basis_universal/basisu_global_selector_palette_helpers.h b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h index 32692c516b..7c35439df8 100644 --- 
a/thirdparty/basis_universal/basisu_global_selector_palette_helpers.h +++ b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h @@ -14,9 +14,9 @@ // limitations under the License. #pragma once -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" #include "basisu_etc.h" -#include "transcoder/basisu_global_selector_palette.h" +#include "../transcoder/basisu_global_selector_palette.h" namespace basisu { @@ -36,7 +36,7 @@ namespace basisu void clear() { clear_obj(*this); } }; - typedef std::vector<pixel_block> pixel_block_vec; + typedef basisu::vector<pixel_block> pixel_block_vec; uint64_t etc1_global_selector_codebook_find_best_entry(const basist::etc1_global_selector_codebook &codebook, uint32_t num_src_pixel_blocks, const pixel_block *pSrc_pixel_blocks, const etc_block *pBlock_endpoints, diff --git a/thirdparty/basis_universal/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp index 117668c5e2..3f9fb67bdd 100644 --- a/thirdparty/basis_universal/basisu_gpu_texture.cpp +++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp @@ -1,5 +1,5 @@ // basisu_gpu_texture.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,38 +16,10 @@ #include "basisu_enc.h" #include "basisu_pvrtc1_4.h" #include "basisu_astc_decomp.h" +#include "basisu_bc7enc.h" namespace basisu { - const int8_t g_etc2_eac_tables[16][8] = - { - { -3, -6, -9, -15, 2, 5, 8, 14 }, { -3, -7, -10, -13, 2, 6, 9, 12 }, { -2, -5, -8, -13, 1, 4, 7, 12 }, { -2, -4, -6, -13, 1, 3, 5, 12 }, - { -3, -6, -8, -12, 2, 5, 7, 11 }, { -3, -7, -9, -11, 2, 6, 8, 10 }, { -4, -7, -8, -11, 3, 6, 7, 10 }, { -3, -5, -8, -11, 2, 4, 7, 10 }, - { -2, -6, -8, -10, 1, 5, 7, 9 }, { -2, -5, -8, -10, 1, 4, 7, 9 }, { -2, -4, -8, -10, 1, 3, 7, 9 }, { -2, -5, -7, -10, 1, 4, 6, 9 }, - { -3, -4, -7, -10, 2, 3, 6, 9 }, { -1, -2, -3, -10, 0, 1, 2, 9 }, { -4, -6, -8, -9, 3, 5, 7, 8 }, { -3, -5, -7, -9, 2, 4, 6, 8 } - }; - - struct eac_a8_block - { - uint16_t m_base : 8; - uint16_t m_table : 4; - uint16_t m_multiplier : 4; - - uint8_t m_selectors[6]; - - inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const - { - assert((x < 4) && (y < 4)); - return static_cast<uint32_t>((selector_bits >> (45 - (y + x * 4) * 3)) & 7); - } - - inline uint64_t get_selector_bits() const - { - uint64_t pixels = ((uint64_t)m_selectors[0] << 40) | ((uint64_t)m_selectors[1] << 32) | ((uint64_t)m_selectors[2] << 24) | ((uint64_t)m_selectors[3] << 16) | ((uint64_t)m_selectors[4] << 8) | m_selectors[5]; - return pixels; - } - }; - void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels) { static_assert(sizeof(eac_a8_block) == 8, "sizeof(eac_a8_block) == 8"); @@ -123,19 +95,18 @@ namespace basisu bc1_block::unpack_color(l, r0, g0, b0); bc1_block::unpack_color(h, r1, g1, b1); + c[0].set_noclamp_rgba(r0, g0, b0, 255); + c[1].set_noclamp_rgba(r1, g1, b1, 255); + bool used_punchthrough = false; if (l > h) { - c[0].set_noclamp_rgba(r0, g0, b0, 255); - c[1].set_noclamp_rgba(r1, g1, b1, 255); c[2].set_noclamp_rgba((r0 * 2 + r1) / 3, (g0 * 2 + g1) / 3, (b0 * 2 + b1) / 3, 255); c[3].set_noclamp_rgba((r1 * 2 + r0) / 3, (g1 * 2 + g0) / 3, (b1 * 2 + b0) / 3, 255); } else { - c[0].set_noclamp_rgba(r0, g0, b0, 255); - c[1].set_noclamp_rgba(r1, g1, b1, 255); c[2].set_noclamp_rgba((r0 + r1) / 
2, (g0 + g1) / 2, (b0 + b1) / 2, 255); c[3].set_noclamp_rgba(0, 0, 0, 0); used_punchthrough = true; @@ -165,6 +136,142 @@ namespace basisu return used_punchthrough; } + bool unpack_bc1_nv(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha) + { + static_assert(sizeof(bc1_block) == 8, "sizeof(bc1_block) == 8"); + + const bc1_block *pBlock = static_cast<const bc1_block *>(pBlock_bits); + + const uint32_t l = pBlock->get_low_color(); + const uint32_t h = pBlock->get_high_color(); + + color_rgba c[4]; + + int r0 = (l >> 11) & 31; + int g0 = (l >> 5) & 63; + int b0 = l & 31; + int r1 = (h >> 11) & 31; + int g1 = (h >> 5) & 63; + int b1 = h & 31; + + c[0].b = (uint8_t)((3 * b0 * 22) / 8); + c[0].g = (uint8_t)((g0 << 2) | (g0 >> 4)); + c[0].r = (uint8_t)((3 * r0 * 22) / 8); + c[0].a = 0xFF; + + c[1].r = (uint8_t)((3 * r1 * 22) / 8); + c[1].g = (uint8_t)((g1 << 2) | (g1 >> 4)); + c[1].b = (uint8_t)((3 * b1 * 22) / 8); + c[1].a = 0xFF; + + int gdiff = c[1].g - c[0].g; + + bool used_punchthrough = false; + + if (l > h) + { + c[2].r = (uint8_t)(((2 * r0 + r1) * 22) / 8); + c[2].g = (uint8_t)(((256 * c[0].g + gdiff/4 + 128 + gdiff * 80) / 256)); + c[2].b = (uint8_t)(((2 * b0 + b1) * 22) / 8); + c[2].a = 0xFF; + + c[3].r = (uint8_t)(((2 * r1 + r0) * 22) / 8); + c[3].g = (uint8_t)((256 * c[1].g - gdiff/4 + 128 - gdiff * 80) / 256); + c[3].b = (uint8_t)(((2 * b1 + b0) * 22) / 8); + c[3].a = 0xFF; + } + else + { + c[2].r = (uint8_t)(((r0 + r1) * 33) / 8); + c[2].g = (uint8_t)((256 * c[0].g + gdiff/4 + 128 + gdiff * 128) / 256); + c[2].b = (uint8_t)(((b0 + b1) * 33) / 8); + c[2].a = 0xFF; + + c[3].set_noclamp_rgba(0, 0, 0, 0); + used_punchthrough = true; + } + + if (set_alpha) + { + for (uint32_t y = 0; y < 4; y++, pPixels += 4) + { + pPixels[0] = c[pBlock->get_selector(0, y)]; + pPixels[1] = c[pBlock->get_selector(1, y)]; + pPixels[2] = c[pBlock->get_selector(2, y)]; + pPixels[3] = c[pBlock->get_selector(3, y)]; + } + } + else + { + for (uint32_t y = 0; y < 4; y++, pPixels += 4) + { + pPixels[0].set_rgb(c[pBlock->get_selector(0, y)]); + pPixels[1].set_rgb(c[pBlock->get_selector(1, y)]); + pPixels[2].set_rgb(c[pBlock->get_selector(2, y)]); + pPixels[3].set_rgb(c[pBlock->get_selector(3, y)]); + } + } + + return used_punchthrough; + } + + static inline int interp_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 43 + c1 * 21 + 32) >> 6; } + static inline int interp_half_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 + c1 + 1) >> 1; } + + bool unpack_bc1_amd(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha) + { + const bc1_block *pBlock = static_cast<const bc1_block *>(pBlock_bits); + + const uint32_t l = pBlock->get_low_color(); + const uint32_t h = pBlock->get_high_color(); + + color_rgba c[4]; + + uint32_t r0, g0, b0, r1, g1, b1; + bc1_block::unpack_color(l, r0, g0, b0); + bc1_block::unpack_color(h, r1, g1, b1); + + c[0].set_noclamp_rgba(r0, g0, b0, 255); + c[1].set_noclamp_rgba(r1, g1, b1, 255); + + bool used_punchthrough = false; + + if (l > h) + { + c[2].set_noclamp_rgba(interp_5_6_amd(r0, r1), interp_5_6_amd(g0, g1), interp_5_6_amd(b0, b1), 255); + c[3].set_noclamp_rgba(interp_5_6_amd(r1, r0), interp_5_6_amd(g1, g0), interp_5_6_amd(b1, b0), 255); + } + else + { + c[2].set_noclamp_rgba(interp_half_5_6_amd(r0, r1), interp_half_5_6_amd(g0, g1), interp_half_5_6_amd(b0, b1), 255); + c[3].set_noclamp_rgba(0, 0, 0, 0); + used_punchthrough = true; + } + + if (set_alpha) + { + for (uint32_t y = 0; y < 4; y++, pPixels += 4) + { + pPixels[0] = 
c[pBlock->get_selector(0, y)]; + pPixels[1] = c[pBlock->get_selector(1, y)]; + pPixels[2] = c[pBlock->get_selector(2, y)]; + pPixels[3] = c[pBlock->get_selector(3, y)]; + } + } + else + { + for (uint32_t y = 0; y < 4; y++, pPixels += 4) + { + pPixels[0].set_rgb(c[pBlock->get_selector(0, y)]); + pPixels[1].set_rgb(c[pBlock->get_selector(1, y)]); + pPixels[2].set_rgb(c[pBlock->get_selector(2, y)]); + pPixels[3].set_rgb(c[pBlock->get_selector(3, y)]); + } + } + + return used_punchthrough; + } + struct bc4_block { enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 }; @@ -292,7 +399,7 @@ namespace basisu if (mode) { - c[1].set(std::max(0, c[0].r - (c[3].r >> 2)), std::max(0, c[0].g - (c[3].g >> 2)), std::max(0, c[0].b - (c[3].b >> 2)), 255); + c[1].set(basisu::maximum(0, c[0].r - (c[3].r >> 2)), basisu::maximum(0, c[0].g - (c[3].g >> 2)), basisu::maximum(0, c[0].b - (c[3].b >> 2)), 255); c[2] = c[0]; c[0].set(0, 0, 0, 255); } @@ -317,6 +424,191 @@ namespace basisu } } + // BC7 mode 0-7 decompression. + // Instead of one monster routine to unpack all the BC7 modes, we're lumping the 3 subset, 2 subset, 1 subset, and dual plane modes together into simple shared routines. + + static inline uint32_t bc7_dequant(uint32_t val, uint32_t pbit, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(pbit < 2); assert(val_bits >= 4 && val_bits <= 8); const uint32_t total_bits = val_bits + 1; val = (val << 1) | pbit; val <<= (8 - total_bits); val |= (val >> total_bits); assert(val <= 255); return val; } + static inline uint32_t bc7_dequant(uint32_t val, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(val_bits >= 4 && val_bits <= 8); val <<= (8 - val_bits); val |= (val >> val_bits); assert(val <= 255); return val; } + + static inline uint32_t bc7_interp2(uint32_t l, uint32_t h, uint32_t w) { assert(w < 4); return (l * (64 - basist::g_bc7_weights2[w]) + h * basist::g_bc7_weights2[w] + 32) >> 6; } + static inline uint32_t bc7_interp3(uint32_t l, uint32_t h, uint32_t w) { assert(w < 8); return (l * (64 - basist::g_bc7_weights3[w]) + h * basist::g_bc7_weights3[w] + 32) >> 6; } + static inline uint32_t bc7_interp4(uint32_t l, uint32_t h, uint32_t w) { assert(w < 16); return (l * (64 - basist::g_bc7_weights4[w]) + h * basist::g_bc7_weights4[w] + 32) >> 6; } + static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t bits) + { + assert(l <= 255 && h <= 255); + switch (bits) + { + case 2: return bc7_interp2(l, h, w); + case 3: return bc7_interp3(l, h, w); + case 4: return bc7_interp4(l, h, w); + default: + break; + } + return 0; + } + + bool unpack_bc7_mode0_2(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) + { + //const uint32_t SUBSETS = 3; + const uint32_t ENDPOINTS = 6; + const uint32_t COMPS = 3; + const uint32_t WEIGHT_BITS = (mode == 0) ? 3 : 2; + const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5; + const uint32_t PBITS = (mode == 0) ? 6 : 0; + const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; + + uint32_t bit_offset = 0; + const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits); + + if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + + const uint32_t part = read_bits32(pBuf, bit_offset, (mode == 0) ? 
4 : 6); + + color_rgba endpoints[ENDPOINTS]; + for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t e = 0; e < ENDPOINTS; e++) + endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS); + + uint32_t pbits[6]; + for (uint32_t p = 0; p < PBITS; p++) + pbits[p] = read_bits32(pBuf, bit_offset, 1); + + uint32_t weights[16]; + for (uint32_t i = 0; i < 16; i++) + weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == basist::g_bc7_table_anchor_index_third_subset_1[part]) || (i == basist::g_bc7_table_anchor_index_third_subset_2[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS); + + assert(bit_offset == 128); + + for (uint32_t e = 0; e < ENDPOINTS; e++) + for (uint32_t c = 0; c < 4; c++) + endpoints[e][c] = (uint8_t)((c == 3) ? 255 : (PBITS ? bc7_dequant(endpoints[e][c], pbits[e], ENDPOINT_BITS) : bc7_dequant(endpoints[e][c], ENDPOINT_BITS))); + + color_rgba block_colors[3][8]; + for (uint32_t s = 0; s < 3; s++) + for (uint32_t i = 0; i < WEIGHT_VALS; i++) + { + for (uint32_t c = 0; c < 3; c++) + block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS); + block_colors[s][i][3] = 255; + } + + for (uint32_t i = 0; i < 16; i++) + pPixels[i] = block_colors[basist::g_bc7_partition3[part * 16 + i]][weights[i]]; + + return true; + } + + bool unpack_bc7_mode1_3_7(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) + { + //const uint32_t SUBSETS = 2; + const uint32_t ENDPOINTS = 4; + const uint32_t COMPS = (mode == 7) ? 4 : 3; + const uint32_t WEIGHT_BITS = (mode == 1) ? 3 : 2; + const uint32_t ENDPOINT_BITS = (mode == 7) ? 5 : ((mode == 1) ? 6 : 7); + const uint32_t PBITS = (mode == 1) ? 2 : 4; + const uint32_t SHARED_PBITS = (mode == 1) ? true : false; + const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; + + uint32_t bit_offset = 0; + const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits); + + if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + + const uint32_t part = read_bits32(pBuf, bit_offset, 6); + + color_rgba endpoints[ENDPOINTS]; + for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t e = 0; e < ENDPOINTS; e++) + endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS); + + uint32_t pbits[4]; + for (uint32_t p = 0; p < PBITS; p++) + pbits[p] = read_bits32(pBuf, bit_offset, 1); + + uint32_t weights[16]; + for (uint32_t i = 0; i < 16; i++) + weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == basist::g_bc7_table_anchor_index_second_subset[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS); + + assert(bit_offset == 128); + + for (uint32_t e = 0; e < ENDPOINTS; e++) + for (uint32_t c = 0; c < 4; c++) + endpoints[e][c] = (uint8_t)((c == ((mode == 7U) ? 4U : 3U)) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS)); + + color_rgba block_colors[2][8]; + for (uint32_t s = 0; s < 2; s++) + for (uint32_t i = 0; i < WEIGHT_VALS; i++) + { + for (uint32_t c = 0; c < COMPS; c++) + block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS); + block_colors[s][i][3] = (COMPS == 3) ? 255 : block_colors[s][i][3]; + } + + for (uint32_t i = 0; i < 16; i++) + pPixels[i] = block_colors[basist::g_bc7_partition2[part * 16 + i]][weights[i]]; + + return true; + } + + bool unpack_bc7_mode4_5(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) + { + const uint32_t ENDPOINTS = 2; + const uint32_t COMPS = 4; + const uint32_t WEIGHT_BITS = 2; + const uint32_t A_WEIGHT_BITS = (mode == 4) ? 
3 : 2; + const uint32_t ENDPOINT_BITS = (mode == 4) ? 5 : 7; + const uint32_t A_ENDPOINT_BITS = (mode == 4) ? 6 : 8; + //const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; + //const uint32_t A_WEIGHT_VALS = 1 << A_WEIGHT_BITS; + + uint32_t bit_offset = 0; + const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits); + + if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + + const uint32_t comp_rot = read_bits32(pBuf, bit_offset, 2); + const uint32_t index_mode = (mode == 4) ? read_bits32(pBuf, bit_offset, 1) : 0; + + color_rgba endpoints[ENDPOINTS]; + for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t e = 0; e < ENDPOINTS; e++) + endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS); + + const uint32_t weight_bits[2] = { index_mode ? A_WEIGHT_BITS : WEIGHT_BITS, index_mode ? WEIGHT_BITS : A_WEIGHT_BITS }; + + uint32_t weights[16], a_weights[16]; + + for (uint32_t i = 0; i < 16; i++) + (index_mode ? a_weights : weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[index_mode] - ((!i) ? 1 : 0)); + + for (uint32_t i = 0; i < 16; i++) + (index_mode ? weights : a_weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[1 - index_mode] - ((!i) ? 1 : 0)); + + assert(bit_offset == 128); + + for (uint32_t e = 0; e < ENDPOINTS; e++) + for (uint32_t c = 0; c < 4; c++) + endpoints[e][c] = (uint8_t)bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS); + + color_rgba block_colors[8]; + for (uint32_t i = 0; i < (1U << weight_bits[0]); i++) + for (uint32_t c = 0; c < 3; c++) + block_colors[i][c] = (uint8_t)bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]); + + for (uint32_t i = 0; i < (1U << weight_bits[1]); i++) + block_colors[i][3] = (uint8_t)bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]); + + for (uint32_t i = 0; i < 16; i++) + { + pPixels[i] = block_colors[weights[i]]; + pPixels[i].a = block_colors[a_weights[i]].a; + if (comp_rot >= 1) + std::swap(pPixels[i].a, pPixels[i].m_comps[comp_rot - 1]); + } + + return true; + } + struct bc7_mode_6 { struct @@ -364,9 +656,6 @@ namespace basisu }; }; - static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; - - // The transcoder only outputs mode 6 at the moment, so this is easy. 
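The shared BC7 helpers introduced above all reduce to the same endpoint blend: dequantize each endpoint to 8 bits, then mix the low/high pair with a 6-bit fixed-point weight looked up per mode (the 4-bit table that the removed local copy duplicated now comes from `basist::g_bc7_weights4`). A minimal standalone sketch of that blend, with hypothetical names, following the `bc7_interp4` formula:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// 4-bit BC7 weight table (0..64), same values as basist::g_bc7_weights4.
static const uint32_t kBC7Weights4[16] = {
	0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
};

// Blend two 8-bit endpoints with a 6-bit fixed-point weight, rounding to nearest.
static uint8_t bc7_blend4(uint32_t lo, uint32_t hi, uint32_t sel)
{
	assert(lo <= 255 && hi <= 255 && sel < 16);
	const uint32_t w = kBC7Weights4[sel];
	return (uint8_t)((lo * (64 - w) + hi * w + 32) >> 6);
}

int main()
{
	assert(bc7_blend4(10, 250, 0) == 10);   // selector 0 -> low endpoint
	assert(bc7_blend4(10, 250, 15) == 250); // selector 15 -> high endpoint
	printf("%u\n", bc7_blend4(10, 250, 8)); // prints 138
	return 0;
}
```

The 2- and 3-bit variants only swap the weight table; the `+ 32` keeps the final shift a round-to-nearest rather than a truncation.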
bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels) { static_assert(sizeof(bc7_mode_6) == 16, "sizeof(bc7_mode_6) == 16"); @@ -388,7 +677,7 @@ namespace basisu color_rgba vals[16]; for (uint32_t i = 0; i < 16; i++) { - const uint32_t w = g_bc7_weights4[i]; + const uint32_t w = basist::g_bc7_weights4[i]; const uint32_t iw = 64 - w; vals[i].set_noclamp_rgba( (r0 * iw + r1 * w + 32) >> 6, @@ -420,183 +709,37 @@ namespace basisu return true; } - static inline uint32_t get_block_bits(const uint8_t* pBytes, uint32_t bit_ofs, uint32_t bits_wanted) + bool unpack_bc7(const void *pBlock, color_rgba *pPixels) { - assert(bits_wanted < 32); + const uint32_t first_byte = static_cast<const uint8_t*>(pBlock)[0]; - uint32_t v = 0; - uint32_t total_bits = 0; - - while (total_bits < bits_wanted) + for (uint32_t mode = 0; mode <= 7; mode++) { - uint32_t k = pBytes[bit_ofs >> 3]; - k >>= (bit_ofs & 7); - uint32_t num_bits_in_byte = 8 - (bit_ofs & 7); - - v |= (k << total_bits); - total_bits += num_bits_in_byte; - bit_ofs += num_bits_in_byte; - } - - return v & ((1 << bits_wanted) - 1); - } - - struct bc7_mode_5 - { - union - { - struct + if (first_byte & (1U << mode)) { - uint64_t m_mode : 6; - uint64_t m_rot : 2; - - uint64_t m_r0 : 7; - uint64_t m_r1 : 7; - uint64_t m_g0 : 7; - uint64_t m_g1 : 7; - uint64_t m_b0 : 7; - uint64_t m_b1 : 7; - uint64_t m_a0 : 8; - uint64_t m_a1_0 : 6; - - } m_lo; - - uint64_t m_lo_bits; - }; - - union - { - struct - { - uint64_t m_a1_1 : 2; - - // bit 2 - uint64_t m_c00 : 1; - uint64_t m_c10 : 2; - uint64_t m_c20 : 2; - uint64_t m_c30 : 2; - - uint64_t m_c01 : 2; - uint64_t m_c11 : 2; - uint64_t m_c21 : 2; - uint64_t m_c31 : 2; - - uint64_t m_c02 : 2; - uint64_t m_c12 : 2; - uint64_t m_c22 : 2; - uint64_t m_c32 : 2; - - uint64_t m_c03 : 2; - uint64_t m_c13 : 2; - uint64_t m_c23 : 2; - uint64_t m_c33 : 2; - - // bit 33 - uint64_t m_a00 : 1; - uint64_t m_a10 : 2; - uint64_t m_a20 : 2; - uint64_t m_a30 : 2; - - uint64_t m_a01 : 2; - uint64_t m_a11 : 2; - uint64_t m_a21 : 2; - uint64_t m_a31 : 2; - - uint64_t m_a02 : 2; - uint64_t m_a12 : 2; - uint64_t m_a22 : 2; - uint64_t m_a32 : 2; - - uint64_t m_a03 : 2; - uint64_t m_a13 : 2; - uint64_t m_a23 : 2; - uint64_t m_a33 : 2; - - } m_hi; - - uint64_t m_hi_bits; - }; - - color_rgba get_low_color() const - { - return color_rgba(cNoClamp, - (int)((m_lo.m_r0 << 1) | (m_lo.m_r0 >> 6)), - (int)((m_lo.m_g0 << 1) | (m_lo.m_g0 >> 6)), - (int)((m_lo.m_b0 << 1) | (m_lo.m_b0 >> 6)), - m_lo.m_a0); - } - - color_rgba get_high_color() const - { - return color_rgba(cNoClamp, - (int)((m_lo.m_r1 << 1) | (m_lo.m_r1 >> 6)), - (int)((m_lo.m_g1 << 1) | (m_lo.m_g1 >> 6)), - (int)((m_lo.m_b1 << 1) | (m_lo.m_b1 >> 6)), - (int)m_lo.m_a1_0 | ((int)m_hi.m_a1_1 << 6)); - } - - void get_block_colors(color_rgba* pColors) const - { - const color_rgba low_color(get_low_color()); - const color_rgba high_color(get_high_color()); - - for (uint32_t i = 0; i < 4; i++) - { - static const uint32_t s_bc7_weights2[4] = { 0, 21, 43, 64 }; - - pColors[i].set_noclamp_rgba( - (low_color.r * (64 - s_bc7_weights2[i]) + high_color.r * s_bc7_weights2[i] + 32) >> 6, - (low_color.g * (64 - s_bc7_weights2[i]) + high_color.g * s_bc7_weights2[i] + 32) >> 6, - (low_color.b * (64 - s_bc7_weights2[i]) + high_color.b * s_bc7_weights2[i] + 32) >> 6, - (low_color.a * (64 - s_bc7_weights2[i]) + high_color.a * s_bc7_weights2[i] + 32) >> 6); + switch (mode) + { + case 0: + case 2: + return unpack_bc7_mode0_2(mode, pBlock, pPixels); + case 1: + case 3: + case 7: + return 
unpack_bc7_mode1_3_7(mode, pBlock, pPixels); + case 4: + case 5: + return unpack_bc7_mode4_5(mode, pBlock, pPixels); + case 6: + return unpack_bc7_mode6(pBlock, pPixels); + default: + break; + } } - } - - uint32_t get_selector(uint32_t idx, bool alpha) const - { - const uint32_t size = (idx == 0) ? 1 : 2; - - uint32_t ofs = alpha ? 97 : 66; - - if (idx) - ofs += 1 + 2 * (idx - 1); - - return get_block_bits(reinterpret_cast<const uint8_t*>(this), ofs, size); - } - }; - - bool unpack_bc7_mode5(const void* pBlock_bits, color_rgba* pPixels) - { - static_assert(sizeof(bc7_mode_5) == 16, "sizeof(bc7_mode_5) == 16"); - - const bc7_mode_5& block = *static_cast<const bc7_mode_5*>(pBlock_bits); - - if (block.m_lo.m_mode != (1 << 5)) - return false; - - color_rgba block_colors[4]; - block.get_block_colors(block_colors); - - const uint32_t rot = block.m_lo.m_rot; - - for (uint32_t i = 0; i < 16; i++) - { - const uint32_t cs = block.get_selector(i, false); - - color_rgba c(block_colors[cs]); - - const uint32_t as = block.get_selector(i, true); - c.a = block_colors[as].a; - - if (rot > 0) - std::swap(c[3], c[rot - 1]); - - pPixels[i] = c; } - return true; + return false; } - + struct fxt1_block { union @@ -903,13 +1046,14 @@ namespace basisu etc2_eac_r11 m_c[2]; }; - static void unpack_etc2_eac_r(const etc2_eac_r11* p, color_rgba* pPixels, uint32_t c) + void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c) { - const uint64_t sels = p->get_sels(); + const etc2_eac_r11* pBlock = static_cast<const etc2_eac_r11*>(p); + const uint64_t sels = pBlock->get_sels(); - const int base = (int)p->m_base * 8 + 4; - const int mul = p->m_mul ? ((int)p->m_mul * 8) : 1; - const int table = (int)p->m_table; + const int base = (int)pBlock->m_base * 8 + 4; + const int mul = pBlock->m_mul ? ((int)pBlock->m_mul * 8) : 1; + const int table = (int)pBlock->m_table; for (uint32_t y = 0; y < 4; y++) { @@ -923,7 +1067,8 @@ namespace basisu val = clamp<int>(val, 0, 2047); // Convert to 8-bits with rounding - pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val * 255 + 1024) / 2047); + //pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val * 255 + 1024) / 2047); + pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val * 255 + 1023) / 2047); } // x } // y @@ -939,6 +1084,11 @@ namespace basisu } } + void unpack_uastc(const void* p, color_rgba* pPixels) + { + basist::unpack_uastc(*static_cast<const basist::uastc_block*>(p), (basist::color32 *)pPixels, false); + } + // Unpacks to RGBA, R, RG, or A bool unpack_block(texture_format fmt, const void* pBlock, color_rgba* pPixels) { @@ -949,6 +1099,16 @@ namespace basisu unpack_bc1(pBlock, pPixels, true); break; } + case texture_format::cBC1_NV: + { + unpack_bc1_nv(pBlock, pPixels, true); + break; + } + case texture_format::cBC1_AMD: + { + unpack_bc1_amd(pBlock, pPixels, true); + break; + } case texture_format::cBC3: { return unpack_bc3(pBlock, pPixels); @@ -966,14 +1126,7 @@ namespace basisu } case texture_format::cBC7: { - // We only support modes 5 and 6. 
- if (!unpack_bc7_mode5(pBlock, pPixels)) - { - if (!unpack_bc7_mode6(pBlock, pPixels)) - return false; - } - - break; + return unpack_bc7(pBlock, pPixels); } // Full ETC2 color blocks (planar/T/H modes) is currently unsupported in basisu, but we do support ETC2 with alpha (using ETC1 for color) case texture_format::cETC2_RGB: @@ -1032,6 +1185,11 @@ namespace basisu unpack_etc2_eac_rg(pBlock, pPixels); break; } + case texture_format::cUASTC4x4: + { + unpack_uastc(pBlock, pPixels); + break; + } default: { assert(0); @@ -1113,6 +1271,7 @@ namespace basisu KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02, KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0, KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR = 0x93D0, + KTX_COMPRESSED_RGBA_UASTC_4x4_KHR = 0x94CC, // TODO - Use proper value! KTX_ATC_RGB_AMD = 0x8C92, KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD = 0x87EE, KTX_COMPRESSED_RGB_FXT1_3DFX = 0x86B0, @@ -1143,7 +1302,7 @@ namespace basisu }; // Input is a texture array of mipmapped gpu_image's: gpu_images[array_index][level_index] - bool create_ktx_texture_file(uint8_vec &ktx_data, const std::vector<gpu_image_vec>& gpu_images, bool cubemap_flag) + bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag) { if (!gpu_images.size()) { @@ -1220,6 +1379,8 @@ namespace basisu switch (fmt) { case texture_format::cBC1: + case texture_format::cBC1_NV: + case texture_format::cBC1_AMD: { internal_fmt = KTX_COMPRESSED_RGB_S3TC_DXT1_EXT; break; @@ -1305,6 +1466,12 @@ namespace basisu base_internal_fmt = KTX_RG; break; } + case texture_format::cUASTC4x4: + { + internal_fmt = KTX_COMPRESSED_RGBA_UASTC_4x4_KHR; + base_internal_fmt = KTX_RGBA; + break; + } case texture_format::cFXT1_RGB: { internal_fmt = KTX_COMPRESSED_RGB_FXT1_3DFX; @@ -1378,7 +1545,7 @@ namespace basisu return true; } - bool write_compressed_texture_file(const char* pFilename, const std::vector<gpu_image_vec>& g, bool cubemap_flag) + bool write_compressed_texture_file(const char* pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag) { std::string extension(string_tolower(string_get_extension(pFilename))); @@ -1410,12 +1577,12 @@ namespace basisu bool write_compressed_texture_file(const char* pFilename, const gpu_image& g) { - std::vector<gpu_image_vec> v; + basisu::vector<gpu_image_vec> v; enlarge_vector(v, 1)->push_back(g); return write_compressed_texture_file(pFilename, v, false); } - const uint32_t OUT_FILE_MAGIC = 'TEXC'; + //const uint32_t OUT_FILE_MAGIC = 'TEXC'; struct out_file_header { packed_uint<4> m_magic; @@ -1428,7 +1595,11 @@ namespace basisu bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi) { out_file_header hdr; - hdr.m_magic = OUT_FILE_MAGIC; + //hdr.m_magic = OUT_FILE_MAGIC; + hdr.m_magic.m_bytes[0] = 67; + hdr.m_magic.m_bytes[1] = 88; + hdr.m_magic.m_bytes[2] = 69; + hdr.m_magic.m_bytes[3] = 84; hdr.m_pad = 0; hdr.m_width = gi.get_blocks_x() * 8; hdr.m_height = gi.get_blocks_y() * 4; diff --git a/thirdparty/basis_universal/basisu_gpu_texture.h b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h index 8a49757ca7..619926f5f9 100644 --- a/thirdparty/basis_universal/basisu_gpu_texture.h +++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h @@ -1,5 +1,5 @@ // basisu_gpu_texture.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -13,13 +13,12 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" #include "basisu_etc.h" namespace basisu { - // GPU texture image - + // GPU texture "image" class gpu_image { public: @@ -115,17 +114,17 @@ namespace basisu uint64_vec m_blocks; }; - typedef std::vector<gpu_image> gpu_image_vec; + typedef basisu::vector<gpu_image> gpu_image_vec; // KTX file writing - bool create_ktx_texture_file(uint8_vec &ktx_data, const std::vector<gpu_image_vec>& gpu_images, bool cubemap_flag); + bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag); - bool write_compressed_texture_file(const char *pFilename, const std::vector<gpu_image_vec>& g, bool cubemap_flag); + bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag); inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g) { - std::vector<gpu_image_vec> a; + basisu::vector<gpu_image_vec> a; a.push_back(g); return write_compressed_texture_file(pFilename, a, false); } @@ -133,22 +132,23 @@ namespace basisu bool write_compressed_texture_file(const char *pFilename, const gpu_image &g); bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi); - // GPU texture block unpacking + // GPU texture block unpacking void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels); bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha); void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride); bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels); void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels); bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels); - bool unpack_bc7_mode5(const void* pBlock_bits, color_rgba* pPixels); + bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels); void unpack_atc(const void* pBlock_bits, color_rgba* pPixels); bool unpack_fxt1(const void* p, color_rgba* pPixels); bool unpack_pvrtc2(const void* p, color_rgba* pPixels); + void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c); void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels); - // unpack_block() is only capable of unpacking texture data created by the transcoder. - // For some texture formats (like BC7, or ETC2) it's not a complete implementation. + // unpack_block() is primarily intended to unpack texture data created by the transcoder. + // For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not a complete implementation. bool unpack_block(texture_format fmt, const void *pBlock, color_rgba *pPixels); } // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h new file mode 100644 index 0000000000..e24bdd7978 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h @@ -0,0 +1,25 @@ +// basisu_kernels_declares.h +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if BASISU_SUPPORT_SSE +void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err); +void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err); + +void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err); +void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err); + +void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error); +void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error); +#endif diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h new file mode 100644 index 0000000000..046880517b --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h @@ -0,0 +1,584 @@ +// basisu_kernels_imp.h - Do not directly include +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
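The kernels that follow vectorize one fixed error metric; the scalar form below is lifted from their own non-SIMD tail loops and is handy as a reference when validating the SSE path. A self-contained restatement with hypothetical names:

```cpp
#include <cstdint>
#include <cstdio>

// Scalar reference for the perceptual RGB error accumulated by
// perceptual_distance_rgb_4_N: a luma/chroma-weighted squared
// difference in 7-bit fixed point.
static int64_t perceptual_err(int base_r, int base_g, int base_b, int r, int g, int b)
{
	const int dr = base_r - r, dg = base_g - g, db = base_b - b;

	const int delta_l  = dr * 27 + dg * 92 + db * 9; // approximate luma delta
	const int delta_cr = dr * 128 - delta_l;         // red-difference chroma
	const int delta_cb = db * 128 - delta_l;         // blue-difference chroma

	return ((delta_l * delta_l) >> 7) +
	       ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
	       ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
}

int main()
{
	// A green-only step is penalized far more heavily than an equal blue-only step.
	printf("green +8: %lld\n", (long long)perceptual_err(0, 8, 0, 0, 0, 0)); // 5190
	printf("blue  +8: %lld\n", (long long)perceptual_err(0, 0, 8, 0, 0, 0)); // 213
	return 0;
}
```

The kernels accumulate exactly this per-pixel quantity and bail out as soon as the running total reaches `early_out_err`.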
+ +using namespace CPPSPMD; + +namespace CPPSPMD_NAME(basisu_kernels_namespace) +{ + struct perceptual_distance_rgb_4_N : spmd_kernel + { + void _call(int64_t* pDistance, + const uint8_t* pSelectors, + const color_rgba* pBlock_colors, + const color_rgba* pSrc_pixels, uint32_t n, + int64_t early_out_err) + { + assert(early_out_err >= 0); + + *pDistance = 0; + + __m128i block_colors[4]; + vint block_colors_r[4], block_colors_g[4], block_colors_b[4]; + for (uint32_t i = 0; i < 4; i++) + { + block_colors[i] = load_rgba32(&pBlock_colors[i]); + store_all(block_colors_r[i], (int)pBlock_colors[i].r); + store_all(block_colors_g[i], (int)pBlock_colors[i].g); + store_all(block_colors_b[i], (int)pBlock_colors[i].b); + } + + uint32_t i; + for (i = 0; (i + 4) <= n; i += 4) + { + __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]); + + vint r, g, b, a; + transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3); + + int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3]; + + vint base_r, base_g, base_b, base_a; + if ((s0 == s1) && (s0 == s2) && (s0 == s3)) + { + store_all(base_r, block_colors_r[s0]); + store_all(base_g, block_colors_g[s0]); + store_all(base_b, block_colors_b[s0]); + } + else + { + __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3]; + transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3); + } + + vint dr = base_r - r; + vint dg = base_g - g; + vint db = base_b - b; + + vint delta_l = dr * 27 + dg * 92 + db * 9; + vint delta_cr = dr * 128 - delta_l; + vint delta_cb = db * 128 - delta_l; + + vint id = ((delta_l * delta_l) >> 7) + + ((((delta_cr * delta_cr) >> 7) * 26) >> 7) + + ((((delta_cb * delta_cb) >> 7) * 3) >> 7); + + *pDistance += reduce_add(id); + if (*pDistance >= early_out_err) + return; + } + + for (; i < n; i++) + { + int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + int sel = pSelectors[i]; + int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b; + + int dr = base_r - r; + int dg = base_g - g; + int db = base_b - b; + + int delta_l = dr * 27 + dg * 92 + db * 9; + int delta_cr = dr * 128 - delta_l; + int delta_cb = db * 128 - delta_l; + + int id = ((delta_l * delta_l) >> 7) + + ((((delta_cr * delta_cr) >> 7) * 26) >> 7) + + ((((delta_cb * delta_cb) >> 7) * 3) >> 7); + + *pDistance += id; + if (*pDistance >= early_out_err) + return; + } + } + }; + + struct linear_distance_rgb_4_N : spmd_kernel + { + void _call(int64_t* pDistance, + const uint8_t* pSelectors, + const color_rgba* pBlock_colors, + const color_rgba* pSrc_pixels, uint32_t n, + int64_t early_out_err) + { + assert(early_out_err >= 0); + + *pDistance = 0; + + __m128i block_colors[4]; + vint block_colors_r[4], block_colors_g[4], block_colors_b[4]; + for (uint32_t i = 0; i < 4; i++) + { + block_colors[i] = load_rgba32(&pBlock_colors[i]); + store_all(block_colors_r[i], (int)pBlock_colors[i].r); + store_all(block_colors_g[i], (int)pBlock_colors[i].g); + store_all(block_colors_b[i], (int)pBlock_colors[i].b); + } + + uint32_t i; + for (i = 0; (i + 4) <= n; i += 4) + { + __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]); + + vint r, g, b, a; + transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, 
c1, c2, c3); + + int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3]; + + vint base_r, base_g, base_b, base_a; + if ((s0 == s1) && (s0 == s2) && (s0 == s3)) + { + store_all(base_r, block_colors_r[s0]); + store_all(base_g, block_colors_g[s0]); + store_all(base_b, block_colors_b[s0]); + } + else + { + __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3]; + transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3); + } + + vint dr = base_r - r; + vint dg = base_g - g; + vint db = base_b - b; + + vint id = dr * dr + dg * dg + db * db; + + *pDistance += reduce_add(id); + if (*pDistance >= early_out_err) + return; + } + + for (; i < n; i++) + { + int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + int sel = pSelectors[i]; + int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b; + + int dr = base_r - r; + int dg = base_g - g; + int db = base_b - b; + + int id = dr * dr + dg * dg + db * db; + + *pDistance += id; + if (*pDistance >= early_out_err) + return; + } + } + }; + + struct find_selectors_perceptual_rgb_4_N : spmd_kernel + { + inline vint compute_dist( + const vint& base_r, const vint& base_g, const vint& base_b, + const vint& r, const vint& g, const vint& b) + { + vint dr = base_r - r; + vint dg = base_g - g; + vint db = base_b - b; + + vint delta_l = dr * 27 + dg * 92 + db * 9; + vint delta_cr = dr * 128 - delta_l; + vint delta_cb = db * 128 - delta_l; + + vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) + + VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) + + VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7); + + return id; + } + + void _call(int64_t* pDistance, + uint8_t* pSelectors, + const color_rgba* pBlock_colors, + const color_rgba* pSrc_pixels, uint32_t n, + int64_t early_out_err) + { + assert(early_out_err >= 0); + + *pDistance = 0; + + vint block_colors_r[4], block_colors_g[4], block_colors_b[4]; + for (uint32_t i = 0; i < 4; i++) + { + store_all(block_colors_r[i], (int)pBlock_colors[i].r); + store_all(block_colors_g[i], (int)pBlock_colors[i].g); + store_all(block_colors_b[i], (int)pBlock_colors[i].b); + } + + const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0); + + uint32_t i; + + for (i = 0; (i + 4) <= n; i += 4) + { + __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]); + + vint r, g, b, a; + transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3); + + vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b); + vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b); + vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b); + vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b); + + vint min_dist = min(min(min(dist0, dist1), dist2), dist3); + + vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3))); + + __m128i vsels = shuffle_epi8(sels.m_value, shuf); + storeu_si32((void *)(pSelectors + i), vsels); + + *pDistance += reduce_add(min_dist); + if (*pDistance >= early_out_err) + return; + } + + for (; i < n; i++) + { + int r = pSrc_pixels[i].r, g = 
pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + int best_err = INT_MAX, best_sel = 0; + for (int sel = 0; sel < 4; sel++) + { + int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b; + + int dr = base_r - r; + int dg = base_g - g; + int db = base_b - b; + + int delta_l = dr * 27 + dg * 92 + db * 9; + int delta_cr = dr * 128 - delta_l; + int delta_cb = db * 128 - delta_l; + + int id = ((delta_l * delta_l) >> 7) + + ((((delta_cr * delta_cr) >> 7) * 26) >> 7) + + ((((delta_cb * delta_cb) >> 7) * 3) >> 7); + if (id < best_err) + { + best_err = id; + best_sel = sel; + } + } + + pSelectors[i] = (uint8_t)best_sel; + + *pDistance += best_err; + if (*pDistance >= early_out_err) + return; + } + } + }; + + struct find_selectors_linear_rgb_4_N : spmd_kernel + { + inline vint compute_dist( + const vint& base_r, const vint& base_g, const vint& base_b, + const vint& r, const vint& g, const vint& b) + { + vint dr = base_r - r; + vint dg = base_g - g; + vint db = base_b - b; + + vint id = dr * dr + dg * dg + db * db; + return id; + } + + void _call(int64_t* pDistance, + uint8_t* pSelectors, + const color_rgba* pBlock_colors, + const color_rgba* pSrc_pixels, uint32_t n, + int64_t early_out_err) + { + assert(early_out_err >= 0); + + *pDistance = 0; + + vint block_colors_r[4], block_colors_g[4], block_colors_b[4]; + for (uint32_t i = 0; i < 4; i++) + { + store_all(block_colors_r[i], (int)pBlock_colors[i].r); + store_all(block_colors_g[i], (int)pBlock_colors[i].g); + store_all(block_colors_b[i], (int)pBlock_colors[i].b); + } + + const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0); + + uint32_t i; + + for (i = 0; (i + 4) <= n; i += 4) + { + __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]); + + vint r, g, b, a; + transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3); + + vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b); + vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b); + vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b); + vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b); + + vint min_dist = min(min(min(dist0, dist1), dist2), dist3); + + vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3))); + + __m128i vsels = shuffle_epi8(sels.m_value, shuf); + storeu_si32((void *)(pSelectors + i), vsels); + + *pDistance += reduce_add(min_dist); + if (*pDistance >= early_out_err) + return; + } + + for (; i < n; i++) + { + int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + int best_err = INT_MAX, best_sel = 0; + for (int sel = 0; sel < 4; sel++) + { + int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b; + + int dr = base_r - r; + int dg = base_g - g; + int db = base_b - b; + + int id = dr * dr + dg * dg + db * db; + if (id < best_err) + { + best_err = id; + best_sel = sel; + } + } + + pSelectors[i] = (uint8_t)best_sel; + + *pDistance += best_err; + if (*pDistance >= early_out_err) + return; + } + } + }; + + struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel + { + inline vint compute_dist( + const vint& base_r, const vint& base_g, const vint& base_b, + const vint& r, const vint& g, 
const vint& b) + { + vint dr = base_r - r; + vint dg = base_g - g; + vint db = base_b - b; + + vint delta_l = dr * 27 + dg * 92 + db * 9; + vint delta_cr = dr * 128 - delta_l; + vint delta_cb = db * 128 - delta_l; + + vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) + + VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) + + VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7); + + return id; + } + + void _call(int64_t* pDistance, + const color_rgba* pBlock_colors, + const color_rgba* pSrc_pixels, uint32_t n, + int64_t early_out_error) + { + assert(early_out_error >= 0); + + *pDistance = 0; + + vint block_colors_r[4], block_colors_g[4], block_colors_b[4]; + for (uint32_t i = 0; i < 4; i++) + { + store_all(block_colors_r[i], (int)pBlock_colors[i].r); + store_all(block_colors_g[i], (int)pBlock_colors[i].g); + store_all(block_colors_b[i], (int)pBlock_colors[i].b); + } + + uint32_t i; + + for (i = 0; (i + 4) <= n; i += 4) + { + __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]); + + vint r, g, b, a; + transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3); + + vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b); + vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b); + vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b); + vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b); + + vint min_dist = min(min(min(dist0, dist1), dist2), dist3); + + *pDistance += reduce_add(min_dist); + if (*pDistance > early_out_error) + return; + } + + for (; i < n; i++) + { + int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + int best_err = INT_MAX; + for (int sel = 0; sel < 4; sel++) + { + int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b; + + int dr = base_r - r; + int dg = base_g - g; + int db = base_b - b; + + int delta_l = dr * 27 + dg * 92 + db * 9; + int delta_cr = dr * 128 - delta_l; + int delta_cb = db * 128 - delta_l; + + int id = ((delta_l * delta_l) >> 7) + + ((((delta_cr * delta_cr) >> 7) * 26) >> 7) + + ((((delta_cb * delta_cb) >> 7) * 3) >> 7); + + if (id < best_err) + { + best_err = id; + } + } + + *pDistance += best_err; + if (*pDistance > early_out_error) + return; + } + } + }; + + struct find_lowest_error_linear_rgb_4_N : spmd_kernel + { + inline vint compute_dist( + const vint& base_r, const vint& base_g, const vint& base_b, + const vint& r, const vint& g, const vint& b) + { + vint dr = base_r - r; + vint dg = base_g - g; + vint db = base_b - b; + + vint id = dr * dr + dg * dg + db * db; + + return id; + } + + void _call(int64_t* pDistance, + const color_rgba* pBlock_colors, + const color_rgba* pSrc_pixels, uint32_t n, + int64_t early_out_error) + { + assert(early_out_error >= 0); + + *pDistance = 0; + + vint block_colors_r[4], block_colors_g[4], block_colors_b[4]; + for (uint32_t i = 0; i < 4; i++) + { + store_all(block_colors_r[i], (int)pBlock_colors[i].r); + store_all(block_colors_g[i], (int)pBlock_colors[i].g); + store_all(block_colors_b[i], (int)pBlock_colors[i].b); + } + + uint32_t i; + + for (i = 0; (i + 4) <= n; i += 4) + { + __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]); + + vint r, g, b, a; + 
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3); + + vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b); + vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b); + vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b); + vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b); + + vint min_dist = min(min(min(dist0, dist1), dist2), dist3); + + *pDistance += reduce_add(min_dist); + if (*pDistance > early_out_error) + return; + } + + for (; i < n; i++) + { + int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + int best_err = INT_MAX; + for (int sel = 0; sel < 4; sel++) + { + int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b; + + int dr = base_r - r; + int dg = base_g - g; + int db = base_b - b; + + int id = dr * dr + dg * dg + db * db; + + if (id < best_err) + { + best_err = id; + } + } + + *pDistance += best_err; + if (*pDistance > early_out_error) + return; + } + } + }; + +} // namespace + +using namespace CPPSPMD_NAME(basisu_kernels_namespace); + +void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err) +{ + spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err); +} + +void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err) +{ + spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err); +} + +void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err) +{ + spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err); +} + +void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err) +{ + spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err); +} + +void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error) +{ + spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error); +} + +void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error) +{ + spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error); +} + diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp new file mode 100644 index 0000000000..12d2321f20 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp @@ -0,0 +1,161 @@ +// basisu_kernels_sse.cpp +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "basisu_enc.h" + +#if BASISU_SUPPORT_SSE + +#define CPPSPMD_SSE2 0 + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +#if !defined(_MSC_VER) + #if __AVX__ || __AVX2__ || __AVX512F__ + #error Please check your compiler options + #endif + + #if CPPSPMD_SSE2 + #if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__ + #error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file + #endif + #else + #if !__SSE4_1__ || !__SSE3__ || __SSE4_2__ || !__SSSE3__ + #error Please check your compiler options + #endif + #endif +#endif + +#include "cppspmd_sse.h" + +#include "cppspmd_type_aliases.h" + +using namespace basisu; + +#include "basisu_kernels_declares.h" +#include "basisu_kernels_imp.h" + +namespace basisu +{ + +struct cpu_info +{ + cpu_info() { memset(this, 0, sizeof(*this)); } + + bool m_has_fpu; + bool m_has_mmx; + bool m_has_sse; + bool m_has_sse2; + bool m_has_sse3; + bool m_has_ssse3; + bool m_has_sse41; + bool m_has_sse42; + bool m_has_avx; + bool m_has_avx2; + bool m_has_pclmulqdq; +}; + +static void extract_x86_flags(cpu_info &info, uint32_t ecx, uint32_t edx) +{ + info.m_has_fpu = (edx & (1 << 0)) != 0; + info.m_has_mmx = (edx & (1 << 23)) != 0; + info.m_has_sse = (edx & (1 << 25)) != 0; + info.m_has_sse2 = (edx & (1 << 26)) != 0; + info.m_has_sse3 = (ecx & (1 << 0)) != 0; + info.m_has_ssse3 = (ecx & (1 << 9)) != 0; + info.m_has_sse41 = (ecx & (1 << 19)) != 0; + info.m_has_sse42 = (ecx & (1 << 20)) != 0; + info.m_has_pclmulqdq = (ecx & (1 << 1)) != 0; + info.m_has_avx = (ecx & (1 << 28)) != 0; +} + +static void extract_x86_extended_flags(cpu_info &info, uint32_t ebx) +{ + info.m_has_avx2 = (ebx & (1 << 5)) != 0; +} + +#ifndef _MSC_VER +static void do_cpuid(uint32_t eax, uint32_t ecx, uint32_t* regs) +{ + uint32_t ebx = 0, edx = 0; + +#if defined(__PIC__) && defined(__i386__) + __asm__("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx)); +#else + __asm__("cpuid;" : "+b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx)); +#endif + + regs[0] = eax; regs[1] = ebx; regs[2] = ecx; regs[3] = edx; +} +#endif + +static void get_cpuinfo(cpu_info &info) +{ + int regs[4]; + +#ifdef _MSC_VER + __cpuid(regs, 0); +#else + do_cpuid(0, 0, (uint32_t *)regs); +#endif + + const uint32_t max_eax = regs[0]; + + if (max_eax >= 1U) + { +#ifdef _MSC_VER + __cpuid(regs, 1); +#else + do_cpuid(1, 0, (uint32_t*)regs); +#endif + extract_x86_flags(info, regs[2], regs[3]); + } + + if (max_eax >= 7U) + { +#ifdef _MSC_VER + __cpuidex(regs, 7, 0); +#else + do_cpuid(7, 0, (uint32_t*)regs); +#endif + + extract_x86_extended_flags(info, regs[1]); + } +} + +void detect_sse41() +{ + cpu_info info; + get_cpuinfo(info); + + // Check for everything from SSE to SSE 4.1 + g_cpu_supports_sse41 = info.m_has_sse && info.m_has_sse2 && info.m_has_sse3 && info.m_has_ssse3 && info.m_has_sse41; +} + +} // namespace basisu +#else // #if BASISU_SUPPORT_SSE +namespace basisu +{ + +void detect_sse41() +{ +} + +} // namespace basisu 
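These kernels are gated twice: `BASISU_SUPPORT_SSE` decides at compile time whether the SSE4.1 path is built at all, and `detect_sse41()` probes CPUID at runtime to set `g_cpu_supports_sse41` (assumed here to be the extern flag declared by the encoder headers). A hypothetical call site, sketching how an embedder might wire this up:

```cpp
#include "basisu_enc.h" // assumed to declare basisu::detect_sse41() and g_cpu_supports_sse41

// Hypothetical one-time init: probe the CPU once, then report whether the
// vectorized kernels can actually be used on this machine.
static bool simd_kernels_usable()
{
	basisu::detect_sse41(); // no-op when built without BASISU_SUPPORT_SSE

#if BASISU_SUPPORT_SSE
	return basisu::g_cpu_supports_sse41; // runtime check: SSE through SSE4.1 all present
#else
	return false; // the SSE kernels were never compiled in
#endif
}
```

Both gates matter because a binary built with `BASISU_SUPPORT_SSE` can still land on a CPU without SSSE3/SSE4.1, in which case the encoder has to stay on the scalar paths.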
+#endif // #if BASISU_SUPPORT_SSE + diff --git a/thirdparty/basis_universal/encoder/basisu_miniz.h b/thirdparty/basis_universal/encoder/basisu_miniz.h new file mode 100644 index 0000000000..8627abe893 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_miniz.h @@ -0,0 +1,2514 @@ +/* miniz.c v1.15 - deflate/inflate, zlib-subset, ZIP reading/writing/appending, PNG writing + Implements RFC 1950: http://www.ietf.org/rfc/rfc1950.txt and RFC 1951: http://www.ietf.org/rfc/rfc1951.txt + + Forked from the public domain/unlicense version at: https://code.google.com/archive/p/miniz/ + + Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef MINIZ_HEADER_INCLUDED +#define MINIZ_HEADER_INCLUDED + +#include <stdlib.h> + +// Defines to completely disable specific portions of miniz.c: +// If all macros here are defined the only functionality remaining will be CRC-32, adler-32, tinfl, and tdefl. + +// Define MINIZ_NO_STDIO to disable all usage and any functions which rely on stdio for file I/O. +//#define MINIZ_NO_STDIO + +// If MINIZ_NO_TIME is specified then the ZIP archive functions will not be able to get the current time, or +// get/set file times, and the C run-time funcs that get/set times won't be called. +// The current downside is the times written to your archives will be from 1979. +//#define MINIZ_NO_TIME + +// Define MINIZ_NO_ARCHIVE_APIS to disable all ZIP archive API's. +//#define MINIZ_NO_ARCHIVE_APIS + +// Define MINIZ_NO_ARCHIVE_APIS to disable all writing related ZIP archive API's. +//#define MINIZ_NO_ARCHIVE_WRITING_APIS + +// Define MINIZ_NO_ZLIB_APIS to remove all ZLIB-style compression/decompression API's. +//#define MINIZ_NO_ZLIB_APIS + +// Define MINIZ_NO_ZLIB_COMPATIBLE_NAME to disable zlib names, to prevent conflicts against stock zlib. +//#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES + +// Define MINIZ_NO_MALLOC to disable all calls to malloc, free, and realloc. +// Note if MINIZ_NO_MALLOC is defined then the user must always provide custom user alloc/free/realloc +// callbacks to the zlib and archive API's, and a few stand-alone helper API's which don't provide custom user +// functions (such as tdefl_compress_mem_to_heap() and tinfl_decompress_mem_to_heap()) won't work. +//#define MINIZ_NO_MALLOC + +#if defined(__TINYC__) && (defined(__linux) || defined(__linux__)) + // TODO: Work around "error: include file 'sys\utime.h' when compiling with tcc on Linux + #define MINIZ_NO_TIME +#endif + +#if !defined(MINIZ_NO_TIME) && !defined(MINIZ_NO_ARCHIVE_APIS) + #include <time.h> +#endif + +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__i386) || defined(__i486__) || defined(__i486) || defined(i386) || defined(__ia64__) || defined(__x86_64__) +// MINIZ_X86_OR_X64_CPU is only used to help set the below macros. +#define MINIZ_X86_OR_X64_CPU 1 +#endif + +#if (__BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__) || MINIZ_X86_OR_X64_CPU +// Set MINIZ_LITTLE_ENDIAN to 1 if the processor is little endian. 
+#define MINIZ_LITTLE_ENDIAN 1 +#endif + +#if MINIZ_X86_OR_X64_CPU +// Set MINIZ_USE_UNALIGNED_LOADS_AND_STORES to 1 on CPU's that permit efficient integer loads and stores from unaligned addresses. +#define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 1 +#endif + +#if defined(_M_X64) || defined(_WIN64) || defined(__MINGW64__) || defined(_LP64) || defined(__LP64__) || defined(__ia64__) || defined(__x86_64__) +// Set MINIZ_HAS_64BIT_REGISTERS to 1 if operations on 64-bit integers are reasonably fast (and don't involve compiler generated calls to helper functions). +#define MINIZ_HAS_64BIT_REGISTERS 1 +#endif + +namespace buminiz { + +// ------------------- zlib-style API Definitions. + +// For more compatibility with zlib, miniz.c uses unsigned long for some parameters/struct members. Beware: mz_ulong can be either 32 or 64-bits! +typedef unsigned long mz_ulong; + +// mz_free() internally uses the MZ_FREE() macro (which by default calls free() unless you've modified the MZ_MALLOC macro) to release a block allocated from the heap. +void mz_free(void *p); + +#define MZ_ADLER32_INIT (1) +// mz_adler32() returns the initial adler-32 value to use when called with ptr==NULL. +mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len); + +#define MZ_CRC32_INIT (0) +// mz_crc32() returns the initial CRC-32 value to use when called with ptr==NULL. +mz_ulong mz_crc32(mz_ulong crc, const unsigned char *ptr, size_t buf_len); + +// Compression strategies. +enum { MZ_DEFAULT_STRATEGY = 0, MZ_FILTERED = 1, MZ_HUFFMAN_ONLY = 2, MZ_RLE = 3, MZ_FIXED = 4 }; + +// Method +#define MZ_DEFLATED 8 + +#ifndef MINIZ_NO_ZLIB_APIS + +// Heap allocation callbacks. +// Note that mz_alloc_func parameter types purpsosely differ from zlib's: items/size is size_t, not unsigned long. +typedef void *(*mz_alloc_func)(void *opaque, size_t items, size_t size); +typedef void (*mz_free_func)(void *opaque, void *address); +typedef void *(*mz_realloc_func)(void *opaque, void *address, size_t items, size_t size); + +#define MZ_VERSION "9.1.15" +#define MZ_VERNUM 0x91F0 +#define MZ_VER_MAJOR 9 +#define MZ_VER_MINOR 1 +#define MZ_VER_REVISION 15 +#define MZ_VER_SUBREVISION 0 + +// Flush values. For typical usage you only need MZ_NO_FLUSH and MZ_FINISH. The other values are for advanced use (refer to the zlib docs). +enum { MZ_NO_FLUSH = 0, MZ_PARTIAL_FLUSH = 1, MZ_SYNC_FLUSH = 2, MZ_FULL_FLUSH = 3, MZ_FINISH = 4, MZ_BLOCK = 5 }; + +// Return status codes. MZ_PARAM_ERROR is non-standard. +enum { MZ_OK = 0, MZ_STREAM_END = 1, MZ_NEED_DICT = 2, MZ_ERRNO = -1, MZ_STREAM_ERROR = -2, MZ_DATA_ERROR = -3, MZ_MEM_ERROR = -4, MZ_BUF_ERROR = -5, MZ_VERSION_ERROR = -6, MZ_PARAM_ERROR = -10000 }; + +// Compression levels: 0-9 are the standard zlib-style levels, 10 is best possible compression (not zlib compatible, and may be very slow), MZ_DEFAULT_COMPRESSION=MZ_DEFAULT_LEVEL. +enum { MZ_NO_COMPRESSION = 0, MZ_BEST_SPEED = 1, MZ_BEST_COMPRESSION = 9, MZ_UBER_COMPRESSION = 10, MZ_DEFAULT_LEVEL = 6, MZ_DEFAULT_COMPRESSION = -1 }; + +// Window bits +#define MZ_DEFAULT_WINDOW_BITS 15 + +struct mz_internal_state; + +// Compression/decompression stream struct. 
+typedef struct mz_stream_s +{ + const unsigned char *next_in; // pointer to next byte to read + unsigned int avail_in; // number of bytes available at next_in + mz_ulong total_in; // total number of bytes consumed so far + + unsigned char *next_out; // pointer to next byte to write + unsigned int avail_out; // number of bytes that can be written to next_out + mz_ulong total_out; // total number of bytes produced so far + + char *msg; // error msg (unused) + struct mz_internal_state *state; // internal state, allocated by zalloc/zfree + + mz_alloc_func zalloc; // optional heap allocation function (defaults to malloc) + mz_free_func zfree; // optional heap free function (defaults to free) + void *opaque; // heap alloc function user pointer + + int data_type; // data_type (unused) + mz_ulong adler; // adler32 of the source or uncompressed data + mz_ulong reserved; // not used +} mz_stream; + +typedef mz_stream *mz_streamp; + +// Returns the version string of miniz.c. +const char *mz_version(void); + +// mz_deflateInit() initializes a compressor with default options: +// Parameters: +// pStream must point to an initialized mz_stream struct. +// level must be between [MZ_NO_COMPRESSION, MZ_BEST_COMPRESSION]. +// level 1 enables a specially optimized compression function that's been optimized purely for performance, not ratio. +// (This special func. is currently only enabled when MINIZ_USE_UNALIGNED_LOADS_AND_STORES and MINIZ_LITTLE_ENDIAN are defined.) +// Return values: +// MZ_OK on success. +// MZ_STREAM_ERROR if the stream is bogus. +// MZ_PARAM_ERROR if the input parameters are bogus. +// MZ_MEM_ERROR on out of memory. +int mz_deflateInit(mz_streamp pStream, int level); + +// mz_deflateInit2() is like mz_deflate(), except with more control: +// Additional parameters: +// method must be MZ_DEFLATED +// window_bits must be MZ_DEFAULT_WINDOW_BITS (to wrap the deflate stream with zlib header/adler-32 footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate/no header or footer) +// mem_level must be between [1, 9] (it's checked but ignored by miniz.c) +int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits, int mem_level, int strategy); + +// Quickly resets a compressor without having to reallocate anything. Same as calling mz_deflateEnd() followed by mz_deflateInit()/mz_deflateInit2(). +int mz_deflateReset(mz_streamp pStream); + +// mz_deflate() compresses the input to output, consuming as much of the input and producing as much output as possible. +// Parameters: +// pStream is the stream to read from and write to. You must initialize/update the next_in, avail_in, next_out, and avail_out members. +// flush may be MZ_NO_FLUSH, MZ_PARTIAL_FLUSH/MZ_SYNC_FLUSH, MZ_FULL_FLUSH, or MZ_FINISH. +// Return values: +// MZ_OK on success (when flushing, or if more input is needed but not available, and/or there's more output to be written but the output buffer is full). +// MZ_STREAM_END if all input has been consumed and all output bytes have been written. Don't call mz_deflate() on the stream anymore. +// MZ_STREAM_ERROR if the stream is bogus. +// MZ_PARAM_ERROR if one of the parameters is invalid. +// MZ_BUF_ERROR if no forward progress is possible because the input and/or output buffers are empty. (Fill up the input buffer or free up some output space and try again.) +int mz_deflate(mz_streamp pStream, int flush); + +// mz_deflateEnd() deinitializes a compressor: +// Return values: +// MZ_OK on success. +// MZ_STREAM_ERROR if the stream is bogus. 
+int mz_deflateEnd(mz_streamp pStream); + +// mz_deflateBound() returns a (very) conservative upper bound on the amount of data that could be generated by deflate(), assuming flush is set to only MZ_NO_FLUSH or MZ_FINISH. +mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len); + +// Single-call compression functions mz_compress() and mz_compress2(): +// Returns MZ_OK on success, or one of the error codes from mz_deflate() on failure. +int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len); +int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len, int level); + +// mz_compressBound() returns a (very) conservative upper bound on the amount of data that could be generated by calling mz_compress(). +mz_ulong mz_compressBound(mz_ulong source_len); + +// Initializes a decompressor. +int mz_inflateInit(mz_streamp pStream); + +// mz_inflateInit2() is like mz_inflateInit() with an additional option that controls the window size and whether or not the stream has been wrapped with a zlib header/footer: +// window_bits must be MZ_DEFAULT_WINDOW_BITS (to parse zlib header/footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate). +int mz_inflateInit2(mz_streamp pStream, int window_bits); + +// Decompresses the input stream to the output, consuming only as much of the input as needed, and writing as much to the output as possible. +// Parameters: +// pStream is the stream to read from and write to. You must initialize/update the next_in, avail_in, next_out, and avail_out members. +// flush may be MZ_NO_FLUSH, MZ_SYNC_FLUSH, or MZ_FINISH. +// On the first call, if flush is MZ_FINISH it's assumed the input and output buffers are both sized large enough to decompress the entire stream in a single call (this is slightly faster). +// MZ_FINISH implies that there are no more source bytes available beside what's already in the input buffer, and that the output buffer is large enough to hold the rest of the decompressed data. +// Return values: +// MZ_OK on success. Either more input is needed but not available, and/or there's more output to be written but the output buffer is full. +// MZ_STREAM_END if all needed input has been consumed and all output bytes have been written. For zlib streams, the adler-32 of the decompressed data has also been verified. +// MZ_STREAM_ERROR if the stream is bogus. +// MZ_DATA_ERROR if the deflate stream is invalid. +// MZ_PARAM_ERROR if one of the parameters is invalid. +// MZ_BUF_ERROR if no forward progress is possible because the input buffer is empty but the inflater needs more input to continue, or if the output buffer is not large enough. Call mz_inflate() again +// with more input data, or with more room in the output buffer (except when using single call decompression, described above). +int mz_inflate(mz_streamp pStream, int flush); + +// Deinitializes a decompressor. +int mz_inflateEnd(mz_streamp pStream); + +// Single-call decompression. +// Returns MZ_OK on success, or one of the error codes from mz_inflate() on failure. +int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len); + +// Returns a string description of the specified error code, or NULL if the error code is invalid. +const char *mz_error(int err); + +// Redefine zlib-compatible names to miniz equivalents, so miniz.c can be used as a drop-in replacement for the subset of zlib that miniz.c supports. 
+// Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib-compatibility if you use zlib in the same project. +#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES + typedef unsigned char Byte; + typedef unsigned int uInt; + typedef mz_ulong uLong; + typedef Byte Bytef; + typedef uInt uIntf; + typedef char charf; + typedef int intf; + typedef void *voidpf; + typedef uLong uLongf; + typedef void *voidp; + typedef void *const voidpc; + #define Z_NULL 0 + #define Z_NO_FLUSH MZ_NO_FLUSH + #define Z_PARTIAL_FLUSH MZ_PARTIAL_FLUSH + #define Z_SYNC_FLUSH MZ_SYNC_FLUSH + #define Z_FULL_FLUSH MZ_FULL_FLUSH + #define Z_FINISH MZ_FINISH + #define Z_BLOCK MZ_BLOCK + #define Z_OK MZ_OK + #define Z_STREAM_END MZ_STREAM_END + #define Z_NEED_DICT MZ_NEED_DICT + #define Z_ERRNO MZ_ERRNO + #define Z_STREAM_ERROR MZ_STREAM_ERROR + #define Z_DATA_ERROR MZ_DATA_ERROR + #define Z_MEM_ERROR MZ_MEM_ERROR + #define Z_BUF_ERROR MZ_BUF_ERROR + #define Z_VERSION_ERROR MZ_VERSION_ERROR + #define Z_PARAM_ERROR MZ_PARAM_ERROR + #define Z_NO_COMPRESSION MZ_NO_COMPRESSION + #define Z_BEST_SPEED MZ_BEST_SPEED + #define Z_BEST_COMPRESSION MZ_BEST_COMPRESSION + #define Z_DEFAULT_COMPRESSION MZ_DEFAULT_COMPRESSION + #define Z_DEFAULT_STRATEGY MZ_DEFAULT_STRATEGY + #define Z_FILTERED MZ_FILTERED + #define Z_HUFFMAN_ONLY MZ_HUFFMAN_ONLY + #define Z_RLE MZ_RLE + #define Z_FIXED MZ_FIXED + #define Z_DEFLATED MZ_DEFLATED + #define Z_DEFAULT_WINDOW_BITS MZ_DEFAULT_WINDOW_BITS + #define alloc_func mz_alloc_func + #define free_func mz_free_func + #define internal_state mz_internal_state + #define z_stream mz_stream + #define deflateInit mz_deflateInit + #define deflateInit2 mz_deflateInit2 + #define deflateReset mz_deflateReset + #define deflate mz_deflate + #define deflateEnd mz_deflateEnd + #define deflateBound mz_deflateBound + #define compress mz_compress + #define compress2 mz_compress2 + #define compressBound mz_compressBound + #define inflateInit mz_inflateInit + #define inflateInit2 mz_inflateInit2 + #define inflate mz_inflate + #define inflateEnd mz_inflateEnd + #define uncompress mz_uncompress + #define crc32 mz_crc32 + #define adler32 mz_adler32 + #define MAX_WBITS 15 + #define MAX_MEM_LEVEL 9 + #define zError mz_error + #define ZLIB_VERSION MZ_VERSION + #define ZLIB_VERNUM MZ_VERNUM + #define ZLIB_VER_MAJOR MZ_VER_MAJOR + #define ZLIB_VER_MINOR MZ_VER_MINOR + #define ZLIB_VER_REVISION MZ_VER_REVISION + #define ZLIB_VER_SUBREVISION MZ_VER_SUBREVISION + #define zlibVersion mz_version + #define zlib_version mz_version() +#endif // #ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES + +#endif // MINIZ_NO_ZLIB_APIS + +// ------------------- Types and macros + +typedef unsigned char mz_uint8; +typedef signed short mz_int16; +typedef unsigned short mz_uint16; +typedef unsigned int mz_uint32; +typedef unsigned int mz_uint; +typedef long long mz_int64; +typedef unsigned long long mz_uint64; +typedef int mz_bool; + +#define MZ_FALSE (0) +#define MZ_TRUE (1) + +// An attempt to work around MSVC's spammy "warning C4127: conditional expression is constant" message. +#ifdef _MSC_VER + #define MZ_MACRO_END while (0, 0) +#else + #define MZ_MACRO_END while (0) +#endif + +// ------------------- Low-level Decompression API Definitions + +// Decompression flags used by tinfl_decompress(). +// TINFL_FLAG_PARSE_ZLIB_HEADER: If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream. 
+// TINFL_FLAG_HAS_MORE_INPUT: If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input. +// TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF: If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB). +// TINFL_FLAG_COMPUTE_ADLER32: Force adler-32 checksum computation of the decompressed bytes. +enum +{ + TINFL_FLAG_PARSE_ZLIB_HEADER = 1, + TINFL_FLAG_HAS_MORE_INPUT = 2, + TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF = 4, + TINFL_FLAG_COMPUTE_ADLER32 = 8 +}; + +// High level decompression functions: +// tinfl_decompress_mem_to_heap() decompresses a block in memory to a heap block allocated via malloc(). +// On entry: +// pSrc_buf, src_buf_len: Pointer and size of the Deflate or zlib source data to decompress. +// On return: +// Function returns a pointer to the decompressed data, or NULL on failure. +// *pOut_len will be set to the decompressed data's size, which could be larger than src_buf_len on uncompressible data. +// The caller must call mz_free() on the returned block when it's no longer needed. +void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags); + +// tinfl_decompress_mem_to_mem() decompresses a block in memory to another block in memory. +// Returns TINFL_DECOMPRESS_MEM_TO_MEM_FAILED on failure, or the number of bytes written on success. +#define TINFL_DECOMPRESS_MEM_TO_MEM_FAILED ((size_t)(-1)) +size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags); + +// tinfl_decompress_mem_to_callback() decompresses a block in memory to an internal 32KB buffer, and a user provided callback function will be called to flush the buffer. +// Returns 1 on success or 0 on failure. +typedef int (*tinfl_put_buf_func_ptr)(const void* pBuf, int len, void *pUser); +int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags); + +struct tinfl_decompressor_tag; typedef struct tinfl_decompressor_tag tinfl_decompressor; + +// Max size of LZ dictionary. +#define TINFL_LZ_DICT_SIZE 32768 + +// Return status. +typedef enum +{ + TINFL_STATUS_BAD_PARAM = -3, + TINFL_STATUS_ADLER32_MISMATCH = -2, + TINFL_STATUS_FAILED = -1, + TINFL_STATUS_DONE = 0, + TINFL_STATUS_NEEDS_MORE_INPUT = 1, + TINFL_STATUS_HAS_MORE_OUTPUT = 2 +} tinfl_status; + +// Initializes the decompressor to its initial state. +#define tinfl_init(r) do { (r)->m_state = 0; } MZ_MACRO_END +#define tinfl_get_adler32(r) (r)->m_check_adler32 + +// Main low-level decompressor coroutine function. This is the only function actually needed for decompression. All the other functions are just high-level helpers for improved usability. +// This is a universal API, i.e. it can be used as a building block to build any desired higher level decompression API. In the limit case, it can be called once per every byte input or output. +tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags); + +// Internal/private bits follow. 
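+// Illustrative sketch (not from upstream miniz; excluded from the build with #if 0). It shows the
+// high-level helper declared above decompressing a complete zlib stream held in memory; the function
+// name and parameters are hypothetical.
+#if 0
+static void example_tinfl_mem_to_heap(const void *pCompressed, size_t compressed_len)
+{
+ size_t decompressed_len = 0;
+ // TINFL_FLAG_PARSE_ZLIB_HEADER: the input is assumed to be zlib-wrapped, not raw deflate.
+ void *pDecompressed = tinfl_decompress_mem_to_heap(pCompressed, compressed_len, &decompressed_len, TINFL_FLAG_PARSE_ZLIB_HEADER);
+ if (pDecompressed)
+ {
+ // ... use decompressed_len bytes at pDecompressed ...
+ mz_free(pDecompressed); // the helper allocates with malloc(); release via mz_free()
+ }
+}
+#endif
+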
+enum +{ + TINFL_MAX_HUFF_TABLES = 3, TINFL_MAX_HUFF_SYMBOLS_0 = 288, TINFL_MAX_HUFF_SYMBOLS_1 = 32, TINFL_MAX_HUFF_SYMBOLS_2 = 19, + TINFL_FAST_LOOKUP_BITS = 10, TINFL_FAST_LOOKUP_SIZE = 1 << TINFL_FAST_LOOKUP_BITS +}; + +typedef struct +{ + mz_uint8 m_code_size[TINFL_MAX_HUFF_SYMBOLS_0]; + mz_int16 m_look_up[TINFL_FAST_LOOKUP_SIZE], m_tree[TINFL_MAX_HUFF_SYMBOLS_0 * 2]; +} tinfl_huff_table; + +#if MINIZ_HAS_64BIT_REGISTERS + #define TINFL_USE_64BIT_BITBUF 1 +#endif + +#if TINFL_USE_64BIT_BITBUF + typedef mz_uint64 tinfl_bit_buf_t; + #define TINFL_BITBUF_SIZE (64) +#else + typedef mz_uint32 tinfl_bit_buf_t; + #define TINFL_BITBUF_SIZE (32) +#endif + +struct tinfl_decompressor_tag +{ + mz_uint32 m_state, m_num_bits, m_zhdr0, m_zhdr1, m_z_adler32, m_final, m_type, m_check_adler32, m_dist, m_counter, m_num_extra, m_table_sizes[TINFL_MAX_HUFF_TABLES]; + tinfl_bit_buf_t m_bit_buf; + size_t m_dist_from_out_buf_start; + tinfl_huff_table m_tables[TINFL_MAX_HUFF_TABLES]; + mz_uint8 m_raw_header[4], m_len_codes[TINFL_MAX_HUFF_SYMBOLS_0 + TINFL_MAX_HUFF_SYMBOLS_1 + 137]; +}; + +// ------------------- Low-level Compression API Definitions + +// Set TDEFL_LESS_MEMORY to 1 to use less memory (compression will be slightly slower, and raw/dynamic blocks will be output more frequently). +#define TDEFL_LESS_MEMORY 0 + +// tdefl_init() compression flags logically OR'd together (low 12 bits contain the max. number of probes per dictionary search): +// TDEFL_DEFAULT_MAX_PROBES: The compressor defaults to 128 dictionary probes per dictionary search. 0=Huffman only, 1=Huffman+LZ (fastest/crap compression), 4095=Huffman+LZ (slowest/best compression). +enum +{ + TDEFL_HUFFMAN_ONLY = 0, TDEFL_DEFAULT_MAX_PROBES = 128, TDEFL_MAX_PROBES_MASK = 0xFFF +}; + +// TDEFL_WRITE_ZLIB_HEADER: If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data. +// TDEFL_COMPUTE_ADLER32: Always compute the adler-32 of the input data (even when not writing zlib headers). +// TDEFL_GREEDY_PARSING_FLAG: Set to use faster greedy parsing, instead of more efficient lazy parsing. +// TDEFL_NONDETERMINISTIC_PARSING_FLAG: Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory). +// TDEFL_RLE_MATCHES: Only look for RLE matches (matches with a distance of 1) +// TDEFL_FILTER_MATCHES: Discards matches <= 5 chars if enabled. +// TDEFL_FORCE_ALL_STATIC_BLOCKS: Disable usage of optimized Huffman tables. +// TDEFL_FORCE_ALL_RAW_BLOCKS: Only use raw (uncompressed) deflate blocks. +// The low 12 bits are reserved to control the max # of hash probes per dictionary lookup (see TDEFL_MAX_PROBES_MASK). +enum +{ + TDEFL_WRITE_ZLIB_HEADER = 0x01000, + TDEFL_COMPUTE_ADLER32 = 0x02000, + TDEFL_GREEDY_PARSING_FLAG = 0x04000, + TDEFL_NONDETERMINISTIC_PARSING_FLAG = 0x08000, + TDEFL_RLE_MATCHES = 0x10000, + TDEFL_FILTER_MATCHES = 0x20000, + TDEFL_FORCE_ALL_STATIC_BLOCKS = 0x40000, + TDEFL_FORCE_ALL_RAW_BLOCKS = 0x80000 +}; + +// High level compression functions: +// tdefl_compress_mem_to_heap() compresses a block in memory to a heap block allocated via malloc(). +// On entry: +// pSrc_buf, src_buf_len: Pointer and size of source block to compress. +// flags: The max match finder probes (default is 128) logically OR'd against the above flags. Higher probes are slower but improve compression. 
+// On return: +// Function returns a pointer to the compressed data, or NULL on failure. +// *pOut_len will be set to the compressed data's size, which could be larger than src_buf_len on uncompressible data. +// The caller must free() the returned block when it's no longer needed. +void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags); + +// tdefl_compress_mem_to_mem() compresses a block in memory to another block in memory. +// Returns 0 on failure. +size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags); + +// Compresses an image to a compressed PNG file in memory. +// On entry: +// pImage, w, h, and num_chans describe the image to compress. num_chans may be 1, 2, 3, or 4. +// The image pitch in bytes per scanline will be w*num_chans. The leftmost pixel on the top scanline is stored first in memory. +// level may range from [0,10], use MZ_NO_COMPRESSION, MZ_BEST_SPEED, MZ_BEST_COMPRESSION, etc. or a decent default is MZ_DEFAULT_LEVEL +// If flip is true, the image will be flipped on the Y axis (useful for OpenGL apps). +// On return: +// Function returns a pointer to the compressed data, or NULL on failure. +// *pLen_out will be set to the size of the PNG image file. +// The caller must mz_free() the returned heap block (which will typically be larger than *pLen_out) when it's no longer needed. +void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w, int h, int num_chans, size_t *pLen_out, mz_uint level, mz_bool flip); +void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h, int num_chans, size_t *pLen_out); + +// Output stream interface. The compressor uses this interface to write compressed data. It'll typically be called TDEFL_OUT_BUF_SIZE at a time. +typedef mz_bool (*tdefl_put_buf_func_ptr)(const void* pBuf, int len, void *pUser); + +// tdefl_compress_mem_to_output() compresses a block to an output stream. The above helpers use this function internally. +mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags); + +enum { TDEFL_MAX_HUFF_TABLES = 3, TDEFL_MAX_HUFF_SYMBOLS_0 = 288, TDEFL_MAX_HUFF_SYMBOLS_1 = 32, TDEFL_MAX_HUFF_SYMBOLS_2 = 19, TDEFL_LZ_DICT_SIZE = 32768, TDEFL_LZ_DICT_SIZE_MASK = TDEFL_LZ_DICT_SIZE - 1, TDEFL_MIN_MATCH_LEN = 3, TDEFL_MAX_MATCH_LEN = 258 }; + +// TDEFL_OUT_BUF_SIZE MUST be large enough to hold a single entire compressed output block (using static/fixed Huffman codes). +#if TDEFL_LESS_MEMORY +enum { TDEFL_LZ_CODE_BUF_SIZE = 24 * 1024, TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13 ) / 10, TDEFL_MAX_HUFF_SYMBOLS = 288, TDEFL_LZ_HASH_BITS = 12, TDEFL_LEVEL1_HASH_SIZE_MASK = 4095, TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3, TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS }; +#else +enum { TDEFL_LZ_CODE_BUF_SIZE = 64 * 1024, TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13 ) / 10, TDEFL_MAX_HUFF_SYMBOLS = 288, TDEFL_LZ_HASH_BITS = 15, TDEFL_LEVEL1_HASH_SIZE_MASK = 4095, TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3, TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS }; +#endif + +// The low-level tdefl functions below may be used directly if the above helper functions aren't flexible enough. The low-level functions don't make any heap allocations, unlike the above helper functions. 
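+// Illustrative sketch (not from upstream miniz; excluded from the build with #if 0). It shows the
+// one-shot helper documented above compressing a memory block to a heap block with a zlib wrapper and
+// the default probe count; the function name and parameters are hypothetical.
+#if 0
+static void example_tdefl_mem_to_heap(const void *pData, size_t data_len)
+{
+ size_t compressed_len = 0;
+ // Low 12 bits = max dictionary probes; OR in TDEFL_WRITE_ZLIB_HEADER to emit a zlib header/adler-32.
+ int flags = TDEFL_DEFAULT_MAX_PROBES | TDEFL_WRITE_ZLIB_HEADER;
+ void *pCompressed = tdefl_compress_mem_to_heap(pData, data_len, &compressed_len, flags);
+ if (pCompressed)
+ {
+ // ... use compressed_len bytes at pCompressed ...
+ free(pCompressed); // per the note above, the returned heap block is released with free()
+ }
+}
+#endif
+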
+typedef enum +{ + TDEFL_STATUS_BAD_PARAM = -2, + TDEFL_STATUS_PUT_BUF_FAILED = -1, + TDEFL_STATUS_OKAY = 0, + TDEFL_STATUS_DONE = 1, +} tdefl_status; + +// Must map to MZ_NO_FLUSH, MZ_SYNC_FLUSH, etc. enums +typedef enum +{ + TDEFL_NO_FLUSH = 0, + TDEFL_SYNC_FLUSH = 2, + TDEFL_FULL_FLUSH = 3, + TDEFL_FINISH = 4 +} tdefl_flush; + +// tdefl's compression state structure. +typedef struct +{ + tdefl_put_buf_func_ptr m_pPut_buf_func; + void *m_pPut_buf_user; + mz_uint m_flags, m_max_probes[2]; + int m_greedy_parsing; + mz_uint m_adler32, m_lookahead_pos, m_lookahead_size, m_dict_size; + mz_uint8 *m_pLZ_code_buf, *m_pLZ_flags, *m_pOutput_buf, *m_pOutput_buf_end; + mz_uint m_num_flags_left, m_total_lz_bytes, m_lz_code_buf_dict_pos, m_bits_in, m_bit_buffer; + mz_uint m_saved_match_dist, m_saved_match_len, m_saved_lit, m_output_flush_ofs, m_output_flush_remaining, m_finished, m_block_index, m_wants_to_finish; + tdefl_status m_prev_return_status; + const void *m_pIn_buf; + void *m_pOut_buf; + size_t *m_pIn_buf_size, *m_pOut_buf_size; + tdefl_flush m_flush; + const mz_uint8 *m_pSrc; + size_t m_src_buf_left, m_out_buf_ofs; + mz_uint8 m_dict[TDEFL_LZ_DICT_SIZE + TDEFL_MAX_MATCH_LEN - 1]; + mz_uint16 m_huff_count[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS]; + mz_uint16 m_huff_codes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS]; + mz_uint8 m_huff_code_sizes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS]; + mz_uint8 m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE]; + mz_uint16 m_next[TDEFL_LZ_DICT_SIZE]; + mz_uint16 m_hash[TDEFL_LZ_HASH_SIZE]; + mz_uint8 m_output_buf[TDEFL_OUT_BUF_SIZE]; +} tdefl_compressor; + +// Initializes the compressor. +// There is no corresponding deinit() function because the tdefl API's do not dynamically allocate memory. +// pBut_buf_func: If NULL, output data will be supplied to the specified callback. In this case, the user should call the tdefl_compress_buffer() API for compression. +// If pBut_buf_func is NULL the user should always call the tdefl_compress() API. +// flags: See the above enums (TDEFL_HUFFMAN_ONLY, TDEFL_WRITE_ZLIB_HEADER, etc.) +tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags); + +// Compresses a block of data, consuming as much of the specified input buffer as possible, and writing as much compressed data to the specified output buffer as possible. +tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush); + +// tdefl_compress_buffer() is only usable when the tdefl_init() is called with a non-NULL tdefl_put_buf_func_ptr. +// tdefl_compress_buffer() always consumes the entire input buffer. +tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush); + +tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d); +mz_uint32 tdefl_get_adler32(tdefl_compressor *d); + +// Can't use tdefl_create_comp_flags_from_zip_params if MINIZ_NO_ZLIB_APIS isn't defined, because it uses some of its macros. +#ifndef MINIZ_NO_ZLIB_APIS +// Create tdefl_compress() flags given zlib-style compression parameters. 
+// level may range from [0,10] (where 10 is absolute max compression, but may be much slower on some files) +// window_bits may be -15 (raw deflate) or 15 (zlib) +// strategy may be either MZ_DEFAULT_STRATEGY, MZ_FILTERED, MZ_HUFFMAN_ONLY, MZ_RLE, or MZ_FIXED +mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy); +#endif // #ifndef MINIZ_NO_ZLIB_APIS + +} // namespace buminiz + +#endif // MINIZ_HEADER_INCLUDED + +// ------------------- End of Header: Implementation follows. (If you only want the header, define MINIZ_HEADER_FILE_ONLY.) + +#ifndef MINIZ_HEADER_FILE_ONLY + +#include <string.h> +#include <assert.h> + +namespace buminiz { + +typedef unsigned char mz_validate_uint16[sizeof(mz_uint16)==2 ? 1 : -1]; +typedef unsigned char mz_validate_uint32[sizeof(mz_uint32)==4 ? 1 : -1]; +typedef unsigned char mz_validate_uint64[sizeof(mz_uint64)==8 ? 1 : -1]; + +#define MZ_ASSERT(x) assert(x) + +#ifdef MINIZ_NO_MALLOC + #define MZ_MALLOC(x) NULL + #define MZ_FREE(x) (void)x, ((void)0) + #define MZ_REALLOC(p, x) NULL +#else + #define MZ_MALLOC(x) malloc(x) + #define MZ_FREE(x) free(x) + #define MZ_REALLOC(p, x) realloc(p, x) +#endif + +#define MZ_MAX(a,b) (((a)>(b))?(a):(b)) +#define MZ_MIN(a,b) (((a)<(b))?(a):(b)) +#define MZ_CLEAR_OBJ(obj) memset(&(obj), 0, sizeof(obj)) + +#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN + #define MZ_READ_LE16(p) *((const mz_uint16 *)(p)) + #define MZ_READ_LE32(p) *((const mz_uint32 *)(p)) +#else + #define MZ_READ_LE16(p) ((mz_uint32)(((const mz_uint8 *)(p))[0]) | ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U)) + #define MZ_READ_LE32(p) ((mz_uint32)(((const mz_uint8 *)(p))[0]) | ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U) | ((mz_uint32)(((const mz_uint8 *)(p))[2]) << 16U) | ((mz_uint32)(((const mz_uint8 *)(p))[3]) << 24U)) +#endif + +#ifdef _MSC_VER + #define MZ_FORCEINLINE __forceinline +#elif defined(__GNUC__) + #define MZ_FORCEINLINE inline __attribute__((__always_inline__)) +#else + #define MZ_FORCEINLINE inline +#endif + +// ------------------- zlib-style API's + +mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len) +{ + mz_uint32 i, s1 = (mz_uint32)(adler & 0xffff), s2 = (mz_uint32)(adler >> 16); size_t block_len = buf_len % 5552; + if (!ptr) return MZ_ADLER32_INIT; + while (buf_len) { + for (i = 0; i + 7 < block_len; i += 8, ptr += 8) { + s1 += ptr[0], s2 += s1; s1 += ptr[1], s2 += s1; s1 += ptr[2], s2 += s1; s1 += ptr[3], s2 += s1; + s1 += ptr[4], s2 += s1; s1 += ptr[5], s2 += s1; s1 += ptr[6], s2 += s1; s1 += ptr[7], s2 += s1; + } + for ( ; i < block_len; ++i) s1 += *ptr++, s2 += s1; + s1 %= 65521U, s2 %= 65521U; buf_len -= block_len; block_len = 5552; + } + return (s2 << 16) + s1; +} + +// Karl Malbrain's compact CRC-32. 
See "A compact CCITT crc16 and crc32 C implementation that balances processor cache usage against speed": http://www.geocities.com/malbrain/ +mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len) +{ + static const mz_uint32 s_crc32[16] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, + 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; + mz_uint32 crcu32 = (mz_uint32)crc; + if (!ptr) return MZ_CRC32_INIT; + crcu32 = ~crcu32; while (buf_len--) { mz_uint8 b = *ptr++; crcu32 = (crcu32 >> 4) ^ s_crc32[(crcu32 & 0xF) ^ (b & 0xF)]; crcu32 = (crcu32 >> 4) ^ s_crc32[(crcu32 & 0xF) ^ (b >> 4)]; } + return ~crcu32; +} + +void mz_free(void *p) +{ + MZ_FREE(p); +} + +#ifndef MINIZ_NO_ZLIB_APIS + +static void *def_alloc_func(void *opaque, size_t items, size_t size) { (void)opaque, (void)items, (void)size; return MZ_MALLOC(items * size); } +static void def_free_func(void *opaque, void *address) { (void)opaque, (void)address; MZ_FREE(address); } +//static void *def_realloc_func(void *opaque, void *address, size_t items, size_t size) { (void)opaque, (void)address, (void)items, (void)size; return MZ_REALLOC(address, items * size); } + +const char *mz_version(void) +{ + return MZ_VERSION; +} + +int mz_deflateInit(mz_streamp pStream, int level) +{ + return mz_deflateInit2(pStream, level, MZ_DEFLATED, MZ_DEFAULT_WINDOW_BITS, 9, MZ_DEFAULT_STRATEGY); +} + +int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits, int mem_level, int strategy) +{ + tdefl_compressor *pComp; + mz_uint comp_flags = TDEFL_COMPUTE_ADLER32 | tdefl_create_comp_flags_from_zip_params(level, window_bits, strategy); + + if (!pStream) return MZ_STREAM_ERROR; + if ((method != MZ_DEFLATED) || ((mem_level < 1) || (mem_level > 9)) || ((window_bits != MZ_DEFAULT_WINDOW_BITS) && (-window_bits != MZ_DEFAULT_WINDOW_BITS))) return MZ_PARAM_ERROR; + + pStream->data_type = 0; + pStream->adler = MZ_ADLER32_INIT; + pStream->msg = NULL; + pStream->reserved = 0; + pStream->total_in = 0; + pStream->total_out = 0; + if (!pStream->zalloc) pStream->zalloc = def_alloc_func; + if (!pStream->zfree) pStream->zfree = def_free_func; + + pComp = (tdefl_compressor *)pStream->zalloc(pStream->opaque, 1, sizeof(tdefl_compressor)); + if (!pComp) + return MZ_MEM_ERROR; + + pStream->state = (struct mz_internal_state *)pComp; + + if (tdefl_init(pComp, NULL, NULL, comp_flags) != TDEFL_STATUS_OKAY) + { + mz_deflateEnd(pStream); + return MZ_PARAM_ERROR; + } + + return MZ_OK; +} + +int mz_deflateReset(mz_streamp pStream) +{ + if ((!pStream) || (!pStream->state) || (!pStream->zalloc) || (!pStream->zfree)) return MZ_STREAM_ERROR; + pStream->total_in = pStream->total_out = 0; + tdefl_init((tdefl_compressor*)pStream->state, NULL, NULL, ((tdefl_compressor*)pStream->state)->m_flags); + return MZ_OK; +} + +int mz_deflate(mz_streamp pStream, int flush) +{ + size_t in_bytes, out_bytes; + mz_ulong orig_total_in, orig_total_out; + int mz_status = MZ_OK; + + if ((!pStream) || (!pStream->state) || (flush < 0) || (flush > MZ_FINISH) || (!pStream->next_out)) return MZ_STREAM_ERROR; + if (!pStream->avail_out) return MZ_BUF_ERROR; + + if (flush == MZ_PARTIAL_FLUSH) flush = MZ_SYNC_FLUSH; + + if (((tdefl_compressor*)pStream->state)->m_prev_return_status == TDEFL_STATUS_DONE) + return (flush == MZ_FINISH) ? 
MZ_STREAM_END : MZ_BUF_ERROR; + + orig_total_in = pStream->total_in; orig_total_out = pStream->total_out; + for ( ; ; ) + { + tdefl_status defl_status; + in_bytes = pStream->avail_in; out_bytes = pStream->avail_out; + + defl_status = tdefl_compress((tdefl_compressor*)pStream->state, pStream->next_in, &in_bytes, pStream->next_out, &out_bytes, (tdefl_flush)flush); + pStream->next_in += (mz_uint)in_bytes; pStream->avail_in -= (mz_uint)in_bytes; + pStream->total_in += (mz_uint)in_bytes; pStream->adler = tdefl_get_adler32((tdefl_compressor*)pStream->state); + + pStream->next_out += (mz_uint)out_bytes; pStream->avail_out -= (mz_uint)out_bytes; + pStream->total_out += (mz_uint)out_bytes; + + if (defl_status < 0) + { + mz_status = MZ_STREAM_ERROR; + break; + } + else if (defl_status == TDEFL_STATUS_DONE) + { + mz_status = MZ_STREAM_END; + break; + } + else if (!pStream->avail_out) + break; + else if ((!pStream->avail_in) && (flush != MZ_FINISH)) + { + if ((flush) || (pStream->total_in != orig_total_in) || (pStream->total_out != orig_total_out)) + break; + return MZ_BUF_ERROR; // Can't make forward progress without some input. + } + } + return mz_status; +} + +int mz_deflateEnd(mz_streamp pStream) +{ + if (!pStream) return MZ_STREAM_ERROR; + if (pStream->state) + { + pStream->zfree(pStream->opaque, pStream->state); + pStream->state = NULL; + } + return MZ_OK; +} + +mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len) +{ + (void)pStream; + // This is really over conservative. (And lame, but it's actually pretty tricky to compute a true upper bound given the way tdefl's blocking works.) + mz_uint64 a = 128ULL + (source_len * 110ULL) / 100ULL; + mz_uint64 b = 128ULL + (mz_uint64)source_len + ((source_len / (31 * 1024)) + 1ULL) * 5ULL; + + mz_uint64 t = MZ_MAX(a, b); + if (((mz_ulong)t) != t) + t = (mz_ulong)(-1); + + return (mz_ulong)t; +} + +int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len, int level) +{ + int status; + mz_stream stream; + memset(&stream, 0, sizeof(stream)); + + // In case mz_ulong is 64-bits (argh I hate longs). + if ((source_len | *pDest_len) > 0xFFFFFFFFU) return MZ_PARAM_ERROR; + + stream.next_in = pSource; + stream.avail_in = (mz_uint32)source_len; + stream.next_out = pDest; + stream.avail_out = (mz_uint32)*pDest_len; + + status = mz_deflateInit(&stream, level); + if (status != MZ_OK) return status; + + status = mz_deflate(&stream, MZ_FINISH); + if (status != MZ_STREAM_END) + { + mz_deflateEnd(&stream); + return (status == MZ_OK) ? 
MZ_BUF_ERROR : status; + } + + *pDest_len = stream.total_out; + return mz_deflateEnd(&stream); +} + +int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len) +{ + return mz_compress2(pDest, pDest_len, pSource, source_len, MZ_DEFAULT_COMPRESSION); +} + +mz_ulong mz_compressBound(mz_ulong source_len) +{ + return mz_deflateBound(NULL, source_len); +} + +typedef struct +{ + tinfl_decompressor m_decomp; + mz_uint m_dict_ofs, m_dict_avail, m_first_call, m_has_flushed; int m_window_bits; + mz_uint8 m_dict[TINFL_LZ_DICT_SIZE]; + tinfl_status m_last_status; +} inflate_state; + +int mz_inflateInit2(mz_streamp pStream, int window_bits) +{ + inflate_state *pDecomp; + if (!pStream) return MZ_STREAM_ERROR; + if ((window_bits != MZ_DEFAULT_WINDOW_BITS) && (-window_bits != MZ_DEFAULT_WINDOW_BITS)) return MZ_PARAM_ERROR; + + pStream->data_type = 0; + pStream->adler = 0; + pStream->msg = NULL; + pStream->total_in = 0; + pStream->total_out = 0; + pStream->reserved = 0; + if (!pStream->zalloc) pStream->zalloc = def_alloc_func; + if (!pStream->zfree) pStream->zfree = def_free_func; + + pDecomp = (inflate_state*)pStream->zalloc(pStream->opaque, 1, sizeof(inflate_state)); + if (!pDecomp) return MZ_MEM_ERROR; + + pStream->state = (struct mz_internal_state *)pDecomp; + + tinfl_init(&pDecomp->m_decomp); + pDecomp->m_dict_ofs = 0; + pDecomp->m_dict_avail = 0; + pDecomp->m_last_status = TINFL_STATUS_NEEDS_MORE_INPUT; + pDecomp->m_first_call = 1; + pDecomp->m_has_flushed = 0; + pDecomp->m_window_bits = window_bits; + + return MZ_OK; +} + +int mz_inflateInit(mz_streamp pStream) +{ + return mz_inflateInit2(pStream, MZ_DEFAULT_WINDOW_BITS); +} + +int mz_inflate(mz_streamp pStream, int flush) +{ + inflate_state* pState; + mz_uint n, first_call, decomp_flags = TINFL_FLAG_COMPUTE_ADLER32; + size_t in_bytes, out_bytes, orig_avail_in; + tinfl_status status; + + if ((!pStream) || (!pStream->state)) return MZ_STREAM_ERROR; + if (flush == MZ_PARTIAL_FLUSH) flush = MZ_SYNC_FLUSH; + if ((flush) && (flush != MZ_SYNC_FLUSH) && (flush != MZ_FINISH)) return MZ_STREAM_ERROR; + + pState = (inflate_state*)pStream->state; + if (pState->m_window_bits > 0) decomp_flags |= TINFL_FLAG_PARSE_ZLIB_HEADER; + orig_avail_in = pStream->avail_in; + + first_call = pState->m_first_call; pState->m_first_call = 0; + if (pState->m_last_status < 0) return MZ_DATA_ERROR; + + if (pState->m_has_flushed && (flush != MZ_FINISH)) return MZ_STREAM_ERROR; + pState->m_has_flushed |= (flush == MZ_FINISH); + + if ((flush == MZ_FINISH) && (first_call)) + { + // MZ_FINISH on the first call implies that the input and output buffers are large enough to hold the entire compressed/decompressed file. 
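+ // In this single-call mode the data is inflated straight from next_in to next_out with the
+ // non-wrapping output flag, skipping the 32KB dictionary staging used by the streaming path below.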
+ decomp_flags |= TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + in_bytes = pStream->avail_in; out_bytes = pStream->avail_out; + status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes, pStream->next_out, pStream->next_out, &out_bytes, decomp_flags); + pState->m_last_status = status; + pStream->next_in += (mz_uint)in_bytes; pStream->avail_in -= (mz_uint)in_bytes; pStream->total_in += (mz_uint)in_bytes; + pStream->adler = tinfl_get_adler32(&pState->m_decomp); + pStream->next_out += (mz_uint)out_bytes; pStream->avail_out -= (mz_uint)out_bytes; pStream->total_out += (mz_uint)out_bytes; + + if (status < 0) + return MZ_DATA_ERROR; + else if (status != TINFL_STATUS_DONE) + { + pState->m_last_status = TINFL_STATUS_FAILED; + return MZ_BUF_ERROR; + } + return MZ_STREAM_END; + } + // flush != MZ_FINISH then we must assume there's more input. + if (flush != MZ_FINISH) decomp_flags |= TINFL_FLAG_HAS_MORE_INPUT; + + if (pState->m_dict_avail) + { + n = MZ_MIN(pState->m_dict_avail, pStream->avail_out); + memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n); + pStream->next_out += n; pStream->avail_out -= n; pStream->total_out += n; + pState->m_dict_avail -= n; pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1); + return ((pState->m_last_status == TINFL_STATUS_DONE) && (!pState->m_dict_avail)) ? MZ_STREAM_END : MZ_OK; + } + + for ( ; ; ) + { + in_bytes = pStream->avail_in; + out_bytes = TINFL_LZ_DICT_SIZE - pState->m_dict_ofs; + + status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes, pState->m_dict, pState->m_dict + pState->m_dict_ofs, &out_bytes, decomp_flags); + pState->m_last_status = status; + + pStream->next_in += (mz_uint)in_bytes; pStream->avail_in -= (mz_uint)in_bytes; + pStream->total_in += (mz_uint)in_bytes; pStream->adler = tinfl_get_adler32(&pState->m_decomp); + + pState->m_dict_avail = (mz_uint)out_bytes; + + n = MZ_MIN(pState->m_dict_avail, pStream->avail_out); + memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n); + pStream->next_out += n; pStream->avail_out -= n; pStream->total_out += n; + pState->m_dict_avail -= n; pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1); + + if (status < 0) + return MZ_DATA_ERROR; // Stream is corrupted (there could be some uncompressed data left in the output dictionary - oh well). + else if ((status == TINFL_STATUS_NEEDS_MORE_INPUT) && (!orig_avail_in)) + return MZ_BUF_ERROR; // Signal caller that we can't make forward progress without supplying more input or by setting flush to MZ_FINISH. + else if (flush == MZ_FINISH) + { + // The output buffer MUST be large to hold the remaining uncompressed data when flush==MZ_FINISH. + if (status == TINFL_STATUS_DONE) + return pState->m_dict_avail ? MZ_BUF_ERROR : MZ_STREAM_END; + // status here must be TINFL_STATUS_HAS_MORE_OUTPUT, which means there's at least 1 more byte on the way. If there's no more room left in the output buffer then something is wrong. + else if (!pStream->avail_out) + return MZ_BUF_ERROR; + } + else if ((status == TINFL_STATUS_DONE) || (!pStream->avail_in) || (!pStream->avail_out) || (pState->m_dict_avail)) + break; + } + + return ((status == TINFL_STATUS_DONE) && (!pState->m_dict_avail)) ? 
MZ_STREAM_END : MZ_OK; +} + +int mz_inflateEnd(mz_streamp pStream) +{ + if (!pStream) + return MZ_STREAM_ERROR; + if (pStream->state) + { + pStream->zfree(pStream->opaque, pStream->state); + pStream->state = NULL; + } + return MZ_OK; +} + +int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len) +{ + mz_stream stream; + int status; + memset(&stream, 0, sizeof(stream)); + + // In case mz_ulong is 64-bits (argh I hate longs). + if ((source_len | *pDest_len) > 0xFFFFFFFFU) return MZ_PARAM_ERROR; + + stream.next_in = pSource; + stream.avail_in = (mz_uint32)source_len; + stream.next_out = pDest; + stream.avail_out = (mz_uint32)*pDest_len; + + status = mz_inflateInit(&stream); + if (status != MZ_OK) + return status; + + status = mz_inflate(&stream, MZ_FINISH); + if (status != MZ_STREAM_END) + { + mz_inflateEnd(&stream); + return ((status == MZ_BUF_ERROR) && (!stream.avail_in)) ? MZ_DATA_ERROR : status; + } + *pDest_len = stream.total_out; + + return mz_inflateEnd(&stream); +} + +const char *mz_error(int err) +{ + static struct { int m_err; const char *m_pDesc; } s_error_descs[] = + { + { MZ_OK, "" }, { MZ_STREAM_END, "stream end" }, { MZ_NEED_DICT, "need dictionary" }, { MZ_ERRNO, "file error" }, { MZ_STREAM_ERROR, "stream error" }, + { MZ_DATA_ERROR, "data error" }, { MZ_MEM_ERROR, "out of memory" }, { MZ_BUF_ERROR, "buf error" }, { MZ_VERSION_ERROR, "version error" }, { MZ_PARAM_ERROR, "parameter error" } + }; + mz_uint i; for (i = 0; i < sizeof(s_error_descs) / sizeof(s_error_descs[0]); ++i) if (s_error_descs[i].m_err == err) return s_error_descs[i].m_pDesc; + return NULL; +} + +#endif //MINIZ_NO_ZLIB_APIS + +// ------------------- Low-level Decompression (completely independent from all compression API's) + +#define TINFL_MEMCPY(d, s, l) memcpy(d, s, l) +#define TINFL_MEMSET(p, c, l) memset(p, c, l) + +#define TINFL_CR_BEGIN switch(r->m_state) { case 0: +#define TINFL_CR_RETURN(state_index, result) do { status = result; r->m_state = state_index; goto common_exit; case state_index:; } MZ_MACRO_END +#define TINFL_CR_RETURN_FOREVER(state_index, result) do { for ( ; ; ) { TINFL_CR_RETURN(state_index, result); } } MZ_MACRO_END +#define TINFL_CR_FINISH } + +// TODO: If the caller has indicated that there's no more input, and we attempt to read beyond the input buf, then something is wrong with the input because the inflator never +// reads ahead more than it needs to. Currently TINFL_GET_BYTE() pads the end of the stream with 0's in this scenario. 
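+// The bit-reading macros below are built on the TINFL_CR_* coroutine macros above: TINFL_CR_RETURN()
+// records a resume point in r->m_state and returns to the caller, and TINFL_CR_BEGIN's switch jumps
+// back to that point on the next call. When the input runs dry, TINFL_GET_BYTE() either suspends with
+// TINFL_STATUS_NEEDS_MORE_INPUT (if TINFL_FLAG_HAS_MORE_INPUT is set) or feeds zero bytes, as noted above.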
+#define TINFL_GET_BYTE(state_index, c) do { \ + if (pIn_buf_cur >= pIn_buf_end) { \ + for ( ; ; ) { \ + if (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) { \ + TINFL_CR_RETURN(state_index, TINFL_STATUS_NEEDS_MORE_INPUT); \ + if (pIn_buf_cur < pIn_buf_end) { \ + c = *pIn_buf_cur++; \ + break; \ + } \ + } else { \ + c = 0; \ + break; \ + } \ + } \ + } else c = *pIn_buf_cur++; } MZ_MACRO_END + +#define TINFL_NEED_BITS(state_index, n) do { mz_uint c; TINFL_GET_BYTE(state_index, c); bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); num_bits += 8; } while (num_bits < (mz_uint)(n)) +#define TINFL_SKIP_BITS(state_index, n) do { if (num_bits < (mz_uint)(n)) { TINFL_NEED_BITS(state_index, n); } bit_buf >>= (n); num_bits -= (n); } MZ_MACRO_END +#define TINFL_GET_BITS(state_index, b, n) do { if (num_bits < (mz_uint)(n)) { TINFL_NEED_BITS(state_index, n); } b = bit_buf & ((1 << (n)) - 1); bit_buf >>= (n); num_bits -= (n); } MZ_MACRO_END + +// TINFL_HUFF_BITBUF_FILL() is only used rarely, when the number of bytes remaining in the input buffer falls below 2. +// It reads just enough bytes from the input stream that are needed to decode the next Huffman code (and absolutely no more). It works by trying to fully decode a +// Huffman code by using whatever bits are currently present in the bit buffer. If this fails, it reads another byte, and tries again until it succeeds or until the +// bit buffer contains >=15 bits (deflate's max. Huffman code size). +#define TINFL_HUFF_BITBUF_FILL(state_index, pHuff) \ + do { \ + temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]; \ + if (temp >= 0) { \ + code_len = temp >> 9; \ + if ((code_len) && (num_bits >= code_len)) \ + break; \ + } else if (num_bits > TINFL_FAST_LOOKUP_BITS) { \ + code_len = TINFL_FAST_LOOKUP_BITS; \ + do { \ + temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; \ + } while ((temp < 0) && (num_bits >= (code_len + 1))); if (temp >= 0) break; \ + } TINFL_GET_BYTE(state_index, c); bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); num_bits += 8; \ + } while (num_bits < 15); + +// TINFL_HUFF_DECODE() decodes the next Huffman coded symbol. It's more complex than you would initially expect because the zlib API expects the decompressor to never read +// beyond the final byte of the deflate stream. (In other words, when this macro wants to read another byte from the input, it REALLY needs another byte in order to fully +// decode the next Huffman code.) Handling this properly is particularly important on raw deflate (non-zlib) streams, which aren't followed by a byte aligned adler-32. +// The slow path is only executed at the very end of the input buffer. 
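+// Table layout used by the decode macros: a non-negative m_look_up[] entry (indexed by the low
+// TINFL_FAST_LOOKUP_BITS of the bit buffer) packs (code length << 9) | symbol, so short codes resolve
+// with a single lookup; a negative entry is a ~index into m_tree[], which is then walked one bit at a
+// time for codes longer than TINFL_FAST_LOOKUP_BITS.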
+#define TINFL_HUFF_DECODE(state_index, sym, pHuff) do { \ + int temp; mz_uint code_len, c; \ + if (num_bits < 15) { \ + if ((pIn_buf_end - pIn_buf_cur) < 2) { \ + TINFL_HUFF_BITBUF_FILL(state_index, pHuff); \ + } else { \ + bit_buf |= (((tinfl_bit_buf_t)pIn_buf_cur[0]) << num_bits) | (((tinfl_bit_buf_t)pIn_buf_cur[1]) << (num_bits + 8)); pIn_buf_cur += 2; num_bits += 16; \ + } \ + } \ + if ((temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0) \ + code_len = temp >> 9, temp &= 511; \ + else { \ + code_len = TINFL_FAST_LOOKUP_BITS; do { temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; } while (temp < 0); \ + } sym = temp; bit_buf >>= code_len; num_bits -= code_len; } MZ_MACRO_END + +tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags) +{ + static const int s_length_base[31] = { 3,4,5,6,7,8,9,10,11,13, 15,17,19,23,27,31,35,43,51,59, 67,83,99,115,131,163,195,227,258,0,0 }; + static const int s_length_extra[31]= { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + static const int s_dist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + static const int s_dist_extra[32] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + static const mz_uint8 s_length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + static const int s_min_table_sizes[3] = { 257, 1, 4 }; + + tinfl_status status = TINFL_STATUS_FAILED; mz_uint32 num_bits, dist, counter, num_extra; tinfl_bit_buf_t bit_buf; + const mz_uint8 *pIn_buf_cur = pIn_buf_next, *const pIn_buf_end = pIn_buf_next + *pIn_buf_size; + mz_uint8 *pOut_buf_cur = pOut_buf_next, *const pOut_buf_end = pOut_buf_next + *pOut_buf_size; + size_t out_buf_size_mask = (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF) ? (size_t)-1 : ((pOut_buf_next - pOut_buf_start) + *pOut_buf_size) - 1, dist_from_out_buf_start; + + // Ensure the output buffer's size is a power of 2, unless the output buffer is large enough to hold the entire output file (in which case it doesn't matter). 
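+ // When the non-wrapping flag is clear, out_buf_size_mask is used further below to wrap LZ
+ // back-reference copies around the circular output buffer, which only works for power-of-2 sizes.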
+ if (((out_buf_size_mask + 1) & out_buf_size_mask) || (pOut_buf_next < pOut_buf_start)) { *pIn_buf_size = *pOut_buf_size = 0; return TINFL_STATUS_BAD_PARAM; } + + num_bits = r->m_num_bits; bit_buf = r->m_bit_buf; dist = r->m_dist; counter = r->m_counter; num_extra = r->m_num_extra; dist_from_out_buf_start = r->m_dist_from_out_buf_start; + TINFL_CR_BEGIN + + bit_buf = num_bits = dist = counter = num_extra = r->m_zhdr0 = r->m_zhdr1 = 0; r->m_z_adler32 = r->m_check_adler32 = 1; + if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) + { + TINFL_GET_BYTE(1, r->m_zhdr0); TINFL_GET_BYTE(2, r->m_zhdr1); + counter = (((r->m_zhdr0 * 256 + r->m_zhdr1) % 31 != 0) || (r->m_zhdr1 & 32) || ((r->m_zhdr0 & 15) != 8)); + if (!(decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) counter |= (((1U << (8U + (r->m_zhdr0 >> 4))) > 32768U) || ((out_buf_size_mask + 1) < (size_t)(1ULL << (8U + (r->m_zhdr0 >> 4))))); + if (counter) { TINFL_CR_RETURN_FOREVER(36, TINFL_STATUS_FAILED); } + } + + do + { + TINFL_GET_BITS(3, r->m_final, 3); r->m_type = r->m_final >> 1; + if (r->m_type == 0) + { + TINFL_SKIP_BITS(5, num_bits & 7); + for (counter = 0; counter < 4; ++counter) { if (num_bits) TINFL_GET_BITS(6, r->m_raw_header[counter], 8); else TINFL_GET_BYTE(7, r->m_raw_header[counter]); } + if ((counter = (r->m_raw_header[0] | (r->m_raw_header[1] << 8))) != (mz_uint)(0xFFFF ^ (r->m_raw_header[2] | (r->m_raw_header[3] << 8)))) { TINFL_CR_RETURN_FOREVER(39, TINFL_STATUS_FAILED); } + while ((counter) && (num_bits)) + { + TINFL_GET_BITS(51, dist, 8); + while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(52, TINFL_STATUS_HAS_MORE_OUTPUT); } + *pOut_buf_cur++ = (mz_uint8)dist; + counter--; + } + while (counter) + { + size_t n; while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(9, TINFL_STATUS_HAS_MORE_OUTPUT); } + while (pIn_buf_cur >= pIn_buf_end) + { + if (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) + { + TINFL_CR_RETURN(38, TINFL_STATUS_NEEDS_MORE_INPUT); + } + else + { + TINFL_CR_RETURN_FOREVER(40, TINFL_STATUS_FAILED); + } + } + n = MZ_MIN(MZ_MIN((size_t)(pOut_buf_end - pOut_buf_cur), (size_t)(pIn_buf_end - pIn_buf_cur)), counter); + TINFL_MEMCPY(pOut_buf_cur, pIn_buf_cur, n); pIn_buf_cur += n; pOut_buf_cur += n; counter -= (mz_uint)n; + } + } + else if (r->m_type == 3) + { + TINFL_CR_RETURN_FOREVER(10, TINFL_STATUS_FAILED); + } + else + { + if (r->m_type == 1) + { + mz_uint8 *p = r->m_tables[0].m_code_size; mz_uint i; + r->m_table_sizes[0] = 288; r->m_table_sizes[1] = 32; TINFL_MEMSET(r->m_tables[1].m_code_size, 5, 32); + for ( i = 0; i <= 143; ++i) *p++ = 8; for ( ; i <= 255; ++i) *p++ = 9; for ( ; i <= 279; ++i) *p++ = 7; for ( ; i <= 287; ++i) *p++ = 8; + } + else + { + for (counter = 0; counter < 3; counter++) { TINFL_GET_BITS(11, r->m_table_sizes[counter], "\05\05\04"[counter]); r->m_table_sizes[counter] += s_min_table_sizes[counter]; } + MZ_CLEAR_OBJ(r->m_tables[2].m_code_size); for (counter = 0; counter < r->m_table_sizes[2]; counter++) { mz_uint s; TINFL_GET_BITS(14, s, 3); r->m_tables[2].m_code_size[s_length_dezigzag[counter]] = (mz_uint8)s; } + r->m_table_sizes[2] = 19; + } + for ( ; (int)r->m_type >= 0; r->m_type--) + { + int tree_next, tree_cur; tinfl_huff_table *pTable; + mz_uint i, j, used_syms, total, sym_index, next_code[17], total_syms[16]; pTable = &r->m_tables[r->m_type]; MZ_CLEAR_OBJ(total_syms); MZ_CLEAR_OBJ(pTable->m_look_up); MZ_CLEAR_OBJ(pTable->m_tree); + for (i = 0; i < r->m_table_sizes[r->m_type]; ++i) total_syms[pTable->m_code_size[i]]++; + used_syms = 0, total = 0; next_code[0] = 
next_code[1] = 0; + for (i = 1; i <= 15; ++i) { used_syms += total_syms[i]; next_code[i + 1] = (total = ((total + total_syms[i]) << 1)); } + if ((65536 != total) && (used_syms > 1)) + { + TINFL_CR_RETURN_FOREVER(35, TINFL_STATUS_FAILED); + } + for (tree_next = -1, sym_index = 0; sym_index < r->m_table_sizes[r->m_type]; ++sym_index) + { + mz_uint rev_code = 0, l, cur_code, code_size = pTable->m_code_size[sym_index]; if (!code_size) continue; + cur_code = next_code[code_size]++; for (l = code_size; l > 0; l--, cur_code >>= 1) rev_code = (rev_code << 1) | (cur_code & 1); + if (code_size <= TINFL_FAST_LOOKUP_BITS) { mz_int16 k = (mz_int16)((code_size << 9) | sym_index); while (rev_code < TINFL_FAST_LOOKUP_SIZE) { pTable->m_look_up[rev_code] = k; rev_code += (1 << code_size); } continue; } + if (0 == (tree_cur = pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)])) { pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)] = (mz_int16)tree_next; tree_cur = tree_next; tree_next -= 2; } + rev_code >>= (TINFL_FAST_LOOKUP_BITS - 1); + for (j = code_size; j > (TINFL_FAST_LOOKUP_BITS + 1); j--) + { + tree_cur -= ((rev_code >>= 1) & 1); + if (!pTable->m_tree[-tree_cur - 1]) { pTable->m_tree[-tree_cur - 1] = (mz_int16)tree_next; tree_cur = tree_next; tree_next -= 2; } else tree_cur = pTable->m_tree[-tree_cur - 1]; + } + tree_cur -= ((rev_code >>= 1) & 1); pTable->m_tree[-tree_cur - 1] = (mz_int16)sym_index; + } + if (r->m_type == 2) + { + for (counter = 0; counter < (r->m_table_sizes[0] + r->m_table_sizes[1]); ) + { + mz_uint s; TINFL_HUFF_DECODE(16, dist, &r->m_tables[2]); if (dist < 16) { r->m_len_codes[counter++] = (mz_uint8)dist; continue; } + if ((dist == 16) && (!counter)) + { + TINFL_CR_RETURN_FOREVER(17, TINFL_STATUS_FAILED); + } + num_extra = "\02\03\07"[dist - 16]; TINFL_GET_BITS(18, s, num_extra); s += "\03\03\013"[dist - 16]; + TINFL_MEMSET(r->m_len_codes + counter, (dist == 16) ? 
r->m_len_codes[counter - 1] : 0, s); counter += s; + } + if ((r->m_table_sizes[0] + r->m_table_sizes[1]) != counter) + { + TINFL_CR_RETURN_FOREVER(21, TINFL_STATUS_FAILED); + } + TINFL_MEMCPY(r->m_tables[0].m_code_size, r->m_len_codes, r->m_table_sizes[0]); TINFL_MEMCPY(r->m_tables[1].m_code_size, r->m_len_codes + r->m_table_sizes[0], r->m_table_sizes[1]); + } + } + for ( ; ; ) + { + mz_uint8 *pSrc; + for ( ; ; ) + { + if (((pIn_buf_end - pIn_buf_cur) < 4) || ((pOut_buf_end - pOut_buf_cur) < 2)) + { + TINFL_HUFF_DECODE(23, counter, &r->m_tables[0]); + if (counter >= 256) + break; + while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(24, TINFL_STATUS_HAS_MORE_OUTPUT); } + *pOut_buf_cur++ = (mz_uint8)counter; + } + else + { + int sym2; mz_uint code_len; +#if TINFL_USE_64BIT_BITBUF + if (num_bits < 30) { bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE32(pIn_buf_cur)) << num_bits); pIn_buf_cur += 4; num_bits += 32; } +#else + if (num_bits < 15) { bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits); pIn_buf_cur += 2; num_bits += 16; } +#endif + if ((sym2 = r->m_tables[0].m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0) + code_len = sym2 >> 9; + else + { + code_len = TINFL_FAST_LOOKUP_BITS; do { sym2 = r->m_tables[0].m_tree[~sym2 + ((bit_buf >> code_len++) & 1)]; } while (sym2 < 0); + } + counter = sym2; bit_buf >>= code_len; num_bits -= code_len; + if (counter & 256) + break; + +#if !TINFL_USE_64BIT_BITBUF + if (num_bits < 15) { bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits); pIn_buf_cur += 2; num_bits += 16; } +#endif + if ((sym2 = r->m_tables[0].m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0) + code_len = sym2 >> 9; + else + { + code_len = TINFL_FAST_LOOKUP_BITS; do { sym2 = r->m_tables[0].m_tree[~sym2 + ((bit_buf >> code_len++) & 1)]; } while (sym2 < 0); + } + bit_buf >>= code_len; num_bits -= code_len; + + pOut_buf_cur[0] = (mz_uint8)counter; + if (sym2 & 256) + { + pOut_buf_cur++; + counter = sym2; + break; + } + pOut_buf_cur[1] = (mz_uint8)sym2; + pOut_buf_cur += 2; + } + } + if ((counter &= 511) == 256) break; + + num_extra = s_length_extra[counter - 257]; counter = s_length_base[counter - 257]; + if (num_extra) { mz_uint extra_bits; TINFL_GET_BITS(25, extra_bits, num_extra); counter += extra_bits; } + + TINFL_HUFF_DECODE(26, dist, &r->m_tables[1]); + num_extra = s_dist_extra[dist]; dist = s_dist_base[dist]; + if (num_extra) { mz_uint extra_bits; TINFL_GET_BITS(27, extra_bits, num_extra); dist += extra_bits; } + + dist_from_out_buf_start = pOut_buf_cur - pOut_buf_start; + if ((dist > dist_from_out_buf_start) && (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) + { + TINFL_CR_RETURN_FOREVER(37, TINFL_STATUS_FAILED); + } + + pSrc = pOut_buf_start + ((dist_from_out_buf_start - dist) & out_buf_size_mask); + + if ((MZ_MAX(pOut_buf_cur, pSrc) + counter) > pOut_buf_end) + { + while (counter--) + { + while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(53, TINFL_STATUS_HAS_MORE_OUTPUT); } + *pOut_buf_cur++ = pOut_buf_start[(dist_from_out_buf_start++ - dist) & out_buf_size_mask]; + } + continue; + } +#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES + else if ((counter >= 9) && (counter <= dist)) + { + const mz_uint8 *pSrc_end = pSrc + (counter & ~7); + do + { + ((mz_uint32 *)pOut_buf_cur)[0] = ((const mz_uint32 *)pSrc)[0]; + ((mz_uint32 *)pOut_buf_cur)[1] = ((const mz_uint32 *)pSrc)[1]; + pOut_buf_cur += 8; + } while ((pSrc += 8) < pSrc_end); + if ((counter &= 7) < 3) + { + if (counter) + { + pOut_buf_cur[0] = pSrc[0]; + if (counter > 1) + 
pOut_buf_cur[1] = pSrc[1]; + pOut_buf_cur += counter; + } + continue; + } + } +#endif + do + { + pOut_buf_cur[0] = pSrc[0]; + pOut_buf_cur[1] = pSrc[1]; + pOut_buf_cur[2] = pSrc[2]; + pOut_buf_cur += 3; pSrc += 3; + } while ((int)(counter -= 3) > 2); + if ((int)counter > 0) + { + pOut_buf_cur[0] = pSrc[0]; + if ((int)counter > 1) + pOut_buf_cur[1] = pSrc[1]; + pOut_buf_cur += counter; + } + } + } + } while (!(r->m_final & 1)); + if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) + { + TINFL_SKIP_BITS(32, num_bits & 7); for (counter = 0; counter < 4; ++counter) { mz_uint s; if (num_bits) TINFL_GET_BITS(41, s, 8); else TINFL_GET_BYTE(42, s); r->m_z_adler32 = (r->m_z_adler32 << 8) | s; } + } + TINFL_CR_RETURN_FOREVER(34, TINFL_STATUS_DONE); + TINFL_CR_FINISH + +common_exit: + r->m_num_bits = num_bits; r->m_bit_buf = bit_buf; r->m_dist = dist; r->m_counter = counter; r->m_num_extra = num_extra; r->m_dist_from_out_buf_start = dist_from_out_buf_start; + *pIn_buf_size = pIn_buf_cur - pIn_buf_next; *pOut_buf_size = pOut_buf_cur - pOut_buf_next; + if ((decomp_flags & (TINFL_FLAG_PARSE_ZLIB_HEADER | TINFL_FLAG_COMPUTE_ADLER32)) && (status >= 0)) + { + const mz_uint8 *ptr = pOut_buf_next; size_t buf_len = *pOut_buf_size; + mz_uint32 i, s1 = r->m_check_adler32 & 0xffff, s2 = r->m_check_adler32 >> 16; size_t block_len = buf_len % 5552; + while (buf_len) + { + for (i = 0; i + 7 < block_len; i += 8, ptr += 8) + { + s1 += ptr[0], s2 += s1; s1 += ptr[1], s2 += s1; s1 += ptr[2], s2 += s1; s1 += ptr[3], s2 += s1; + s1 += ptr[4], s2 += s1; s1 += ptr[5], s2 += s1; s1 += ptr[6], s2 += s1; s1 += ptr[7], s2 += s1; + } + for ( ; i < block_len; ++i) s1 += *ptr++, s2 += s1; + s1 %= 65521U, s2 %= 65521U; buf_len -= block_len; block_len = 5552; + } + r->m_check_adler32 = (s2 << 16) + s1; if ((status == TINFL_STATUS_DONE) && (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) && (r->m_check_adler32 != r->m_z_adler32)) status = TINFL_STATUS_ADLER32_MISMATCH; + } + return status; +} + +// Higher level helper functions. +void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags) +{ + tinfl_decompressor decomp; void *pBuf = NULL, *pNew_buf; size_t src_buf_ofs = 0, out_buf_capacity = 0; + *pOut_len = 0; + tinfl_init(&decomp); + for ( ; ; ) + { + size_t src_buf_size = src_buf_len - src_buf_ofs, dst_buf_size = out_buf_capacity - *pOut_len, new_out_buf_capacity; + tinfl_status status = tinfl_decompress(&decomp, (const mz_uint8*)pSrc_buf + src_buf_ofs, &src_buf_size, (mz_uint8*)pBuf, pBuf ? 
(mz_uint8*)pBuf + *pOut_len : NULL, &dst_buf_size, + (flags & ~TINFL_FLAG_HAS_MORE_INPUT) | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF); + if ((status < 0) || (status == TINFL_STATUS_NEEDS_MORE_INPUT)) + { + MZ_FREE(pBuf); *pOut_len = 0; return NULL; + } + src_buf_ofs += src_buf_size; + *pOut_len += dst_buf_size; + if (status == TINFL_STATUS_DONE) break; + new_out_buf_capacity = out_buf_capacity * 2; if (new_out_buf_capacity < 128) new_out_buf_capacity = 128; + pNew_buf = MZ_REALLOC(pBuf, new_out_buf_capacity); + if (!pNew_buf) + { + MZ_FREE(pBuf); *pOut_len = 0; return NULL; + } + pBuf = pNew_buf; out_buf_capacity = new_out_buf_capacity; + } + return pBuf; +} + +size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags) +{ + tinfl_decompressor decomp; tinfl_status status; tinfl_init(&decomp); + status = tinfl_decompress(&decomp, (const mz_uint8*)pSrc_buf, &src_buf_len, (mz_uint8*)pOut_buf, (mz_uint8*)pOut_buf, &out_buf_len, (flags & ~TINFL_FLAG_HAS_MORE_INPUT) | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF); + return (status != TINFL_STATUS_DONE) ? TINFL_DECOMPRESS_MEM_TO_MEM_FAILED : out_buf_len; +} + +int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags) +{ + int result = 0; + tinfl_decompressor decomp; + mz_uint8 *pDict = (mz_uint8*)MZ_MALLOC(TINFL_LZ_DICT_SIZE); size_t in_buf_ofs = 0, dict_ofs = 0; + if (!pDict) + return TINFL_STATUS_FAILED; + tinfl_init(&decomp); + for ( ; ; ) + { + size_t in_buf_size = *pIn_buf_size - in_buf_ofs, dst_buf_size = TINFL_LZ_DICT_SIZE - dict_ofs; + tinfl_status status = tinfl_decompress(&decomp, (const mz_uint8*)pIn_buf + in_buf_ofs, &in_buf_size, pDict, pDict + dict_ofs, &dst_buf_size, + (flags & ~(TINFL_FLAG_HAS_MORE_INPUT | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF))); + in_buf_ofs += in_buf_size; + if ((dst_buf_size) && (!(*pPut_buf_func)(pDict + dict_ofs, (int)dst_buf_size, pPut_buf_user))) + break; + if (status != TINFL_STATUS_HAS_MORE_OUTPUT) + { + result = (status == TINFL_STATUS_DONE); + break; + } + dict_ofs = (dict_ofs + dst_buf_size) & (TINFL_LZ_DICT_SIZE - 1); + } + MZ_FREE(pDict); + *pIn_buf_size = in_buf_ofs; + return result; +} + +// ------------------- Low-level Compression (independent from all decompression API's) + +// Purposely making these tables static for faster init and thread safety. 
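+// These tables map a match length (minus TDEFL_MIN_MATCH_LEN) or a match distance to its deflate
+// length/distance symbol and the number of extra bits that follow it, letting the compressor emit
+// codes with plain array lookups instead of recomputing the RFC 1951 ranges on every match.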
+static const mz_uint16 s_tdefl_len_sym[256] = { + 257,258,259,260,261,262,263,264,265,265,266,266,267,267,268,268,269,269,269,269,270,270,270,270,271,271,271,271,272,272,272,272, + 273,273,273,273,273,273,273,273,274,274,274,274,274,274,274,274,275,275,275,275,275,275,275,275,276,276,276,276,276,276,276,276, + 277,277,277,277,277,277,277,277,277,277,277,277,277,277,277,277,278,278,278,278,278,278,278,278,278,278,278,278,278,278,278,278, + 279,279,279,279,279,279,279,279,279,279,279,279,279,279,279,279,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280, + 281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281, + 282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282, + 283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283, + 284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,285 }; + +static const mz_uint8 s_tdefl_len_extra[256] = { + 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,0 }; + +static const mz_uint8 s_tdefl_small_dist_sym[512] = { + 0,1,2,3,4,4,5,5,6,6,6,6,7,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11, + 11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13, + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14, + 14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14, + 14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17 }; + +static const mz_uint8 s_tdefl_small_dist_extra[512] = { + 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5, + 
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7 }; + +static const mz_uint8 s_tdefl_large_dist_sym[128] = { + 0,0,18,19,20,20,21,21,22,22,22,22,23,23,23,23,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,26,26,26,26, + 26,26,26,26,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28, + 28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29 }; + +static const mz_uint8 s_tdefl_large_dist_extra[128] = { + 0,0,8,8,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12, + 12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13 }; + +// Radix sorts tdefl_sym_freq[] array by 16-bit key m_key. Returns ptr to sorted values. +typedef struct { mz_uint16 m_key, m_sym_index; } tdefl_sym_freq; +static tdefl_sym_freq* tdefl_radix_sort_syms(mz_uint num_syms, tdefl_sym_freq* pSyms0, tdefl_sym_freq* pSyms1) +{ + mz_uint32 total_passes = 2, pass_shift, pass, i, hist[256 * 2]; tdefl_sym_freq* pCur_syms = pSyms0, *pNew_syms = pSyms1; MZ_CLEAR_OBJ(hist); + for (i = 0; i < num_syms; i++) { mz_uint freq = pSyms0[i].m_key; hist[freq & 0xFF]++; hist[256 + ((freq >> 8) & 0xFF)]++; } + while ((total_passes > 1) && (num_syms == hist[(total_passes - 1) * 256])) total_passes--; + for (pass_shift = 0, pass = 0; pass < total_passes; pass++, pass_shift += 8) + { + const mz_uint32* pHist = &hist[pass << 8]; + mz_uint offsets[256], cur_ofs = 0; + for (i = 0; i < 256; i++) { offsets[i] = cur_ofs; cur_ofs += pHist[i]; } + for (i = 0; i < num_syms; i++) pNew_syms[offsets[(pCur_syms[i].m_key >> pass_shift) & 0xFF]++] = pCur_syms[i]; + { tdefl_sym_freq* t = pCur_syms; pCur_syms = pNew_syms; pNew_syms = t; } + } + return pCur_syms; +} + +// tdefl_calculate_minimum_redundancy() originally written by: Alistair Moffat, alistair@cs.mu.oz.au, Jyrki Katajainen, jyrki@diku.dk, November 1996. 
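+// In-place variant of the Moffat/Katajainen minimum-redundancy algorithm: A[] arrives sorted by
+// frequency, and on return A[i].m_key holds the optimal Huffman code length for that symbol
+// (the length limit is enforced separately by tdefl_huffman_enforce_max_code_size() below).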
+static void tdefl_calculate_minimum_redundancy(tdefl_sym_freq *A, int n) +{ + int root, leaf, next, avbl, used, dpth; + if (n==0) return; else if (n==1) { A[0].m_key = 1; return; } + A[0].m_key += A[1].m_key; root = 0; leaf = 2; + for (next=1; next < n-1; next++) + { + if (leaf>=n || A[root].m_key<A[leaf].m_key) { A[next].m_key = A[root].m_key; A[root++].m_key = (mz_uint16)next; } else A[next].m_key = A[leaf++].m_key; + if (leaf>=n || (root<next && A[root].m_key<A[leaf].m_key)) { A[next].m_key = (mz_uint16)(A[next].m_key + A[root].m_key); A[root++].m_key = (mz_uint16)next; } else A[next].m_key = (mz_uint16)(A[next].m_key + A[leaf++].m_key); + } + A[n-2].m_key = 0; for (next=n-3; next>=0; next--) A[next].m_key = A[A[next].m_key].m_key+1; + avbl = 1; used = dpth = 0; root = n-2; next = n-1; + while (avbl>0) + { + while (root>=0 && (int)A[root].m_key==dpth) { used++; root--; } + while (avbl>used) { A[next--].m_key = (mz_uint16)(dpth); avbl--; } + avbl = 2*used; dpth++; used = 0; + } +} + +// Limits canonical Huffman code table's max code size. +enum { TDEFL_MAX_SUPPORTED_HUFF_CODESIZE = 32 }; +static void tdefl_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size) +{ + int i; mz_uint32 total = 0; if (code_list_len <= 1) return; + for (i = max_code_size + 1; i <= TDEFL_MAX_SUPPORTED_HUFF_CODESIZE; i++) pNum_codes[max_code_size] += pNum_codes[i]; + for (i = max_code_size; i > 0; i--) total += (((mz_uint32)pNum_codes[i]) << (max_code_size - i)); + while (total != (1UL << max_code_size)) + { + pNum_codes[max_code_size]--; + for (i = max_code_size - 1; i > 0; i--) if (pNum_codes[i]) { pNum_codes[i]--; pNum_codes[i + 1] += 2; break; } + total--; + } +} + +static void tdefl_optimize_huffman_table(tdefl_compressor *d, int table_num, int table_len, int code_size_limit, int static_table) +{ + int i, j, l, num_codes[1 + TDEFL_MAX_SUPPORTED_HUFF_CODESIZE]; mz_uint next_code[TDEFL_MAX_SUPPORTED_HUFF_CODESIZE + 1]; MZ_CLEAR_OBJ(num_codes); + if (static_table) + { + for (i = 0; i < table_len; i++) num_codes[d->m_huff_code_sizes[table_num][i]]++; + } + else + { + tdefl_sym_freq syms0[TDEFL_MAX_HUFF_SYMBOLS], syms1[TDEFL_MAX_HUFF_SYMBOLS], *pSyms; + int num_used_syms = 0; + const mz_uint16 *pSym_count = &d->m_huff_count[table_num][0]; + for (i = 0; i < table_len; i++) if (pSym_count[i]) { syms0[num_used_syms].m_key = (mz_uint16)pSym_count[i]; syms0[num_used_syms++].m_sym_index = (mz_uint16)i; } + + pSyms = tdefl_radix_sort_syms(num_used_syms, syms0, syms1); tdefl_calculate_minimum_redundancy(pSyms, num_used_syms); + + for (i = 0; i < num_used_syms; i++) num_codes[pSyms[i].m_key]++; + + tdefl_huffman_enforce_max_code_size(num_codes, num_used_syms, code_size_limit); + + MZ_CLEAR_OBJ(d->m_huff_code_sizes[table_num]); MZ_CLEAR_OBJ(d->m_huff_codes[table_num]); + for (i = 1, j = num_used_syms; i <= code_size_limit; i++) + for (l = num_codes[i]; l > 0; l--) d->m_huff_code_sizes[table_num][pSyms[--j].m_sym_index] = (mz_uint8)(i); + } + + next_code[1] = 0; for (j = 0, i = 2; i <= code_size_limit; i++) next_code[i] = j = ((j + num_codes[i - 1]) << 1); + + for (i = 0; i < table_len; i++) + { + mz_uint rev_code = 0, code, code_size; if ((code_size = d->m_huff_code_sizes[table_num][i]) == 0) continue; + code = next_code[code_size]++; for (l = code_size; l > 0; l--, code >>= 1) rev_code = (rev_code << 1) | (code & 1); + d->m_huff_codes[table_num][i] = (mz_uint16)rev_code; + } +} + +#define TDEFL_PUT_BITS(b, l) do { \ + mz_uint bits = b; mz_uint len = l; MZ_ASSERT(bits <= ((1U << len) - 1U)); 
\ + d->m_bit_buffer |= (bits << d->m_bits_in); d->m_bits_in += len; \ + while (d->m_bits_in >= 8) { \ + if (d->m_pOutput_buf < d->m_pOutput_buf_end) \ + *d->m_pOutput_buf++ = (mz_uint8)(d->m_bit_buffer); \ + d->m_bit_buffer >>= 8; \ + d->m_bits_in -= 8; \ + } \ +} MZ_MACRO_END + +#define TDEFL_RLE_PREV_CODE_SIZE() { if (rle_repeat_count) { \ + if (rle_repeat_count < 3) { \ + d->m_huff_count[2][prev_code_size] = (mz_uint16)(d->m_huff_count[2][prev_code_size] + rle_repeat_count); \ + while (rle_repeat_count--) packed_code_sizes[num_packed_code_sizes++] = prev_code_size; \ + } else { \ + d->m_huff_count[2][16] = (mz_uint16)(d->m_huff_count[2][16] + 1); packed_code_sizes[num_packed_code_sizes++] = 16; packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_repeat_count - 3); \ +} rle_repeat_count = 0; } } + +#define TDEFL_RLE_ZERO_CODE_SIZE() { if (rle_z_count) { \ + if (rle_z_count < 3) { \ + d->m_huff_count[2][0] = (mz_uint16)(d->m_huff_count[2][0] + rle_z_count); while (rle_z_count--) packed_code_sizes[num_packed_code_sizes++] = 0; \ + } else if (rle_z_count <= 10) { \ + d->m_huff_count[2][17] = (mz_uint16)(d->m_huff_count[2][17] + 1); packed_code_sizes[num_packed_code_sizes++] = 17; packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_z_count - 3); \ + } else { \ + d->m_huff_count[2][18] = (mz_uint16)(d->m_huff_count[2][18] + 1); packed_code_sizes[num_packed_code_sizes++] = 18; packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_z_count - 11); \ +} rle_z_count = 0; } } + +static mz_uint8 s_tdefl_packed_code_size_syms_swizzle[] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; + +static void tdefl_start_dynamic_block(tdefl_compressor *d) +{ + int num_lit_codes, num_dist_codes, num_bit_lengths; mz_uint i, total_code_sizes_to_pack, num_packed_code_sizes, rle_z_count, rle_repeat_count, packed_code_sizes_index; + mz_uint8 code_sizes_to_pack[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1], packed_code_sizes[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1], prev_code_size = 0xFF; + + d->m_huff_count[0][256] = 1; + + tdefl_optimize_huffman_table(d, 0, TDEFL_MAX_HUFF_SYMBOLS_0, 15, MZ_FALSE); + tdefl_optimize_huffman_table(d, 1, TDEFL_MAX_HUFF_SYMBOLS_1, 15, MZ_FALSE); + + for (num_lit_codes = 286; num_lit_codes > 257; num_lit_codes--) if (d->m_huff_code_sizes[0][num_lit_codes - 1]) break; + for (num_dist_codes = 30; num_dist_codes > 1; num_dist_codes--) if (d->m_huff_code_sizes[1][num_dist_codes - 1]) break; + + memcpy(code_sizes_to_pack, &d->m_huff_code_sizes[0][0], num_lit_codes); + memcpy(code_sizes_to_pack + num_lit_codes, &d->m_huff_code_sizes[1][0], num_dist_codes); + total_code_sizes_to_pack = num_lit_codes + num_dist_codes; num_packed_code_sizes = 0; rle_z_count = 0; rle_repeat_count = 0; + + memset(&d->m_huff_count[2][0], 0, sizeof(d->m_huff_count[2][0]) * TDEFL_MAX_HUFF_SYMBOLS_2); + for (i = 0; i < total_code_sizes_to_pack; i++) + { + mz_uint8 code_size = code_sizes_to_pack[i]; + if (!code_size) + { + TDEFL_RLE_PREV_CODE_SIZE(); + if (++rle_z_count == 138) { TDEFL_RLE_ZERO_CODE_SIZE(); } + } + else + { + TDEFL_RLE_ZERO_CODE_SIZE(); + if (code_size != prev_code_size) + { + TDEFL_RLE_PREV_CODE_SIZE(); + d->m_huff_count[2][code_size] = (mz_uint16)(d->m_huff_count[2][code_size] + 1); packed_code_sizes[num_packed_code_sizes++] = code_size; + } + else if (++rle_repeat_count == 6) + { + TDEFL_RLE_PREV_CODE_SIZE(); + } + } + prev_code_size = code_size; + } + if (rle_repeat_count) { TDEFL_RLE_PREV_CODE_SIZE(); } else { TDEFL_RLE_ZERO_CODE_SIZE(); 
} + + tdefl_optimize_huffman_table(d, 2, TDEFL_MAX_HUFF_SYMBOLS_2, 7, MZ_FALSE); + + TDEFL_PUT_BITS(2, 2); + + TDEFL_PUT_BITS(num_lit_codes - 257, 5); + TDEFL_PUT_BITS(num_dist_codes - 1, 5); + + for (num_bit_lengths = 18; num_bit_lengths >= 0; num_bit_lengths--) if (d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[num_bit_lengths]]) break; + num_bit_lengths = MZ_MAX(4, (num_bit_lengths + 1)); TDEFL_PUT_BITS(num_bit_lengths - 4, 4); + for (i = 0; (int)i < num_bit_lengths; i++) TDEFL_PUT_BITS(d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[i]], 3); + + for (packed_code_sizes_index = 0; packed_code_sizes_index < num_packed_code_sizes; ) + { + mz_uint code = packed_code_sizes[packed_code_sizes_index++]; MZ_ASSERT(code < TDEFL_MAX_HUFF_SYMBOLS_2); + TDEFL_PUT_BITS(d->m_huff_codes[2][code], d->m_huff_code_sizes[2][code]); + if (code >= 16) TDEFL_PUT_BITS(packed_code_sizes[packed_code_sizes_index++], "\02\03\07"[code - 16]); + } +} + +static void tdefl_start_static_block(tdefl_compressor *d) +{ + mz_uint i; + mz_uint8 *p = &d->m_huff_code_sizes[0][0]; + + for (i = 0; i <= 143; ++i) *p++ = 8; + for ( ; i <= 255; ++i) *p++ = 9; + for ( ; i <= 279; ++i) *p++ = 7; + for ( ; i <= 287; ++i) *p++ = 8; + + memset(d->m_huff_code_sizes[1], 5, 32); + + tdefl_optimize_huffman_table(d, 0, 288, 15, MZ_TRUE); + tdefl_optimize_huffman_table(d, 1, 32, 15, MZ_TRUE); + + TDEFL_PUT_BITS(1, 2); +} + +static const mz_uint mz_bitmasks[17] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF }; + +#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && MINIZ_HAS_64BIT_REGISTERS +static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d) +{ + mz_uint flags; + mz_uint8 *pLZ_codes; + mz_uint8 *pOutput_buf = d->m_pOutput_buf; + mz_uint8 *pLZ_code_buf_end = d->m_pLZ_code_buf; + mz_uint64 bit_buffer = d->m_bit_buffer; + mz_uint bits_in = d->m_bits_in; + +#define TDEFL_PUT_BITS_FAST(b, l) { bit_buffer |= (((mz_uint64)(b)) << bits_in); bits_in += (l); } + + flags = 1; + for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < pLZ_code_buf_end; flags >>= 1) + { + if (flags == 1) + flags = *pLZ_codes++ | 0x100; + + if (flags & 1) + { + mz_uint s0, s1, n0, n1, sym, num_extra_bits; + mz_uint match_len = pLZ_codes[0], match_dist = *(const mz_uint16 *)(pLZ_codes + 1); pLZ_codes += 3; + + MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]); + TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][s_tdefl_len_sym[match_len]], d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]); + TDEFL_PUT_BITS_FAST(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]], s_tdefl_len_extra[match_len]); + + // This sequence coaxes MSVC into using cmov's vs. jmp's. + s0 = s_tdefl_small_dist_sym[match_dist & 511]; + n0 = s_tdefl_small_dist_extra[match_dist & 511]; + s1 = s_tdefl_large_dist_sym[match_dist >> 8]; + n1 = s_tdefl_large_dist_extra[match_dist >> 8]; + sym = (match_dist < 512) ? s0 : s1; + num_extra_bits = (match_dist < 512) ? 
n0 : n1; + + MZ_ASSERT(d->m_huff_code_sizes[1][sym]); + TDEFL_PUT_BITS_FAST(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]); + TDEFL_PUT_BITS_FAST(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits); + } + else + { + mz_uint lit = *pLZ_codes++; + MZ_ASSERT(d->m_huff_code_sizes[0][lit]); + TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]); + + if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end)) + { + flags >>= 1; + lit = *pLZ_codes++; + MZ_ASSERT(d->m_huff_code_sizes[0][lit]); + TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]); + + if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end)) + { + flags >>= 1; + lit = *pLZ_codes++; + MZ_ASSERT(d->m_huff_code_sizes[0][lit]); + TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]); + } + } + } + + if (pOutput_buf >= d->m_pOutput_buf_end) + return MZ_FALSE; + + *(mz_uint64*)pOutput_buf = bit_buffer; + pOutput_buf += (bits_in >> 3); + bit_buffer >>= (bits_in & ~7); + bits_in &= 7; + } + +#undef TDEFL_PUT_BITS_FAST + + d->m_pOutput_buf = pOutput_buf; + d->m_bits_in = 0; + d->m_bit_buffer = 0; + + while (bits_in) + { + mz_uint32 n = MZ_MIN(bits_in, 16); + TDEFL_PUT_BITS((mz_uint)bit_buffer & mz_bitmasks[n], n); + bit_buffer >>= n; + bits_in -= n; + } + + TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]); + + return (d->m_pOutput_buf < d->m_pOutput_buf_end); +} +#else +static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d) +{ + mz_uint flags; + mz_uint8 *pLZ_codes; + + flags = 1; + for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < d->m_pLZ_code_buf; flags >>= 1) + { + if (flags == 1) + flags = *pLZ_codes++ | 0x100; + if (flags & 1) + { + mz_uint sym, num_extra_bits; + mz_uint match_len = pLZ_codes[0], match_dist = (pLZ_codes[1] | (pLZ_codes[2] << 8)); pLZ_codes += 3; + + MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]); + TDEFL_PUT_BITS(d->m_huff_codes[0][s_tdefl_len_sym[match_len]], d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]); + TDEFL_PUT_BITS(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]], s_tdefl_len_extra[match_len]); + + if (match_dist < 512) + { + sym = s_tdefl_small_dist_sym[match_dist]; num_extra_bits = s_tdefl_small_dist_extra[match_dist]; + } + else + { + sym = s_tdefl_large_dist_sym[match_dist >> 8]; num_extra_bits = s_tdefl_large_dist_extra[match_dist >> 8]; + } + MZ_ASSERT(d->m_huff_code_sizes[1][sym]); + TDEFL_PUT_BITS(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]); + TDEFL_PUT_BITS(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits); + } + else + { + mz_uint lit = *pLZ_codes++; + MZ_ASSERT(d->m_huff_code_sizes[0][lit]); + TDEFL_PUT_BITS(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]); + } + } + + TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]); + + return (d->m_pOutput_buf < d->m_pOutput_buf_end); +} +#endif // MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && MINIZ_HAS_64BIT_REGISTERS + +static mz_bool tdefl_compress_block(tdefl_compressor *d, mz_bool static_block) +{ + if (static_block) + tdefl_start_static_block(d); + else + tdefl_start_dynamic_block(d); + return tdefl_compress_lz_codes(d); +} + +static int tdefl_flush_block(tdefl_compressor *d, int flush) +{ + mz_uint saved_bit_buf, saved_bits_in; + mz_uint8 *pSaved_output_buf; + mz_bool comp_block_succeeded = MZ_FALSE; + int n, use_raw_block = ((d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS) != 0) && (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size; + mz_uint8 
*pOutput_buf_start = ((d->m_pPut_buf_func == NULL) && ((*d->m_pOut_buf_size - d->m_out_buf_ofs) >= TDEFL_OUT_BUF_SIZE)) ? ((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs) : d->m_output_buf; + + d->m_pOutput_buf = pOutput_buf_start; + d->m_pOutput_buf_end = d->m_pOutput_buf + TDEFL_OUT_BUF_SIZE - 16; + + MZ_ASSERT(!d->m_output_flush_remaining); + d->m_output_flush_ofs = 0; + d->m_output_flush_remaining = 0; + + *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> d->m_num_flags_left); + d->m_pLZ_code_buf -= (d->m_num_flags_left == 8); + + if ((d->m_flags & TDEFL_WRITE_ZLIB_HEADER) && (!d->m_block_index)) + { + TDEFL_PUT_BITS(0x78, 8); TDEFL_PUT_BITS(0x01, 8); + } + + TDEFL_PUT_BITS(flush == TDEFL_FINISH, 1); + + pSaved_output_buf = d->m_pOutput_buf; saved_bit_buf = d->m_bit_buffer; saved_bits_in = d->m_bits_in; + + if (!use_raw_block) + comp_block_succeeded = tdefl_compress_block(d, (d->m_flags & TDEFL_FORCE_ALL_STATIC_BLOCKS) || (d->m_total_lz_bytes < 48)); + + // If the block gets expanded, forget the current contents of the output buffer and send a raw block instead. + if ( ((use_raw_block) || ((d->m_total_lz_bytes) && ((d->m_pOutput_buf - pSaved_output_buf + 1U) >= d->m_total_lz_bytes))) && + ((d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size) ) + { + mz_uint i; d->m_pOutput_buf = pSaved_output_buf; d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in; + TDEFL_PUT_BITS(0, 2); + if (d->m_bits_in) { TDEFL_PUT_BITS(0, 8 - d->m_bits_in); } + for (i = 2; i; --i, d->m_total_lz_bytes ^= 0xFFFF) + { + TDEFL_PUT_BITS(d->m_total_lz_bytes & 0xFFFF, 16); + } + for (i = 0; i < d->m_total_lz_bytes; ++i) + { + TDEFL_PUT_BITS(d->m_dict[(d->m_lz_code_buf_dict_pos + i) & TDEFL_LZ_DICT_SIZE_MASK], 8); + } + } + // Check for the extremely unlikely (if not impossible) case of the compressed block not fitting into the output buffer when using dynamic codes. 
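+  // If that happens, the saved output position and bit buffer are restored and the block
+  // is re-emitted with the static Huffman tables (tdefl_start_static_block).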
+ else if (!comp_block_succeeded) + { + d->m_pOutput_buf = pSaved_output_buf; d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in; + tdefl_compress_block(d, MZ_TRUE); + } + + if (flush) + { + if (flush == TDEFL_FINISH) + { + if (d->m_bits_in) { TDEFL_PUT_BITS(0, 8 - d->m_bits_in); } + if (d->m_flags & TDEFL_WRITE_ZLIB_HEADER) { mz_uint i, a = d->m_adler32; for (i = 0; i < 4; i++) { TDEFL_PUT_BITS((a >> 24) & 0xFF, 8); a <<= 8; } } + } + else + { + mz_uint i, z = 0; TDEFL_PUT_BITS(0, 3); if (d->m_bits_in) { TDEFL_PUT_BITS(0, 8 - d->m_bits_in); } for (i = 2; i; --i, z ^= 0xFFFF) { TDEFL_PUT_BITS(z & 0xFFFF, 16); } + } + } + + MZ_ASSERT(d->m_pOutput_buf < d->m_pOutput_buf_end); + + memset(&d->m_huff_count[0][0], 0, sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0); + memset(&d->m_huff_count[1][0], 0, sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1); + + d->m_pLZ_code_buf = d->m_lz_code_buf + 1; d->m_pLZ_flags = d->m_lz_code_buf; d->m_num_flags_left = 8; d->m_lz_code_buf_dict_pos += d->m_total_lz_bytes; d->m_total_lz_bytes = 0; d->m_block_index++; + + if ((n = (int)(d->m_pOutput_buf - pOutput_buf_start)) != 0) + { + if (d->m_pPut_buf_func) + { + *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf; + if (!(*d->m_pPut_buf_func)(d->m_output_buf, n, d->m_pPut_buf_user)) + return (d->m_prev_return_status = TDEFL_STATUS_PUT_BUF_FAILED); + } + else if (pOutput_buf_start == d->m_output_buf) + { + int bytes_to_copy = (int)MZ_MIN((size_t)n, (size_t)(*d->m_pOut_buf_size - d->m_out_buf_ofs)); + memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf, bytes_to_copy); + d->m_out_buf_ofs += bytes_to_copy; + if ((n -= bytes_to_copy) != 0) + { + d->m_output_flush_ofs = bytes_to_copy; + d->m_output_flush_remaining = n; + } + } + else + { + d->m_out_buf_ofs += n; + } + } + + return d->m_output_flush_remaining; +} + +#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES +#define TDEFL_READ_UNALIGNED_WORD(p) *(const mz_uint16*)(p) +static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist, mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len) +{ + mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK, match_len = *pMatch_len, probe_pos = pos, next_probe_pos, probe_len; + mz_uint num_probes_left = d->m_max_probes[match_len >= 32]; + const mz_uint16 *s = (const mz_uint16*)(d->m_dict + pos), *p, *q; + mz_uint16 c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]), s01 = TDEFL_READ_UNALIGNED_WORD(s); + MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN); if (max_match_len <= match_len) return; + for ( ; ; ) + { + for ( ; ; ) + { + if (--num_probes_left == 0) return; + #define TDEFL_PROBE \ + next_probe_pos = d->m_next[probe_pos]; \ + if ((!next_probe_pos) || ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) return; \ + probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK; \ + if (TDEFL_READ_UNALIGNED_WORD(&d->m_dict[probe_pos + match_len - 1]) == c01) break; + TDEFL_PROBE; TDEFL_PROBE; TDEFL_PROBE; + } + if (!dist) break; q = (const mz_uint16*)(d->m_dict + probe_pos); if (TDEFL_READ_UNALIGNED_WORD(q) != s01) continue; p = s; probe_len = 32; + do { } while ( (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && + (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (--probe_len > 0) ); + if (!probe_len) + { + *pMatch_dist 
= dist; *pMatch_len = MZ_MIN(max_match_len, TDEFL_MAX_MATCH_LEN); break; + } + else if ((probe_len = ((mz_uint)(p - s) * 2) + (mz_uint)(*(const mz_uint8*)p == *(const mz_uint8*)q)) > match_len) + { + *pMatch_dist = dist; if ((*pMatch_len = match_len = MZ_MIN(max_match_len, probe_len)) == max_match_len) break; + c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]); + } + } +} +#else +static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist, mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len) +{ + mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK, match_len = *pMatch_len, probe_pos = pos, next_probe_pos, probe_len; + mz_uint num_probes_left = d->m_max_probes[match_len >= 32]; + const mz_uint8 *s = d->m_dict + pos, *p, *q; + mz_uint8 c0 = d->m_dict[pos + match_len], c1 = d->m_dict[pos + match_len - 1]; + MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN); if (max_match_len <= match_len) return; + for ( ; ; ) + { + for ( ; ; ) + { + if (--num_probes_left == 0) return; + #define TDEFL_PROBE \ + next_probe_pos = d->m_next[probe_pos]; \ + if ((!next_probe_pos) || ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) return; \ + probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK; \ + if ((d->m_dict[probe_pos + match_len] == c0) && (d->m_dict[probe_pos + match_len - 1] == c1)) break; + TDEFL_PROBE; TDEFL_PROBE; TDEFL_PROBE; + } + if (!dist) break; p = s; q = d->m_dict + probe_pos; for (probe_len = 0; probe_len < max_match_len; probe_len++) if (*p++ != *q++) break; + if (probe_len > match_len) + { + *pMatch_dist = dist; if ((*pMatch_len = match_len = probe_len) == max_match_len) return; + c0 = d->m_dict[pos + match_len]; c1 = d->m_dict[pos + match_len - 1]; + } + } +} +#endif // #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES + +#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN +static mz_bool tdefl_compress_fast(tdefl_compressor *d) +{ + // Faster, minimally featured LZRW1-style match+parse loop with better register utilization. Intended for applications where raw throughput is valued more highly than ratio. 
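+  // The loop below hashes the next three bytes ("first_trigram") into the level-1 hash
+  // table and probes a single prior position. On a trigram hit the match is extended two
+  // bytes at a time via TDEFL_READ_UNALIGNED_WORD(); otherwise (or when the match is too
+  // short or too distant) a literal is emitted. LZ codes accumulate in m_lz_code_buf and
+  // tdefl_flush_block() is called once the buffer is nearly full.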
+ mz_uint lookahead_pos = d->m_lookahead_pos, lookahead_size = d->m_lookahead_size, dict_size = d->m_dict_size, total_lz_bytes = d->m_total_lz_bytes, num_flags_left = d->m_num_flags_left; + mz_uint8 *pLZ_code_buf = d->m_pLZ_code_buf, *pLZ_flags = d->m_pLZ_flags; + mz_uint cur_pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK; + + while ((d->m_src_buf_left) || ((d->m_flush) && (lookahead_size))) + { + const mz_uint TDEFL_COMP_FAST_LOOKAHEAD_SIZE = 4096; + mz_uint dst_pos = (lookahead_pos + lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK; + mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(d->m_src_buf_left, TDEFL_COMP_FAST_LOOKAHEAD_SIZE - lookahead_size); + d->m_src_buf_left -= num_bytes_to_process; + lookahead_size += num_bytes_to_process; + + while (num_bytes_to_process) + { + mz_uint32 n = MZ_MIN(TDEFL_LZ_DICT_SIZE - dst_pos, num_bytes_to_process); + memcpy(d->m_dict + dst_pos, d->m_pSrc, n); + if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1)) + memcpy(d->m_dict + TDEFL_LZ_DICT_SIZE + dst_pos, d->m_pSrc, MZ_MIN(n, (TDEFL_MAX_MATCH_LEN - 1) - dst_pos)); + d->m_pSrc += n; + dst_pos = (dst_pos + n) & TDEFL_LZ_DICT_SIZE_MASK; + num_bytes_to_process -= n; + } + + dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - lookahead_size, dict_size); + if ((!d->m_flush) && (lookahead_size < TDEFL_COMP_FAST_LOOKAHEAD_SIZE)) break; + + while (lookahead_size >= 4) + { + mz_uint cur_match_dist, cur_match_len = 1; + mz_uint8 *pCur_dict = d->m_dict + cur_pos; + mz_uint first_trigram = (*(const mz_uint32 *)pCur_dict) & 0xFFFFFF; + mz_uint hash = (first_trigram ^ (first_trigram >> (24 - (TDEFL_LZ_HASH_BITS - 8)))) & TDEFL_LEVEL1_HASH_SIZE_MASK; + mz_uint probe_pos = d->m_hash[hash]; + d->m_hash[hash] = (mz_uint16)lookahead_pos; + + if (((cur_match_dist = (mz_uint16)(lookahead_pos - probe_pos)) <= dict_size) && ((*(const mz_uint32 *)(d->m_dict + (probe_pos &= TDEFL_LZ_DICT_SIZE_MASK)) & 0xFFFFFF) == first_trigram)) + { + const mz_uint16 *p = (const mz_uint16 *)pCur_dict; + const mz_uint16 *q = (const mz_uint16 *)(d->m_dict + probe_pos); + mz_uint32 probe_len = 32; + do { } while ( (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && + (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (--probe_len > 0) ); + cur_match_len = ((mz_uint)(p - (const mz_uint16 *)pCur_dict) * 2) + (mz_uint)(*(const mz_uint8 *)p == *(const mz_uint8 *)q); + if (!probe_len) + cur_match_len = cur_match_dist ? TDEFL_MAX_MATCH_LEN : 0; + + if ((cur_match_len < TDEFL_MIN_MATCH_LEN) || ((cur_match_len == TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 8U*1024U))) + { + cur_match_len = 1; + *pLZ_code_buf++ = (mz_uint8)first_trigram; + *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1); + d->m_huff_count[0][(mz_uint8)first_trigram]++; + } + else + { + mz_uint32 s0, s1; + cur_match_len = MZ_MIN(cur_match_len, lookahead_size); + + MZ_ASSERT((cur_match_len >= TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 1) && (cur_match_dist <= TDEFL_LZ_DICT_SIZE)); + + cur_match_dist--; + + pLZ_code_buf[0] = (mz_uint8)(cur_match_len - TDEFL_MIN_MATCH_LEN); + *(mz_uint16 *)(&pLZ_code_buf[1]) = (mz_uint16)cur_match_dist; + pLZ_code_buf += 3; + *pLZ_flags = (mz_uint8)((*pLZ_flags >> 1) | 0x80); + + s0 = s_tdefl_small_dist_sym[cur_match_dist & 511]; + s1 = s_tdefl_large_dist_sym[cur_match_dist >> 8]; + d->m_huff_count[1][(cur_match_dist < 512) ? 
s0 : s1]++; + + d->m_huff_count[0][s_tdefl_len_sym[cur_match_len - TDEFL_MIN_MATCH_LEN]]++; + } + } + else + { + *pLZ_code_buf++ = (mz_uint8)first_trigram; + *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1); + d->m_huff_count[0][(mz_uint8)first_trigram]++; + } + + if (--num_flags_left == 0) { num_flags_left = 8; pLZ_flags = pLZ_code_buf++; } + + total_lz_bytes += cur_match_len; + lookahead_pos += cur_match_len; + dict_size = MZ_MIN(dict_size + cur_match_len, TDEFL_LZ_DICT_SIZE); + cur_pos = (cur_pos + cur_match_len) & TDEFL_LZ_DICT_SIZE_MASK; + MZ_ASSERT(lookahead_size >= cur_match_len); + lookahead_size -= cur_match_len; + + if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) + { + int n; + d->m_lookahead_pos = lookahead_pos; d->m_lookahead_size = lookahead_size; d->m_dict_size = dict_size; + d->m_total_lz_bytes = total_lz_bytes; d->m_pLZ_code_buf = pLZ_code_buf; d->m_pLZ_flags = pLZ_flags; d->m_num_flags_left = num_flags_left; + if ((n = tdefl_flush_block(d, 0)) != 0) + return (n < 0) ? MZ_FALSE : MZ_TRUE; + total_lz_bytes = d->m_total_lz_bytes; pLZ_code_buf = d->m_pLZ_code_buf; pLZ_flags = d->m_pLZ_flags; num_flags_left = d->m_num_flags_left; + } + } + + while (lookahead_size) + { + mz_uint8 lit = d->m_dict[cur_pos]; + + total_lz_bytes++; + *pLZ_code_buf++ = lit; + *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1); + if (--num_flags_left == 0) { num_flags_left = 8; pLZ_flags = pLZ_code_buf++; } + + d->m_huff_count[0][lit]++; + + lookahead_pos++; + dict_size = MZ_MIN(dict_size + 1, TDEFL_LZ_DICT_SIZE); + cur_pos = (cur_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK; + lookahead_size--; + + if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) + { + int n; + d->m_lookahead_pos = lookahead_pos; d->m_lookahead_size = lookahead_size; d->m_dict_size = dict_size; + d->m_total_lz_bytes = total_lz_bytes; d->m_pLZ_code_buf = pLZ_code_buf; d->m_pLZ_flags = pLZ_flags; d->m_num_flags_left = num_flags_left; + if ((n = tdefl_flush_block(d, 0)) != 0) + return (n < 0) ? 
MZ_FALSE : MZ_TRUE; + total_lz_bytes = d->m_total_lz_bytes; pLZ_code_buf = d->m_pLZ_code_buf; pLZ_flags = d->m_pLZ_flags; num_flags_left = d->m_num_flags_left; + } + } + } + + d->m_lookahead_pos = lookahead_pos; d->m_lookahead_size = lookahead_size; d->m_dict_size = dict_size; + d->m_total_lz_bytes = total_lz_bytes; d->m_pLZ_code_buf = pLZ_code_buf; d->m_pLZ_flags = pLZ_flags; d->m_num_flags_left = num_flags_left; + return MZ_TRUE; +} +#endif // MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN + +static MZ_FORCEINLINE void tdefl_record_literal(tdefl_compressor *d, mz_uint8 lit) +{ + d->m_total_lz_bytes++; + *d->m_pLZ_code_buf++ = lit; + *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> 1); if (--d->m_num_flags_left == 0) { d->m_num_flags_left = 8; d->m_pLZ_flags = d->m_pLZ_code_buf++; } + d->m_huff_count[0][lit]++; +} + +static MZ_FORCEINLINE void tdefl_record_match(tdefl_compressor *d, mz_uint match_len, mz_uint match_dist) +{ + mz_uint32 s0, s1; + + MZ_ASSERT((match_len >= TDEFL_MIN_MATCH_LEN) && (match_dist >= 1) && (match_dist <= TDEFL_LZ_DICT_SIZE)); + + d->m_total_lz_bytes += match_len; + + d->m_pLZ_code_buf[0] = (mz_uint8)(match_len - TDEFL_MIN_MATCH_LEN); + + match_dist -= 1; + d->m_pLZ_code_buf[1] = (mz_uint8)(match_dist & 0xFF); + d->m_pLZ_code_buf[2] = (mz_uint8)(match_dist >> 8); d->m_pLZ_code_buf += 3; + + *d->m_pLZ_flags = (mz_uint8)((*d->m_pLZ_flags >> 1) | 0x80); if (--d->m_num_flags_left == 0) { d->m_num_flags_left = 8; d->m_pLZ_flags = d->m_pLZ_code_buf++; } + + s0 = s_tdefl_small_dist_sym[match_dist & 511]; s1 = s_tdefl_large_dist_sym[(match_dist >> 8) & 127]; + d->m_huff_count[1][(match_dist < 512) ? s0 : s1]++; + + if (match_len >= TDEFL_MIN_MATCH_LEN) d->m_huff_count[0][s_tdefl_len_sym[match_len - TDEFL_MIN_MATCH_LEN]]++; +} + +static mz_bool tdefl_compress_normal(tdefl_compressor *d) +{ + const mz_uint8 *pSrc = d->m_pSrc; size_t src_buf_left = d->m_src_buf_left; + tdefl_flush flush = d->m_flush; + + while ((src_buf_left) || ((flush) && (d->m_lookahead_size))) + { + mz_uint len_to_move, cur_match_dist, cur_match_len, cur_pos; + // Update dictionary and hash chains. Keeps the lookahead size equal to TDEFL_MAX_MATCH_LEN. 
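+    // Incoming bytes are copied into the circular m_dict buffer (its first
+    // TDEFL_MAX_MATCH_LEN - 1 bytes are mirrored past the end so matches can read across
+    // the wrap point), and each 3-byte window is hashed so its position can be linked
+    // into the m_hash/m_next probe chains consumed by tdefl_find_match().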
+ if ((d->m_lookahead_size + d->m_dict_size) >= (TDEFL_MIN_MATCH_LEN - 1)) + { + mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK, ins_pos = d->m_lookahead_pos + d->m_lookahead_size - 2; + mz_uint hash = (d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] << TDEFL_LZ_HASH_SHIFT) ^ d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK]; + mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(src_buf_left, TDEFL_MAX_MATCH_LEN - d->m_lookahead_size); + const mz_uint8 *pSrc_end = pSrc + num_bytes_to_process; + src_buf_left -= num_bytes_to_process; + d->m_lookahead_size += num_bytes_to_process; + while (pSrc != pSrc_end) + { + mz_uint8 c = *pSrc++; d->m_dict[dst_pos] = c; if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1)) d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c; + hash = ((hash << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1); + d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash]; d->m_hash[hash] = (mz_uint16)(ins_pos); + dst_pos = (dst_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK; ins_pos++; + } + } + else + { + while ((src_buf_left) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN)) + { + mz_uint8 c = *pSrc++; + mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK; + src_buf_left--; + d->m_dict[dst_pos] = c; + if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1)) + d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c; + if ((++d->m_lookahead_size + d->m_dict_size) >= TDEFL_MIN_MATCH_LEN) + { + mz_uint ins_pos = d->m_lookahead_pos + (d->m_lookahead_size - 1) - 2; + mz_uint hash = ((d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] << (TDEFL_LZ_HASH_SHIFT * 2)) ^ (d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK] << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1); + d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash]; d->m_hash[hash] = (mz_uint16)(ins_pos); + } + } + } + d->m_dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - d->m_lookahead_size, d->m_dict_size); + if ((!flush) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN)) + break; + + // Simple lazy/greedy parsing state machine. + len_to_move = 1; cur_match_dist = 0; cur_match_len = d->m_saved_match_len ? 
d->m_saved_match_len : (TDEFL_MIN_MATCH_LEN - 1); cur_pos = d->m_lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK; + if (d->m_flags & (TDEFL_RLE_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS)) + { + if ((d->m_dict_size) && (!(d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))) + { + mz_uint8 c = d->m_dict[(cur_pos - 1) & TDEFL_LZ_DICT_SIZE_MASK]; + cur_match_len = 0; while (cur_match_len < d->m_lookahead_size) { if (d->m_dict[cur_pos + cur_match_len] != c) break; cur_match_len++; } + if (cur_match_len < TDEFL_MIN_MATCH_LEN) cur_match_len = 0; else cur_match_dist = 1; + } + } + else + { + tdefl_find_match(d, d->m_lookahead_pos, d->m_dict_size, d->m_lookahead_size, &cur_match_dist, &cur_match_len); + } + if (((cur_match_len == TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 8U*1024U)) || (cur_pos == cur_match_dist) || ((d->m_flags & TDEFL_FILTER_MATCHES) && (cur_match_len <= 5))) + { + cur_match_dist = cur_match_len = 0; + } + if (d->m_saved_match_len) + { + if (cur_match_len > d->m_saved_match_len) + { + tdefl_record_literal(d, (mz_uint8)d->m_saved_lit); + if (cur_match_len >= 128) + { + tdefl_record_match(d, cur_match_len, cur_match_dist); + d->m_saved_match_len = 0; len_to_move = cur_match_len; + } + else + { + d->m_saved_lit = d->m_dict[cur_pos]; d->m_saved_match_dist = cur_match_dist; d->m_saved_match_len = cur_match_len; + } + } + else + { + tdefl_record_match(d, d->m_saved_match_len, d->m_saved_match_dist); + len_to_move = d->m_saved_match_len - 1; d->m_saved_match_len = 0; + } + } + else if (!cur_match_dist) + tdefl_record_literal(d, d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)]); + else if ((d->m_greedy_parsing) || (d->m_flags & TDEFL_RLE_MATCHES) || (cur_match_len >= 128)) + { + tdefl_record_match(d, cur_match_len, cur_match_dist); + len_to_move = cur_match_len; + } + else + { + d->m_saved_lit = d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)]; d->m_saved_match_dist = cur_match_dist; d->m_saved_match_len = cur_match_len; + } + // Move the lookahead forward by len_to_move bytes. + d->m_lookahead_pos += len_to_move; + MZ_ASSERT(d->m_lookahead_size >= len_to_move); + d->m_lookahead_size -= len_to_move; + d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, TDEFL_LZ_DICT_SIZE); + // Check if it's time to flush the current LZ codes to the internal output buffer. + if ( (d->m_pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) || + ( (d->m_total_lz_bytes > 31*1024) && (((((mz_uint)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >= d->m_total_lz_bytes) || (d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))) ) + { + int n; + d->m_pSrc = pSrc; d->m_src_buf_left = src_buf_left; + if ((n = tdefl_flush_block(d, 0)) != 0) + return (n < 0) ? MZ_FALSE : MZ_TRUE; + } + } + + d->m_pSrc = pSrc; d->m_src_buf_left = src_buf_left; + return MZ_TRUE; +} + +static tdefl_status tdefl_flush_output_buffer(tdefl_compressor *d) +{ + if (d->m_pIn_buf_size) + { + *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf; + } + + if (d->m_pOut_buf_size) + { + size_t n = MZ_MIN(*d->m_pOut_buf_size - d->m_out_buf_ofs, d->m_output_flush_remaining); + memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf + d->m_output_flush_ofs, n); + d->m_output_flush_ofs += (mz_uint)n; + d->m_output_flush_remaining -= (mz_uint)n; + d->m_out_buf_ofs += n; + + *d->m_pOut_buf_size = d->m_out_buf_ofs; + } + + return (d->m_finished && !d->m_output_flush_remaining) ? 
TDEFL_STATUS_DONE : TDEFL_STATUS_OKAY; +} + +tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush) +{ + if (!d) + { + if (pIn_buf_size) *pIn_buf_size = 0; + if (pOut_buf_size) *pOut_buf_size = 0; + return TDEFL_STATUS_BAD_PARAM; + } + + d->m_pIn_buf = pIn_buf; d->m_pIn_buf_size = pIn_buf_size; + d->m_pOut_buf = pOut_buf; d->m_pOut_buf_size = pOut_buf_size; + d->m_pSrc = (const mz_uint8 *)(pIn_buf); d->m_src_buf_left = pIn_buf_size ? *pIn_buf_size : 0; + d->m_out_buf_ofs = 0; + d->m_flush = flush; + + if ( ((d->m_pPut_buf_func != NULL) == ((pOut_buf != NULL) || (pOut_buf_size != NULL))) || (d->m_prev_return_status != TDEFL_STATUS_OKAY) || + (d->m_wants_to_finish && (flush != TDEFL_FINISH)) || (pIn_buf_size && *pIn_buf_size && !pIn_buf) || (pOut_buf_size && *pOut_buf_size && !pOut_buf) ) + { + if (pIn_buf_size) *pIn_buf_size = 0; + if (pOut_buf_size) *pOut_buf_size = 0; + return (d->m_prev_return_status = TDEFL_STATUS_BAD_PARAM); + } + d->m_wants_to_finish |= (flush == TDEFL_FINISH); + + if ((d->m_output_flush_remaining) || (d->m_finished)) + return (d->m_prev_return_status = tdefl_flush_output_buffer(d)); + +#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN + if (((d->m_flags & TDEFL_MAX_PROBES_MASK) == 1) && + ((d->m_flags & TDEFL_GREEDY_PARSING_FLAG) != 0) && + ((d->m_flags & (TDEFL_FILTER_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS | TDEFL_RLE_MATCHES)) == 0)) + { + if (!tdefl_compress_fast(d)) + return d->m_prev_return_status; + } + else +#endif // #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN + { + if (!tdefl_compress_normal(d)) + return d->m_prev_return_status; + } + + if ((d->m_flags & (TDEFL_WRITE_ZLIB_HEADER | TDEFL_COMPUTE_ADLER32)) && (pIn_buf)) + d->m_adler32 = (mz_uint32)mz_adler32(d->m_adler32, (const mz_uint8 *)pIn_buf, d->m_pSrc - (const mz_uint8 *)pIn_buf); + + if ((flush) && (!d->m_lookahead_size) && (!d->m_src_buf_left) && (!d->m_output_flush_remaining)) + { + if (tdefl_flush_block(d, flush) < 0) + return d->m_prev_return_status; + d->m_finished = (flush == TDEFL_FINISH); + if (flush == TDEFL_FULL_FLUSH) { MZ_CLEAR_OBJ(d->m_hash); MZ_CLEAR_OBJ(d->m_next); d->m_dict_size = 0; } + } + + return (d->m_prev_return_status = tdefl_flush_output_buffer(d)); +} + +tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush) +{ + MZ_ASSERT(d->m_pPut_buf_func); return tdefl_compress(d, pIn_buf, &in_buf_size, NULL, NULL, flush); +} + +tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags) +{ + d->m_pPut_buf_func = pPut_buf_func; d->m_pPut_buf_user = pPut_buf_user; + d->m_flags = (mz_uint)(flags); d->m_max_probes[0] = 1 + ((flags & 0xFFF) + 2) / 3; d->m_greedy_parsing = (flags & TDEFL_GREEDY_PARSING_FLAG) != 0; + d->m_max_probes[1] = 1 + (((flags & 0xFFF) >> 2) + 2) / 3; + if (!(flags & TDEFL_NONDETERMINISTIC_PARSING_FLAG)) MZ_CLEAR_OBJ(d->m_hash); + d->m_lookahead_pos = d->m_lookahead_size = d->m_dict_size = d->m_total_lz_bytes = d->m_lz_code_buf_dict_pos = d->m_bits_in = 0; + d->m_output_flush_ofs = d->m_output_flush_remaining = d->m_finished = d->m_block_index = d->m_bit_buffer = d->m_wants_to_finish = 0; + d->m_pLZ_code_buf = d->m_lz_code_buf + 1; d->m_pLZ_flags = d->m_lz_code_buf; d->m_num_flags_left = 8; + d->m_pOutput_buf = d->m_output_buf; d->m_pOutput_buf_end = d->m_output_buf; d->m_prev_return_status = TDEFL_STATUS_OKAY; + d->m_saved_match_dist 
= d->m_saved_match_len = d->m_saved_lit = 0; d->m_adler32 = 1; + d->m_pIn_buf = NULL; d->m_pOut_buf = NULL; + d->m_pIn_buf_size = NULL; d->m_pOut_buf_size = NULL; + d->m_flush = TDEFL_NO_FLUSH; d->m_pSrc = NULL; d->m_src_buf_left = 0; d->m_out_buf_ofs = 0; + memset(&d->m_huff_count[0][0], 0, sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0); + memset(&d->m_huff_count[1][0], 0, sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1); + return TDEFL_STATUS_OKAY; +} + +tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d) +{ + return d->m_prev_return_status; +} + +mz_uint32 tdefl_get_adler32(tdefl_compressor *d) +{ + return d->m_adler32; +} + +mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags) +{ + tdefl_compressor *pComp; mz_bool succeeded; if (((buf_len) && (!pBuf)) || (!pPut_buf_func)) return MZ_FALSE; + pComp = (tdefl_compressor*)MZ_MALLOC(sizeof(tdefl_compressor)); if (!pComp) return MZ_FALSE; + succeeded = (tdefl_init(pComp, pPut_buf_func, pPut_buf_user, flags) == TDEFL_STATUS_OKAY); + succeeded = succeeded && (tdefl_compress_buffer(pComp, pBuf, buf_len, TDEFL_FINISH) == TDEFL_STATUS_DONE); + MZ_FREE(pComp); return succeeded; +} + +typedef struct +{ + size_t m_size, m_capacity; + mz_uint8 *m_pBuf; + mz_bool m_expandable; +} tdefl_output_buffer; + +static mz_bool tdefl_output_buffer_putter(const void *pBuf, int len, void *pUser) +{ + tdefl_output_buffer *p = (tdefl_output_buffer *)pUser; + size_t new_size = p->m_size + len; + if (new_size > p->m_capacity) + { + size_t new_capacity = p->m_capacity; mz_uint8 *pNew_buf; if (!p->m_expandable) return MZ_FALSE; + do { new_capacity = MZ_MAX(128U, new_capacity << 1U); } while (new_size > new_capacity); + pNew_buf = (mz_uint8*)MZ_REALLOC(p->m_pBuf, new_capacity); if (!pNew_buf) return MZ_FALSE; + p->m_pBuf = pNew_buf; p->m_capacity = new_capacity; + } + memcpy((mz_uint8*)p->m_pBuf + p->m_size, pBuf, len); p->m_size = new_size; + return MZ_TRUE; +} + +void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags) +{ + tdefl_output_buffer out_buf; MZ_CLEAR_OBJ(out_buf); + if (!pOut_len) return MZ_FALSE; else *pOut_len = 0; + out_buf.m_expandable = MZ_TRUE; + if (!tdefl_compress_mem_to_output(pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &out_buf, flags)) return NULL; + *pOut_len = out_buf.m_size; return out_buf.m_pBuf; +} + +size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags) +{ + tdefl_output_buffer out_buf; MZ_CLEAR_OBJ(out_buf); + if (!pOut_buf) return 0; + out_buf.m_pBuf = (mz_uint8*)pOut_buf; out_buf.m_capacity = out_buf_len; + if (!tdefl_compress_mem_to_output(pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &out_buf, flags)) return 0; + return out_buf.m_size; +} + +#ifndef MINIZ_NO_ZLIB_APIS +static const mz_uint s_tdefl_num_probes[11] = { 0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500 }; + +// level may actually range from [0,10] (10 is a "hidden" max level, where we want a bit more compression and it's fine if throughput to fall off a cliff on some files). +mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy) +{ + mz_uint comp_flags = s_tdefl_num_probes[(level >= 0) ? MZ_MIN(10, level) : MZ_DEFAULT_LEVEL] | ((level <= 3) ? 
TDEFL_GREEDY_PARSING_FLAG : 0); + if (window_bits > 0) comp_flags |= TDEFL_WRITE_ZLIB_HEADER; + + if (!level) comp_flags |= TDEFL_FORCE_ALL_RAW_BLOCKS; + else if (strategy == MZ_FILTERED) comp_flags |= TDEFL_FILTER_MATCHES; + else if (strategy == MZ_HUFFMAN_ONLY) comp_flags &= ~TDEFL_MAX_PROBES_MASK; + else if (strategy == MZ_FIXED) comp_flags |= TDEFL_FORCE_ALL_STATIC_BLOCKS; + else if (strategy == MZ_RLE) comp_flags |= TDEFL_RLE_MATCHES; + + return comp_flags; +} +#endif //MINIZ_NO_ZLIB_APIS + +#ifdef _MSC_VER +#pragma warning (push) +#pragma warning (disable:4204) // nonstandard extension used : non-constant aggregate initializer (also supported by GNU C and C99, so no big deal) +#endif + +// Simple PNG writer function by Alex Evans, 2011. Released into the public domain: https://gist.github.com/908299, more context at +// http://altdevblogaday.org/2011/04/06/a-smaller-jpg-encoder/. +// This is actually a modification of Alex's original code so PNG files generated by this function pass pngcheck. +void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w, int h, int num_chans, size_t *pLen_out, mz_uint level, mz_bool flip) +{ + // Using a local copy of this array here in case MINIZ_NO_ZLIB_APIS was defined. + static const mz_uint s_tdefl_png_num_probes[11] = { 0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500 }; + tdefl_compressor *pComp = (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor)); tdefl_output_buffer out_buf; int i, bpl = w * num_chans, y, z; mz_uint32 c; *pLen_out = 0; + if (!pComp) return NULL; + MZ_CLEAR_OBJ(out_buf); out_buf.m_expandable = MZ_TRUE; out_buf.m_capacity = 57+MZ_MAX(64, (1+bpl)*h); if (NULL == (out_buf.m_pBuf = (mz_uint8*)MZ_MALLOC(out_buf.m_capacity))) { MZ_FREE(pComp); return NULL; } + // write dummy header + for (z = 41; z; --z) tdefl_output_buffer_putter(&z, 1, &out_buf); + // compress image data + tdefl_init(pComp, tdefl_output_buffer_putter, &out_buf, s_tdefl_png_num_probes[MZ_MIN(10, level)] | TDEFL_WRITE_ZLIB_HEADER); + for (y = 0; y < h; ++y) { tdefl_compress_buffer(pComp, &z, 1, TDEFL_NO_FLUSH); tdefl_compress_buffer(pComp, (mz_uint8*)pImage + (flip ? 
(h - 1 - y) : y) * bpl, bpl, TDEFL_NO_FLUSH); } + if (tdefl_compress_buffer(pComp, NULL, 0, TDEFL_FINISH) != TDEFL_STATUS_DONE) { MZ_FREE(pComp); MZ_FREE(out_buf.m_pBuf); return NULL; } + // write real header + *pLen_out = out_buf.m_size-41; + { + static const mz_uint8 chans[] = {0x00, 0x00, 0x04, 0x02, 0x06}; + mz_uint8 pnghdr[41]={0x89,0x50,0x4e,0x47,0x0d,0x0a,0x1a,0x0a,0x00,0x00,0x00,0x0d,0x49,0x48,0x44,0x52, + 0,0,(mz_uint8)(w>>8),(mz_uint8)w,0,0,(mz_uint8)(h>>8),(mz_uint8)h,8,chans[num_chans],0,0,0,0,0,0,0, + (mz_uint8)(*pLen_out>>24),(mz_uint8)(*pLen_out>>16),(mz_uint8)(*pLen_out>>8),(mz_uint8)*pLen_out,0x49,0x44,0x41,0x54}; + c=(mz_uint32)mz_crc32(MZ_CRC32_INIT,pnghdr+12,17); for (i=0; i<4; ++i, c<<=8) ((mz_uint8*)(pnghdr+29))[i]=(mz_uint8)(c>>24); + memcpy(out_buf.m_pBuf, pnghdr, 41); + } + // write footer (IDAT CRC-32, followed by IEND chunk) + if (!tdefl_output_buffer_putter("\0\0\0\0\0\0\0\0\x49\x45\x4e\x44\xae\x42\x60\x82", 16, &out_buf)) { *pLen_out = 0; MZ_FREE(pComp); MZ_FREE(out_buf.m_pBuf); return NULL; } + c = (mz_uint32)mz_crc32(MZ_CRC32_INIT,out_buf.m_pBuf+41-4, *pLen_out+4); for (i=0; i<4; ++i, c<<=8) (out_buf.m_pBuf+out_buf.m_size-16)[i] = (mz_uint8)(c >> 24); + // compute final size of file, grab compressed data buffer and return + *pLen_out += 57; MZ_FREE(pComp); return out_buf.m_pBuf; +} +void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h, int num_chans, size_t *pLen_out) +{ + // Level 6 corresponds to TDEFL_DEFAULT_MAX_PROBES or MZ_DEFAULT_LEVEL (but we can't depend on MZ_DEFAULT_LEVEL being available in case the zlib API's where #defined out) + return tdefl_write_image_to_png_file_in_memory_ex(pImage, w, h, num_chans, pLen_out, 6, MZ_FALSE); +} + +#ifdef _MSC_VER +#pragma warning (pop) +#endif + +} // namespace buminiz + +#endif // MINIZ_HEADER_FILE_ONLY + diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp new file mode 100644 index 0000000000..596fc197e6 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp @@ -0,0 +1,564 @@ +// basisu_pvrtc1_4.cpp +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "basisu_pvrtc1_4.h" + +namespace basisu +{ +#if 0 + static const uint8_t g_pvrtc_5[32] = { 0,8,16,24,33,41,49,57,66,74,82,90,99,107,115,123,132,140,148,156,165,173,181,189,198,206,214,222,231,239,247,255 }; + static const uint8_t g_pvrtc_4[16] = { 0,16,33,49,66,82,99,115,140,156,173,189,206,222,239,255 }; + static const uint8_t g_pvrtc_3[8] = { 0,33,74,107,148,181,222,255 }; + static const uint8_t g_pvrtc_alpha[9] = { 0,34,68,102,136,170,204,238,255 }; +#endif + + static const uint8_t g_pvrtc_5_nearest[256] = { 0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31 }; + static const uint8_t g_pvrtc_4_nearest[256] = { 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15 }; +#if 0 + static const uint8_t g_pvrtc_3_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 }; + static const uint8_t g_pvrtc_alpha_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8 }; +#endif + +#if 0 + static const uint8_t g_pvrtc_5_floor[256] = + { + 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3, + 3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7, + 7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11, + 11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15, + 15,15,15,15,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19, + 
19,19,19,19,19,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23, + 23,23,23,23,23,23,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27, + 27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31 + }; + + static const uint8_t g_pvrtc_5_ceil[256] = + { + 0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4, + 4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8, + 8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12, + 12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16, + 16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20, + 20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24, + 24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28, + 28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31,31,31,31,31 + }; + + static const uint8_t g_pvrtc_4_floor[256] = + { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11, + 11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13, + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15 + }; + + static const uint8_t g_pvrtc_4_ceil[256] = + { + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10, + 10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12, + 12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14, + 14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 + }; + + static const uint8_t g_pvrtc_3_floor[256] = + { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7 + }; + + static const uint8_t g_pvrtc_3_ceil[256] = + { + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7, + 
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 + }; + + static const uint8_t g_pvrtc_alpha_floor[256] = + { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8 + }; + + static const uint8_t g_pvrtc_alpha_ceil[256] = + { + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + }; +#endif + + uint32_t pvrtc4_swizzle_uv(uint32_t width, uint32_t height, uint32_t x, uint32_t y) + { + assert((x < width) && (y < height) && basisu::is_pow2(height) && basisu::is_pow2(width)); + + uint32_t min_d = width, max_v = y; + if (height < width) + { + min_d = height; + max_v = x; + } + + // Interleave the XY LSB's + uint32_t shift_ofs = 0, swizzled = 0; + for (uint32_t s_bit = 1, d_bit = 1; s_bit < min_d; s_bit <<= 1, d_bit <<= 2, ++shift_ofs) + { + if (y & s_bit) swizzled |= d_bit; + if (x & s_bit) swizzled |= (2 * d_bit); + } + + max_v >>= shift_ofs; + + // OR in the rest of the bits from the largest dimension + swizzled |= (max_v << (2 * shift_ofs)); + + return swizzled; + } + + color_rgba pvrtc4_block::get_endpoint(uint32_t endpoint_index, bool unpack) const + { + assert(endpoint_index < 2); + const uint32_t packed = m_endpoints >> (endpoint_index * 16); + + uint32_t r, g, b, a; + if (packed & 0x8000) + { + // opaque 554 or 555 + if (!endpoint_index) + { + r = (packed >> 10) & 31; + g = (packed >> 5) & 31; + b = (packed >> 1) & 15; + + if (unpack) + { + b = (b << 1) | (b >> 3); + } + } + else + { + r = (packed >> 10) & 31; + g = (packed >> 5) & 31; + b = packed & 31; + } + + a = unpack ? 
255 : 7; + } + else + { + // translucent 4433 or 4443 + if (!endpoint_index) + { + a = (packed >> 12) & 7; + r = (packed >> 8) & 15; + g = (packed >> 4) & 15; + b = (packed >> 1) & 7; + + if (unpack) + { + a = (a << 1); + a = (a << 4) | a; + + r = (r << 1) | (r >> 3); + g = (g << 1) | (g >> 3); + b = (b << 2) | (b >> 1); + } + } + else + { + a = (packed >> 12) & 7; + r = (packed >> 8) & 15; + g = (packed >> 4) & 15; + b = packed & 15; + + if (unpack) + { + a = (a << 1); + a = (a << 4) | a; + + r = (r << 1) | (r >> 3); + g = (g << 1) | (g >> 3); + b = (b << 1) | (b >> 3); + } + } + } + + if (unpack) + { + r = (r << 3) | (r >> 2); + g = (g << 3) | (g >> 2); + b = (b << 3) | (b >> 2); + } + + assert((r < 256) && (g < 256) && (b < 256) && (a < 256)); + + return color_rgba(r, g, b, a); + } + + color_rgba pvrtc4_block::get_endpoint_5554(uint32_t endpoint_index) const + { + assert(endpoint_index < 2); + const uint32_t packed = m_endpoints >> (endpoint_index * 16); + + uint32_t r, g, b, a; + if (packed & 0x8000) + { + // opaque 554 or 555 + if (!endpoint_index) + { + r = (packed >> 10) & 31; + g = (packed >> 5) & 31; + b = (packed >> 1) & 15; + + b = (b << 1) | (b >> 3); + } + else + { + r = (packed >> 10) & 31; + g = (packed >> 5) & 31; + b = packed & 31; + } + + a = 15; + } + else + { + // translucent 4433 or 4443 + if (!endpoint_index) + { + a = (packed >> 12) & 7; + r = (packed >> 8) & 15; + g = (packed >> 4) & 15; + b = (packed >> 1) & 7; + + a = a << 1; + + r = (r << 1) | (r >> 3); + g = (g << 1) | (g >> 3); + b = (b << 2) | (b >> 1); + } + else + { + a = (packed >> 12) & 7; + r = (packed >> 8) & 15; + g = (packed >> 4) & 15; + b = packed & 15; + + a = a << 1; + + r = (r << 1) | (r >> 3); + g = (g << 1) | (g >> 3); + b = (b << 1) | (b >> 3); + } + } + + assert((r < 32) && (g < 32) && (b < 32) && (a < 16)); + + return color_rgba(r, g, b, a); + } + + bool pvrtc4_image::get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const + { + assert((x < m_width) && (y < m_height)); + + int block_x0 = (static_cast<int>(x) - 2) >> 2; + int block_x1 = block_x0 + 1; + int block_y0 = (static_cast<int>(y) - 2) >> 2; + int block_y1 = block_y0 + 1; + + block_x0 = posmod(block_x0, m_block_width); + block_x1 = posmod(block_x1, m_block_width); + block_y0 = posmod(block_y0, m_block_height); + block_y1 = posmod(block_y1, m_block_height); + + pColors[0] = interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)); + pColors[3] = interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)); + + if (get_block_uses_transparent_modulation(x >> 2, y >> 2)) + { + for (uint32_t c = 0; c < 4; c++) + { + uint32_t m = (pColors[0][c] + pColors[3][c]) / 2; + pColors[1][c] = static_cast<uint8_t>(m); + pColors[2][c] = static_cast<uint8_t>(m); + } + pColors[2][3] = 0; + return true; + } + + for (uint32_t c = 0; c < 4; c++) + { + pColors[1][c] = static_cast<uint8_t>((pColors[0][c] * 5 + pColors[3][c] * 3) / 8); + pColors[2][c] = static_cast<uint8_t>((pColors[0][c] * 3 + pColors[3][c] * 5) / 8); + } + + return false; + } + + color_rgba pvrtc4_image::get_pixel(uint32_t x, uint32_t y, uint32_t m) const + { + assert((x < m_width) && (y < m_height)); + + int block_x0 = (static_cast<int>(x) - 2) >> 2; + 
int block_x1 = block_x0 + 1; + int block_y0 = (static_cast<int>(y) - 2) >> 2; + int block_y1 = block_y0 + 1; + + block_x0 = posmod(block_x0, m_block_width); + block_x1 = posmod(block_x1, m_block_width); + block_y0 = posmod(block_y0, m_block_height); + block_y1 = posmod(block_y1, m_block_height); + + if (get_block_uses_transparent_modulation(x >> 2, y >> 2)) + { + if (m == 0) + return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)); + else if (m == 3) + return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)); + + color_rgba l(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0))); + color_rgba h(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1))); + + return color_rgba((l[0] + h[0]) / 2, (l[1] + h[1]) / 2, (l[2] + h[2]) / 2, (m == 2) ? 0 : (l[3] + h[3]) / 2); + } + else + { + if (m == 0) + return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)); + else if (m == 3) + return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)); + + color_rgba l(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0))); + color_rgba h(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1))); + + if (m == 2) + return color_rgba((l[0] * 3 + h[0] * 5) / 8, (l[1] * 3 + h[1] * 5) / 8, (l[2] * 3 + h[2] * 5) / 8, (l[3] * 3 + h[3] * 5) / 8); + else + return color_rgba((l[0] * 5 + h[0] * 3) / 8, (l[1] * 5 + h[1] * 3) / 8, (l[2] * 5 + h[2] * 3) / 8, (l[3] * 5 + h[3] * 3) / 8); + } + } + + uint64_t pvrtc4_image::local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual) + { + uint64_t initial_error = evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false); + if (!initial_error) + return initial_error; + + vec3F c_avg_orig(0); + + for (int y = 0; y < 7; y++) + { + const uint32_t py = wrap_y(by * 4 + y - 1); + for (uint32_t x = 0; x < 7; x++) + { + const uint32_t px = wrap_x(bx * 4 + x - 1); + + const color_rgba& c = orig_img(px, py); + + c_avg_orig[0] += c[0]; + c_avg_orig[1] += c[1]; + c_avg_orig[2] += c[2]; + } + } + + c_avg_orig *= 1.0f / 49.0f; + + vec3F quant_colors[2]; + quant_colors[0].set(c_avg_orig); + quant_colors[0] -= vec3F(.0125f); + + quant_colors[1].set(c_avg_orig); + quant_colors[1] += vec3F(.0125f); + + float 
total_weight[2]; + + bool success = true; + + for (uint32_t pass = 0; pass < 4; pass++) + { + vec3F new_colors[2] = { vec3F(0), vec3F(0) }; + memset(total_weight, 0, sizeof(total_weight)); + + static const float s_weights[7][7] = + { + { 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f }, + { 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f }, + { 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f }, + { 2.242640f, 3.242640f, 4.242640f, 5.000000f, 4.242640f, 3.242640f, 2.242640f }, + { 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f }, + { 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f }, + { 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f } + }; + + for (int y = 0; y < 7; y++) + { + const uint32_t py = wrap_y(by * 4 + y - 1); + for (uint32_t x = 0; x < 7; x++) + { + const uint32_t px = wrap_x(bx * 4 + x - 1); + + const color_rgba& orig_c = orig_img(px, py); + + vec3F color(orig_c[0], orig_c[1], orig_c[2]); + + uint32_t c = quant_colors[0].squared_distance(color) > quant_colors[1].squared_distance(color); + + const float weight = s_weights[y][x]; + new_colors[c] += color * weight; + + total_weight[c] += weight; + } + } + + if (!total_weight[0] || !total_weight[1]) + success = false; + + quant_colors[0] = new_colors[0] / (float)total_weight[0]; + quant_colors[1] = new_colors[1] / (float)total_weight[1]; + } + + if (!success) + { + quant_colors[0] = c_avg_orig; + quant_colors[1] = c_avg_orig; + } + + vec4F colors[2] = { quant_colors[0], quant_colors[1] }; + + colors[0] += vec3F(.5f); + colors[1] += vec3F(.5f); + color_rgba color_0((int)colors[0][0], (int)colors[0][1], (int)colors[0][2], 0); + color_rgba color_1((int)colors[1][0], (int)colors[1][1], (int)colors[1][2], 0); + + pvrtc4_block cur_blocks[3][3]; + + for (int y = -1; y <= 1; y++) + { + for (int x = -1; x <= 1; x++) + { + const uint32_t block_x = wrap_block_x(bx + x); + const uint32_t block_y = wrap_block_y(by + y); + cur_blocks[x + 1][y + 1] = m_blocks(block_x, block_y); + } + } + + color_rgba l1(0), h1(0); + + l1[0] = g_pvrtc_5_nearest[color_0[0]]; + h1[0] = g_pvrtc_5_nearest[color_1[0]]; + + l1[1] = g_pvrtc_5_nearest[color_0[1]]; + h1[1] = g_pvrtc_5_nearest[color_1[1]]; + + l1[2] = g_pvrtc_4_nearest[color_0[2]]; + h1[2] = g_pvrtc_5_nearest[color_0[2]]; + + l1[3] = 0; + h1[3] = 0; + + m_blocks(bx, by).set_endpoint_raw(0, l1, true); + m_blocks(bx, by).set_endpoint_raw(1, h1, true); + + uint64_t e03_err_0 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false); + + pvrtc4_block blocks0[3][3]; + for (int y = -1; y <= 1; y++) + { + for (int x = -1; x <= 1; x++) + { + const uint32_t block_x = wrap_block_x(bx + x); + const uint32_t block_y = wrap_block_y(by + y); + blocks0[x + 1][y + 1] = m_blocks(block_x, block_y); + } + } + + l1[0] = g_pvrtc_5_nearest[color_1[0]]; + h1[0] = g_pvrtc_5_nearest[color_0[0]]; + + l1[1] = g_pvrtc_5_nearest[color_1[1]]; + h1[1] = g_pvrtc_5_nearest[color_0[1]]; + + l1[2] = g_pvrtc_4_nearest[color_1[2]]; + h1[2] = g_pvrtc_5_nearest[color_0[2]]; + + l1[3] = 0; + h1[3] = 0; + + m_blocks(bx, by).set_endpoint_raw(0, l1, true); + m_blocks(bx, by).set_endpoint_raw(1, h1, true); + + uint64_t e03_err_1 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false); + + if (initial_error < basisu::minimum(e03_err_0, e03_err_1)) + { + for (int y = -1; y <= 1; y++) + { + for (int x = -1; x <= 1; x++) + { + const uint32_t block_x = 
wrap_block_x(bx + x); + const uint32_t block_y = wrap_block_y(by + y); + m_blocks(block_x, block_y) = cur_blocks[x + 1][y + 1]; + } + } + return initial_error; + } + else if (e03_err_0 < e03_err_1) + { + for (int y = -1; y <= 1; y++) + { + for (int x = -1; x <= 1; x++) + { + const uint32_t block_x = wrap_block_x(bx + x); + const uint32_t block_y = wrap_block_y(by + y); + m_blocks(block_x, block_y) = blocks0[x + 1][y + 1]; + } + } + assert(e03_err_0 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false)); + return e03_err_0; + } + + assert(e03_err_1 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false)); + return e03_err_1; + } + +} // basisu diff --git a/thirdparty/basis_universal/basisu_pvrtc1_4.h b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h index 80b4413351..db6985a439 100644 --- a/thirdparty/basis_universal/basisu_pvrtc1_4.h +++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h @@ -1,5 +1,5 @@ // basisu_pvrtc1_4.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -87,6 +87,14 @@ namespace basisu return (m_modulation >> ((y * 4 + x) * 2)) & 3; } + inline void set_modulation(uint32_t x, uint32_t y, uint32_t s) + { + assert((x < 4) && (y < 4) && (s < 4)); + uint32_t n = (y * 4 + x) * 2; + m_modulation = (m_modulation & (~(3 << n))) | (s << n); + assert(get_modulation(x, y) == s); + } + // Scaled by 8 inline const uint32_t* get_scaled_modulation_values(bool block_uses_transparent_modulation) const { @@ -107,7 +115,7 @@ namespace basisu } // opaque endpoints: 554, 555 - // transparent endpoints: 3443 or 3444 + // transparent endpoints: 3443, 3444 inline void set_endpoint_raw(uint32_t endpoint_index, const color_rgba& c, bool opaque_endpoint) { assert(endpoint_index < 2); @@ -352,7 +360,93 @@ namespace basisu return result; } - + + inline void set_modulation(uint32_t x, uint32_t y, uint32_t s) + { + assert((x < m_width) && (y < m_height)); + return m_blocks(x >> 2, y >> 2).set_modulation(x & 3, y & 3, s); + } + + inline uint64_t map_pixel(uint32_t x, uint32_t y, const color_rgba& c, bool perceptual, bool alpha_is_significant, bool record = true) + { + color_rgba v[4]; + get_interpolated_colors(x, y, v); + + uint64_t best_dist = color_distance(perceptual, c, v[0], alpha_is_significant); + uint32_t best_v = 0; + for (uint32_t i = 1; i < 4; i++) + { + uint64_t dist = color_distance(perceptual, c, v[i], alpha_is_significant); + if (dist < best_dist) + { + best_dist = dist; + best_v = i; + } + } + + if (record) + set_modulation(x, y, best_v); + + return best_dist; + } + + inline uint64_t remap_pixels_influenced_by_endpoint(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant) + { + uint64_t total_error = 0; + + for (int yd = -3; yd <= 3; yd++) + { + const int y = wrap_y((int)by * 4 + 2 + yd); + + for (int xd = -3; xd <= 3; xd++) + { + const int x = wrap_x((int)bx * 4 + 2 + xd); + + total_error += map_pixel(x, y, orig_img(x, y), perceptual, alpha_is_significant); + } + } + + return total_error; + } + + inline uint64_t evaluate_1x1_endpoint_error(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant, uint64_t threshold_error = 0) const + { + uint64_t total_error = 0; + + for (int yd = -3; yd <= 3; yd++) + { + const int y = wrap_y((int)by * 4 + 2 + yd); + + for (int xd = 
-3; xd <= 3; xd++) + { + const int x = wrap_x((int)bx * 4 + 2 + xd); + + total_error += color_distance(perceptual, get_pixel(x, y), orig_img(x, y), alpha_is_significant); + + if ((threshold_error) && (total_error >= threshold_error)) + return total_error; + } + } + + return total_error; + } + + uint64_t local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual); + + inline uint64_t map_all_pixels(const image& img, bool perceptual, bool alpha_is_significant) + { + assert(m_width == img.get_width()); + assert(m_height == img.get_height()); + + uint64_t total_error = 0; + for (uint32_t y = 0; y < img.get_height(); y++) + for (uint32_t x = 0; x < img.get_width(); x++) + total_error += map_pixel(x, y, img(x, y), perceptual, alpha_is_significant); + + return total_error; + } + + public: uint32_t m_width, m_height; pvrtc4_block_vector2D m_blocks; uint32_t m_block_width, m_block_height; diff --git a/thirdparty/basis_universal/basisu_resample_filters.cpp b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp index d0b2fd77bb..597cb3f618 100644 --- a/thirdparty/basis_universal/basisu_resample_filters.cpp +++ b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp @@ -1,5 +1,5 @@ // basisu_resampler_filters.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -283,7 +283,7 @@ namespace basisu return sum; } - static const float KAISER_ALPHA = 4.0; + //static const float KAISER_ALPHA = 4.0; static double kaiser(double alpha, double half_width, double x) { const double ratio = (x / half_width); @@ -310,10 +310,22 @@ namespace basisu const resample_filter g_resample_filters[] = { - { "box", box_filter, BOX_FILTER_SUPPORT }, { "tent", tent_filter, TENT_FILTER_SUPPORT }, { "bell", bell_filter, BELL_SUPPORT }, { "b-spline", B_spline_filter, B_SPLINE_SUPPORT }, - { "mitchell", mitchell_filter, MITCHELL_SUPPORT }, { "lanczos3", lanczos3_filter, LANCZOS3_SUPPORT }, { "blackman", blackman_filter, BLACKMAN_SUPPORT }, { "lanczos4", lanczos4_filter, LANCZOS4_SUPPORT }, - { "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, { "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, { "kaiser", kaiser_filter, KAISER_SUPPORT }, { "gaussian", gaussian_filter, GAUSSIAN_SUPPORT }, - { "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, { "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, { "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, { "quadratic_mix", quadratic_mix_filter, QUADRATIC_SUPPORT }, + { "box", box_filter, BOX_FILTER_SUPPORT }, + { "tent", tent_filter, TENT_FILTER_SUPPORT }, + { "bell", bell_filter, BELL_SUPPORT }, + { "b-spline", B_spline_filter, B_SPLINE_SUPPORT }, + { "mitchell", mitchell_filter, MITCHELL_SUPPORT }, + { "blackman", blackman_filter, BLACKMAN_SUPPORT }, + { "lanczos3", lanczos3_filter, LANCZOS3_SUPPORT }, + { "lanczos4", lanczos4_filter, LANCZOS4_SUPPORT }, + { "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, + { "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, + { "kaiser", kaiser_filter, KAISER_SUPPORT }, + { "gaussian", gaussian_filter, GAUSSIAN_SUPPORT }, + { "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, + { "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, + { "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, + { "quadratic_mix", 
quadratic_mix_filter, QUADRATIC_SUPPORT }, }; const int g_num_resample_filters = BASISU_ARRAY_SIZE(g_resample_filters); diff --git a/thirdparty/basis_universal/basisu_resampler.cpp b/thirdparty/basis_universal/encoder/basisu_resampler.cpp index e193ce83ff..e193ce83ff 100644 --- a/thirdparty/basis_universal/basisu_resampler.cpp +++ b/thirdparty/basis_universal/encoder/basisu_resampler.cpp diff --git a/thirdparty/basis_universal/basisu_resampler.h b/thirdparty/basis_universal/encoder/basisu_resampler.h index c3f2e05c25..dc0978caeb 100644 --- a/thirdparty/basis_universal/basisu_resampler.h +++ b/thirdparty/basis_universal/encoder/basisu_resampler.h @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" #define BASISU_RESAMPLER_DEBUG_OPS (0) #define BASISU_RESAMPLER_DEFAULT_FILTER "lanczos4" diff --git a/thirdparty/basis_universal/basisu_resampler_filters.h b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h index 5659c5fe86..0ebb51c334 100644 --- a/thirdparty/basis_universal/basisu_resampler_filters.h +++ b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h @@ -14,7 +14,7 @@ // limitations under the License. #pragma once -#include "transcoder/basisu.h" +#include "../transcoder/basisu.h" namespace basisu { diff --git a/thirdparty/basis_universal/basisu_ssim.cpp b/thirdparty/basis_universal/encoder/basisu_ssim.cpp index cceb400b88..cceb400b88 100644 --- a/thirdparty/basis_universal/basisu_ssim.cpp +++ b/thirdparty/basis_universal/encoder/basisu_ssim.cpp diff --git a/thirdparty/basis_universal/basisu_ssim.h b/thirdparty/basis_universal/encoder/basisu_ssim.h index 986ca3bbdf..986ca3bbdf 100644 --- a/thirdparty/basis_universal/basisu_ssim.h +++ b/thirdparty/basis_universal/encoder/basisu_ssim.h diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp new file mode 100644 index 0000000000..ca2b325693 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp @@ -0,0 +1,4189 @@ +// basisu_uastc_enc.cpp +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "basisu_uastc_enc.h" +#include "basisu_astc_decomp.h" +#include "basisu_gpu_texture.h" +#include "basisu_bc7enc.h" + +#ifdef _DEBUG +// When BASISU_VALIDATE_UASTC_ENC is 1, we pack and unpack to/from UASTC and ASTC, then validate that each codec returns the exact same results. This is slower. 
+#define BASISU_VALIDATE_UASTC_ENC 1 +#endif + +#define BASISU_SUPPORT_FORCE_MODE 0 + +using namespace basist; + +namespace basisu +{ + const uint32_t MAX_ENCODE_RESULTS = 512; + +#if BASISU_VALIDATE_UASTC_ENC + static void validate_func(bool condition, int line) + { + if (!condition) + { + fprintf(stderr, "basisu_uastc_enc: Internal validation failed on line %u!\n", line); + } + } + + #define VALIDATE(c) validate_func(c, __LINE__); +#else + #define VALIDATE(c) +#endif + + enum dxt_constants + { + cDXT1SelectorBits = 2U, cDXT1SelectorValues = 1U << cDXT1SelectorBits, cDXT1SelectorMask = cDXT1SelectorValues - 1U, + cDXT5SelectorBits = 3U, cDXT5SelectorValues = 1U << cDXT5SelectorBits, cDXT5SelectorMask = cDXT5SelectorValues - 1U, + }; + + struct dxt1_block + { + enum { cTotalEndpointBytes = 2, cTotalSelectorBytes = 4 }; + + uint8_t m_low_color[cTotalEndpointBytes]; + uint8_t m_high_color[cTotalEndpointBytes]; + uint8_t m_selectors[cTotalSelectorBytes]; + + inline void clear() { basisu::clear_obj(*this); } + + inline uint32_t get_high_color() const { return m_high_color[0] | (m_high_color[1] << 8U); } + inline uint32_t get_low_color() const { return m_low_color[0] | (m_low_color[1] << 8U); } + inline void set_low_color(uint16_t c) { m_low_color[0] = static_cast<uint8_t>(c & 0xFF); m_low_color[1] = static_cast<uint8_t>((c >> 8) & 0xFF); } + inline void set_high_color(uint16_t c) { m_high_color[0] = static_cast<uint8_t>(c & 0xFF); m_high_color[1] = static_cast<uint8_t>((c >> 8) & 0xFF); } + inline uint32_t get_selector(uint32_t x, uint32_t y) const { assert((x < 4U) && (y < 4U)); return (m_selectors[y] >> (x * cDXT1SelectorBits))& cDXT1SelectorMask; } + inline void set_selector(uint32_t x, uint32_t y, uint32_t val) { assert((x < 4U) && (y < 4U) && (val < 4U)); m_selectors[y] &= (~(cDXT1SelectorMask << (x * cDXT1SelectorBits))); m_selectors[y] |= (val << (x * cDXT1SelectorBits)); } + + static uint16_t pack_color(const color_rgba& color, bool scaled, uint32_t bias = 127U) + { + uint32_t r = color.r, g = color.g, b = color.b; + if (scaled) + { + r = (r * 31U + bias) / 255U; + g = (g * 63U + bias) / 255U; + b = (b * 31U + bias) / 255U; + } + return static_cast<uint16_t>(basisu::minimum(b, 31U) | (basisu::minimum(g, 63U) << 5U) | (basisu::minimum(r, 31U) << 11U)); + } + + static uint16_t pack_unscaled_color(uint32_t r, uint32_t g, uint32_t b) { return static_cast<uint16_t>(b | (g << 5U) | (r << 11U)); } + }; + +#define UASTC_WRITE_MODE_DESCS 0 + + static inline void uastc_write_bits(uint8_t* pBuf, uint32_t& bit_offset, uint64_t code, uint32_t codesize, const char* pDesc) + { + (void)pDesc; + +#if UASTC_WRITE_MODE_DESCS + if (pDesc) + printf("%s: %u %u\n", pDesc, bit_offset, codesize); +#endif + + assert((codesize == 64) || (code < (1ULL << codesize))); + + while (codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t bits_to_write = basisu::minimum<int>(codesize, 8 - byte_bit_offset); + + pBuf[bit_offset >> 3] |= (code << byte_bit_offset); + + code >>= bits_to_write; + codesize -= bits_to_write; + bit_offset += bits_to_write; + } + } + + void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1) + { + if ((g_uastc_mode_has_alpha[result.m_uastc_mode]) && (result.m_uastc_mode != UASTC_MODE_INDEX_SOLID_COLOR)) + { + assert(etc_eac_a8_blk.m_multiplier >= 1); + } + + uint8_t buf[32]; + memset(buf, 0, sizeof(buf)); + + uint32_t block_bit_offset = 0; + +#if 
UASTC_WRITE_MODE_DESCS + printf("**** Mode: %u\n", result.m_uastc_mode); +#endif + + uastc_write_bits(buf, block_bit_offset, g_uastc_mode_huff_codes[result.m_uastc_mode][0], g_uastc_mode_huff_codes[result.m_uastc_mode][1], "mode"); + + if (result.m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + uastc_write_bits(buf, block_bit_offset, result.m_solid_color.r, 8, "R"); + uastc_write_bits(buf, block_bit_offset, result.m_solid_color.g, 8, "G"); + uastc_write_bits(buf, block_bit_offset, result.m_solid_color.b, 8, "B"); + uastc_write_bits(buf, block_bit_offset, result.m_solid_color.a, 8, "A"); + + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_diff_bit(), 1, "ETC1D"); + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_inten_table(0), 3, "ETC1I"); + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_selector(0, 0), 2, "ETC1S"); + + uint32_t r, g, b; + if (etc1_blk.get_diff_bit()) + etc_block::unpack_color5(r, g, b, etc1_blk.get_base5_color(), false); + else + etc_block::unpack_color4(r, g, b, etc1_blk.get_base4_color(0), false); + + uastc_write_bits(buf, block_bit_offset, r, 5, "ETC1R"); + uastc_write_bits(buf, block_bit_offset, g, 5, "ETC1G"); + uastc_write_bits(buf, block_bit_offset, b, 5, "ETC1B"); + + memcpy(&blk, buf, sizeof(blk)); + return; + } + + if (g_uastc_mode_has_bc1_hint0[result.m_uastc_mode]) + uastc_write_bits(buf, block_bit_offset, bc1_hint0, 1, "BC1H0"); + else + { + assert(bc1_hint0 == false); + } + + if (g_uastc_mode_has_bc1_hint1[result.m_uastc_mode]) + uastc_write_bits(buf, block_bit_offset, bc1_hint1, 1, "BC1H1"); + else + { + assert(bc1_hint1 == false); + } + + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_flip_bit(), 1, "ETC1F"); + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_diff_bit(), 1, "ETC1D"); + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_inten_table(0), 3, "ETC1I0"); + uastc_write_bits(buf, block_bit_offset, etc1_blk.get_inten_table(1), 3, "ETC1I1"); + + if (g_uastc_mode_has_etc1_bias[result.m_uastc_mode]) + uastc_write_bits(buf, block_bit_offset, etc1_bias, 5, "ETC1BIAS"); + else + { + assert(etc1_bias == 0); + } + + if (g_uastc_mode_has_alpha[result.m_uastc_mode]) + { + const uint32_t etc2_hints = etc_eac_a8_blk.m_table | (etc_eac_a8_blk.m_multiplier << 4); + + assert(etc2_hints > 0 && etc2_hints <= 0xFF); + uastc_write_bits(buf, block_bit_offset, etc2_hints, 8, "ETC2TM"); + } + + uint32_t subsets = 1; + switch (result.m_uastc_mode) + { + case 2: + case 4: + case 7: + case 9: + case 16: + uastc_write_bits(buf, block_bit_offset, result.m_common_pattern, 5, "PAT"); + subsets = 2; + break; + case 3: + uastc_write_bits(buf, block_bit_offset, result.m_common_pattern, 4, "PAT"); + subsets = 3; + break; + default: + break; + } + +#ifdef _DEBUG + uint32_t part_seed = 0; + switch (result.m_uastc_mode) + { + case 2: + case 4: + case 9: + case 16: + part_seed = g_astc_bc7_common_partitions2[result.m_common_pattern].m_astc; + break; + case 3: + part_seed = g_astc_bc7_common_partitions3[result.m_common_pattern].m_astc; + break; + case 7: + part_seed = g_bc7_3_astc2_common_partitions[result.m_common_pattern].m_astc2; + break; + default: + break; + } +#endif + + uint32_t total_planes = 1; + switch (result.m_uastc_mode) + { + case 6: + case 11: + case 13: + uastc_write_bits(buf, block_bit_offset, result.m_astc.m_ccs, 2, "COMPSEL"); + total_planes = 2; + break; + case 17: + // CCS field is always 3 for dual plane LA. 
+ assert(result.m_astc.m_ccs == 3); + total_planes = 2; + break; + default: + break; + } + + uint8_t weights[32]; + memcpy(weights, result.m_astc.m_weights, 16 * total_planes); + + uint8_t endpoints[18]; + memcpy(endpoints, result.m_astc.m_endpoints, sizeof(endpoints)); + + const uint32_t total_comps = g_uastc_mode_comps[result.m_uastc_mode]; + + // LLAA + // LLAA LLAA + // LLAA LLAA LLAA + // RRGGBB + // RRGGBB RRGGBB + // RRGGBB RRGGBB RRGGBB + // RRGGBBAA + // RRGGBBAA RRGGBBAA + + const uint32_t weight_bits = g_uastc_mode_weight_bits[result.m_uastc_mode]; + + const uint8_t* pPartition_pattern; + const uint8_t* pSubset_anchor_indices = basist::get_anchor_indices(subsets, result.m_uastc_mode, result.m_common_pattern, pPartition_pattern); + + for (uint32_t plane_index = 0; plane_index < total_planes; plane_index++) + { + for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + { + const uint32_t anchor_index = pSubset_anchor_indices[subset_index]; + +#ifdef _DEBUG + if (subsets >= 2) + { + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t part_index = astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true); + if (part_index == subset_index) + { + assert(anchor_index == i); + break; + } + } + } + else + { + assert(!anchor_index); + } +#endif + + // Check anchor weight's MSB - if it's set then invert this subset's weights and swap the endpoints + if (weights[anchor_index * total_planes + plane_index] & (1 << (weight_bits - 1))) + { + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t part_index = pPartition_pattern[i]; + +#ifdef _DEBUG + if (subsets >= 2) + { + assert(part_index == (uint32_t)astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true)); + } + else + { + assert(!part_index); + } +#endif + + if (part_index == subset_index) + weights[i * total_planes + plane_index] = ((1 << weight_bits) - 1) - weights[i * total_planes + plane_index]; + } + + if (total_planes == 2) + { + for (int c = 0; c < (int)total_comps; c++) + { + const uint32_t comp_plane = (total_comps == 2) ? c : ((c == result.m_astc.m_ccs) ? 
1 : 0); + + if (comp_plane == plane_index) + std::swap(endpoints[c * 2 + 0], endpoints[c * 2 + 1]); + } + } + else + { + for (uint32_t c = 0; c < total_comps; c++) + std::swap(endpoints[subset_index * total_comps * 2 + c * 2 + 0], endpoints[subset_index * total_comps * 2 + c * 2 + 1]); + } + } + } // subset_index + } // plane_index + + const uint32_t total_values = total_comps * 2 * subsets; + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[result.m_uastc_mode]; + + uint32_t bit_values[18]; + uint32_t tq_values[8]; + uint32_t total_tq_values = 0; + uint32_t tq_accum = 0; + uint32_t tq_mul = 1; + + const uint32_t ep_bits = g_astc_bise_range_table[endpoint_range][0]; + const uint32_t ep_trits = g_astc_bise_range_table[endpoint_range][1]; + const uint32_t ep_quints = g_astc_bise_range_table[endpoint_range][2]; + + for (uint32_t i = 0; i < total_values; i++) + { + uint32_t val = endpoints[i]; + + uint32_t bits = val & ((1 << ep_bits) - 1); + uint32_t tq = val >> ep_bits; + + bit_values[i] = bits; + + if (ep_trits) + { + assert(tq < 3); + tq_accum += tq * tq_mul; + tq_mul *= 3; + if (tq_mul == 243) + { + tq_values[total_tq_values++] = tq_accum; + tq_accum = 0; + tq_mul = 1; + } + } + else if (ep_quints) + { + assert(tq < 5); + tq_accum += tq * tq_mul; + tq_mul *= 5; + if (tq_mul == 125) + { + tq_values[total_tq_values++] = tq_accum; + tq_accum = 0; + tq_mul = 1; + } + } + } + + uint32_t total_endpoint_bits = 0; + + for (uint32_t i = 0; i < total_tq_values; i++) + { + const uint32_t num_bits = ep_trits ? 8 : 7; + uastc_write_bits(buf, block_bit_offset, tq_values[i], num_bits, "ETQ"); + total_endpoint_bits += num_bits; + } + + if (tq_mul > 1) + { + uint32_t num_bits; + if (ep_trits) + { + if (tq_mul == 3) + num_bits = 2; + else if (tq_mul == 9) + num_bits = 4; + else if (tq_mul == 27) + num_bits = 5; + else //if (tq_mul == 81) + num_bits = 7; + } + else + { + if (tq_mul == 5) + num_bits = 3; + else //if (tq_mul == 25) + num_bits = 5; + } + uastc_write_bits(buf, block_bit_offset, tq_accum, num_bits, "ETQ"); + total_endpoint_bits += num_bits; + } + + for (uint32_t i = 0; i < total_values; i++) + { + uastc_write_bits(buf, block_bit_offset, bit_values[i], ep_bits, "EBITS"); + total_endpoint_bits += ep_bits; + } + +#if UASTC_WRITE_MODE_DESCS + uint32_t weight_start = block_bit_offset; +#endif + + uint32_t total_weight_bits = 0; + const uint32_t plane_shift = (total_planes == 2) ? 1 : 0; + for (uint32_t i = 0; i < 16 * total_planes; i++) + { + uint32_t numbits = weight_bits; + for (uint32_t s = 0; s < subsets; s++) + { + if (pSubset_anchor_indices[s] == (i >> plane_shift)) + { + numbits--; + break; + } + } + + uastc_write_bits(buf, block_bit_offset, weights[i], numbits, nullptr); + + total_weight_bits += numbits; + } + +#if UASTC_WRITE_MODE_DESCS + printf("WEIGHTS: %u %u\n", weight_start, total_weight_bits); +#endif + + assert(block_bit_offset <= 128); + memcpy(&blk, buf, sizeof(blk)); + +#if UASTC_WRITE_MODE_DESCS + printf("Total bits: %u, endpoint bits: %u, weight bits: %u\n", block_bit_offset, total_endpoint_bits, total_weight_bits); +#endif + } + + // MODE 0 + // 0. DualPlane: 0, WeightRange: 8 (16), Subsets: 1, CEM: 8 (RGB Direct ), EndpointRange: 19 (192) MODE6 RGB + // 18. 
DualPlane: 0, WeightRange: 11 (32), Subsets: 1, CEM: 8 (RGB Direct ), EndpointRange: 11 (32) MODE6 RGB + static void astc_mode0_or_18(uint32_t mode, const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, const uint8_t *pForce_selectors = nullptr) + { + const uint32_t endpoint_range = (mode == 18) ? 11 : 19; + const uint32_t weight_range = (mode == 18) ? 11 : 8; + + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, sizeof(ccell_params)); + + ccell_params.m_num_pixels = 16; + ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_num_selector_weights = (mode == 18) ? 32 : 16; + ccell_params.m_pSelector_weights = (mode == 18) ? g_astc_weights5 : g_astc_weights4; + ccell_params.m_pSelector_weightsx = (mode == 18) ? (const bc7enc_vec4F*)g_astc_weights5x : (const bc7enc_vec4F*)g_astc_weights4x; + ccell_params.m_astc_endpoint_range = endpoint_range; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + ccell_params.m_pForce_selectors = pForce_selectors; + + color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params); + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = weight_range;// (mode == 18) ? 11 : 8; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 1; + astc_results.m_partition_seed = 0; + astc_results.m_cem = 8; + + astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2]; + + bool invert = false; + + if (pForce_selectors == nullptr) + { + int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + invert = true; + } + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + if (invert) + astc_results.m_weights[x + y * 4] = ((mode == 18) ? 
31 : 15) - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = mode; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = part_err; + total_results++; + } + } + + // MODE 1 + // 1-subset, 2-bit indices, 8-bit endpoints, BC7 mode 3 + // DualPlane: 0, WeightRange: 2 (4), Subsets: 1, CEM: 8 (RGB Direct ), EndpointRange: 20 (256) MODE3 or MODE5 RGB + static void astc_mode1(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, sizeof(ccell_params)); + + ccell_params.m_num_pixels = 16; + ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_num_selector_weights = 4; + ccell_params.m_pSelector_weights = g_bc7_weights2; + ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params.m_astc_endpoint_range = 20; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + + color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params); + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = 2; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 1; + astc_results.m_partition_seed = 0; + astc_results.m_cem = 8; + + astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2]; + + const uint32_t range = 20; + + bool invert = false; + + int s0 = g_astc_unquant[range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + if (invert) + astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 1; + pResults[total_results].m_common_pattern = 0; + 
pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = part_err; + total_results++; + } + } + + static uint32_t estimate_partition2(uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeights, const color_rgba block[4][4], const uint32_t weights[4]) + { + assert(pWeights[0] == 0 && pWeights[num_weights - 1] == 64); + + uint64_t best_err = UINT64_MAX; + uint32_t best_common_pattern = 0; + + for (uint32_t common_pattern = 0; common_pattern < TOTAL_ASTC_BC7_COMMON_PARTITIONS2; common_pattern++) + { + const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7; + + const uint8_t* pPartition = &g_bc7_partition2[bc7_pattern * 16]; + + color_quad_u8 subset_colors[2][16]; + uint32_t subset_total_colors[2] = { 0, 0 }; + for (uint32_t index = 0; index < 16; index++) + subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index]; + + uint64_t total_subset_err = 0; + for (uint32_t subset = 0; (subset < 2) && (total_subset_err < best_err); subset++) + total_subset_err += color_cell_compression_est_astc(num_weights, num_comps, pWeights, subset_total_colors[subset], &subset_colors[subset][0], best_err, weights); + + if (total_subset_err < best_err) + { + best_err = total_subset_err; + best_common_pattern = common_pattern; + } + } + + return best_common_pattern; + } + + // MODE 2 + // 2-subset, 3-bit indices, 4-bit endpoints, BC7 mode 1 + // DualPlane: 0, WeightRange: 5 (8), Subsets: 2, CEM: 8 (RGB Direct ), EndpointRange: 8 (16) MODE1 + static void astc_mode2(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition) + { + uint32_t first_common_pattern = 0; + uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS2; + + if (estimate_partition) + { + const uint32_t weights[4] = { 1, 1, 1, 1 }; + first_common_pattern = estimate_partition2(8, 3, g_bc7_weights3, block, weights); + last_common_pattern = first_common_pattern + 1; + } + + for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++) + { + const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7; + + color_rgba part_pixels[2][16]; + uint32_t part_pixel_index[4][4]; + uint32_t num_part_pixels[2] = { 0, 0 }; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t part = g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + part_pixel_index[y][x] = num_part_pixels[part]; + part_pixels[part][num_part_pixels[part]++] = block[y][x]; + } + } + + color_cell_compressor_params ccell_params[2]; + color_cell_compressor_results ccell_results[2]; + uint8_t ccell_result_selectors[2][16]; + uint8_t ccell_result_selectors_temp[2][16]; + + uint64_t total_part_err = 0; + for (uint32_t part = 0; part < 2; part++) + { + memset(&ccell_params[part], 0, sizeof(ccell_params[part])); + + ccell_params[part].m_num_pixels = num_part_pixels[part]; + ccell_params[part].m_pPixels = (color_quad_u8*)&part_pixels[part][0]; + ccell_params[part].m_num_selector_weights = 8; + ccell_params[part].m_pSelector_weights = g_bc7_weights3; + ccell_params[part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights3x; + ccell_params[part].m_astc_endpoint_range = 8; + ccell_params[part].m_weights[0] = 1; + ccell_params[part].m_weights[1] = 1; + ccell_params[part].m_weights[2] = 1; + ccell_params[part].m_weights[3] = 1; + + memset(&ccell_results[part], 0, 
sizeof(ccell_results[part])); + ccell_results[part].m_pSelectors = &ccell_result_selectors[part][0]; + ccell_results[part].m_pSelectors_temp = &ccell_result_selectors_temp[part][0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params[part], &ccell_results[part], &comp_params); + total_part_err += part_err; + } // part + + { + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = 5; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 2; + astc_results.m_partition_seed = g_astc_bc7_common_partitions2[common_pattern].m_astc; + astc_results.m_cem = 8; + + uint32_t p0 = 0; + uint32_t p1 = 1; + if (g_astc_bc7_common_partitions2[common_pattern].m_invert) + std::swap(p0, p1); + + astc_results.m_endpoints[0] = ccell_results[p0].m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results[p0].m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results[p0].m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results[p0].m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results[p0].m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results[p0].m_astc_high_endpoint.m_c[2]; + + const uint32_t range = 8; + + bool invert[2] = { false, false }; + + int s0 = g_astc_unquant[range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + invert[0] = true; + } + + astc_results.m_endpoints[6] = ccell_results[p1].m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[7] = ccell_results[p1].m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[8] = ccell_results[p1].m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[9] = ccell_results[p1].m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[10] = ccell_results[p1].m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[11] = ccell_results[p1].m_astc_high_endpoint.m_c[2]; + + s0 = g_astc_unquant[range][astc_results.m_endpoints[0 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[2 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[4 + 6]].m_unquant; + s1 = g_astc_unquant[range][astc_results.m_endpoints[1 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[3 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[5 + 6]].m_unquant; + + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0 + 6], astc_results.m_endpoints[1 + 6]); + std::swap(astc_results.m_endpoints[2 + 6], astc_results.m_endpoints[3 + 6]); + std::swap(astc_results.m_endpoints[4 + 6], astc_results.m_endpoints[5 + 6]); + invert[1] = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t bc7_part = g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + + astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]]; + + uint32_t astc_part = bc7_part; + if (g_astc_bc7_common_partitions2[common_pattern].m_invert) + astc_part = 1 - astc_part; + 
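+ // If this subset's endpoints were swapped above, mirror its 3-bit weight as well; ASTC weight unquantization is symmetric, so the decoded texel is unchanged.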
+ if (invert[astc_part]) + astc_results.m_weights[x + y * 4] = 7 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 2; + pResults[total_results].m_common_pattern = common_pattern; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = total_part_err; + total_results++; + } + } + + } // common_pattern + } + + // MODE 3 + // 3-subsets, 2-bit indices, [0,11] endpoints, BC7 mode 2 + // DualPlane: 0, WeightRange: 2 (4), Subsets: 3, CEM: 8 (RGB Direct ), EndpointRange: 7 (12) MODE2 + static void astc_mode3(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition) + { + uint32_t first_common_pattern = 0; + uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS3; + + if (estimate_partition) + { + uint64_t best_err = UINT64_MAX; + uint32_t best_common_pattern = 0; + const uint32_t weights[4] = { 1, 1, 1, 1 }; + + for (uint32_t common_pattern = 0; common_pattern < TOTAL_ASTC_BC7_COMMON_PARTITIONS3; common_pattern++) + { + const uint32_t bc7_pattern = g_astc_bc7_common_partitions3[common_pattern].m_bc7; + + const uint8_t* pPartition = &g_bc7_partition3[bc7_pattern * 16]; + + color_quad_u8 subset_colors[3][16]; + uint32_t subset_total_colors[3] = { 0, 0 }; + for (uint32_t index = 0; index < 16; index++) + subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index]; + + uint64_t total_subset_err = 0; + for (uint32_t subset = 0; (subset < 3) && (total_subset_err < best_err); subset++) + total_subset_err += color_cell_compression_est_astc(4, 3, g_bc7_weights2, subset_total_colors[subset], &subset_colors[subset][0], best_err, weights); + + if (total_subset_err < best_err) + { + best_err = total_subset_err; + best_common_pattern = common_pattern; + } + } + + first_common_pattern = best_common_pattern; + last_common_pattern = best_common_pattern + 1; + } + + for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++) + { + const uint32_t endpoint_range = 7; + + const uint32_t bc7_pattern = g_astc_bc7_common_partitions3[common_pattern].m_bc7; + + color_rgba part_pixels[3][16]; + uint32_t part_pixel_index[4][4]; + uint32_t num_part_pixels[3] = { 0, 0, 0 }; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t bc7_part = g_bc7_partition3[16 * bc7_pattern + x + y * 4]; + part_pixel_index[y][x] = num_part_pixels[bc7_part]; + part_pixels[bc7_part][num_part_pixels[bc7_part]++] = block[y][x]; + } + } + + color_cell_compressor_params ccell_params[3]; + color_cell_compressor_results ccell_results[3]; + uint8_t ccell_result_selectors[3][16]; + uint8_t ccell_result_selectors_temp[3][16]; + + uint64_t total_part_err = 0; + for (uint32_t bc7_part = 0; bc7_part < 3; bc7_part++) + { + memset(&ccell_params[bc7_part], 0, sizeof(ccell_params[bc7_part])); + + ccell_params[bc7_part].m_num_pixels = num_part_pixels[bc7_part]; + ccell_params[bc7_part].m_pPixels = (color_quad_u8*)&part_pixels[bc7_part][0]; + ccell_params[bc7_part].m_num_selector_weights = 4; + ccell_params[bc7_part].m_pSelector_weights = g_bc7_weights2; + ccell_params[bc7_part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params[bc7_part].m_astc_endpoint_range = endpoint_range; + ccell_params[bc7_part].m_weights[0] = 1; + 
ccell_params[bc7_part].m_weights[1] = 1; + ccell_params[bc7_part].m_weights[2] = 1; + ccell_params[bc7_part].m_weights[3] = 1; + + memset(&ccell_results[bc7_part], 0, sizeof(ccell_results[bc7_part])); + ccell_results[bc7_part].m_pSelectors = &ccell_result_selectors[bc7_part][0]; + ccell_results[bc7_part].m_pSelectors_temp = &ccell_result_selectors_temp[bc7_part][0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params[bc7_part], &ccell_results[bc7_part], &comp_params); + total_part_err += part_err; + } // part + + { + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = 2; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 3; + astc_results.m_partition_seed = g_astc_bc7_common_partitions3[common_pattern].m_astc; + astc_results.m_cem = 8; + + uint32_t astc_to_bc7_part[3]; // converts ASTC to BC7 partition index + const uint32_t perm = g_astc_bc7_common_partitions3[common_pattern].m_astc_to_bc7_perm; + astc_to_bc7_part[0] = g_astc_to_bc7_partition_index_perm_tables[perm][0]; + astc_to_bc7_part[1] = g_astc_to_bc7_partition_index_perm_tables[perm][1]; + astc_to_bc7_part[2] = g_astc_to_bc7_partition_index_perm_tables[perm][2]; + + bool invert_astc_part[3] = { false, false, false }; + + for (uint32_t astc_part = 0; astc_part < 3; astc_part++) + { + uint8_t* pEndpoints = &astc_results.m_endpoints[6 * astc_part]; + + pEndpoints[0] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_low_endpoint.m_c[0]; + pEndpoints[1] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_high_endpoint.m_c[0]; + pEndpoints[2] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_low_endpoint.m_c[1]; + pEndpoints[3] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_high_endpoint.m_c[1]; + pEndpoints[4] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_low_endpoint.m_c[2]; + pEndpoints[5] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_high_endpoint.m_c[2]; + + int s0 = g_astc_unquant[endpoint_range][pEndpoints[0]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[2]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][pEndpoints[1]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[3]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(pEndpoints[0], pEndpoints[1]); + std::swap(pEndpoints[2], pEndpoints[3]); + std::swap(pEndpoints[4], pEndpoints[5]); + invert_astc_part[astc_part] = true; + } + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t bc7_part = g_bc7_partition3[16 * bc7_pattern + x + y * 4]; + + astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]]; + + uint32_t astc_part = 0; + for (uint32_t i = 0; i < 3; i++) + { + if (astc_to_bc7_part[i] == bc7_part) + { + astc_part = i; + break; + } + } + + if (invert_astc_part[astc_part]) + astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 3; + pResults[total_results].m_common_pattern = common_pattern; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = total_part_err; + total_results++; + } + + } + + } // common_pattern + } + + // MODE 4 + // DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 8 (RGB Direct ), EndpointRange: 12 (40) MODE3 + 
static void astc_mode4(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition) + { + //const uint32_t weight_range = 2; + const uint32_t endpoint_range = 12; + + uint32_t first_common_pattern = 0; + uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS2; + + if (estimate_partition) + { + const uint32_t weights[4] = { 1, 1, 1, 1 }; + first_common_pattern = estimate_partition2(4, 3, g_bc7_weights2, block, weights); + last_common_pattern = first_common_pattern + 1; + } + + for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++) + { + const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7; + + color_rgba part_pixels[2][16]; + uint32_t part_pixel_index[4][4]; + uint32_t num_part_pixels[2] = { 0, 0 }; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t part = g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + part_pixel_index[y][x] = num_part_pixels[part]; + part_pixels[part][num_part_pixels[part]++] = block[y][x]; + } + } + + color_cell_compressor_params ccell_params[2]; + color_cell_compressor_results ccell_results[2]; + uint8_t ccell_result_selectors[2][16]; + uint8_t ccell_result_selectors_temp[2][16]; + + uint64_t total_part_err = 0; + for (uint32_t part = 0; part < 2; part++) + { + memset(&ccell_params[part], 0, sizeof(ccell_params[part])); + + ccell_params[part].m_num_pixels = num_part_pixels[part]; + ccell_params[part].m_pPixels = (color_quad_u8*)&part_pixels[part][0]; + ccell_params[part].m_num_selector_weights = 4; + ccell_params[part].m_pSelector_weights = g_bc7_weights2; + ccell_params[part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params[part].m_astc_endpoint_range = endpoint_range; + ccell_params[part].m_weights[0] = 1; + ccell_params[part].m_weights[1] = 1; + ccell_params[part].m_weights[2] = 1; + ccell_params[part].m_weights[3] = 1; + + memset(&ccell_results[part], 0, sizeof(ccell_results[part])); + ccell_results[part].m_pSelectors = &ccell_result_selectors[part][0]; + ccell_results[part].m_pSelectors_temp = &ccell_result_selectors_temp[part][0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params[part], &ccell_results[part], &comp_params); + total_part_err += part_err; + } // part + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = 2; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 2; + astc_results.m_partition_seed = g_astc_bc7_common_partitions2[common_pattern].m_astc; + astc_results.m_cem = 8; + + uint32_t p0 = 0; + uint32_t p1 = 1; + if (g_astc_bc7_common_partitions2[common_pattern].m_invert) + std::swap(p0, p1); + + astc_results.m_endpoints[0] = ccell_results[p0].m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results[p0].m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results[p0].m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results[p0].m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results[p0].m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results[p0].m_astc_high_endpoint.m_c[2]; + + bool invert[2] = { false, false }; + + int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + 
g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + invert[0] = true; + } + + astc_results.m_endpoints[6] = ccell_results[p1].m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[7] = ccell_results[p1].m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[8] = ccell_results[p1].m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[9] = ccell_results[p1].m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[10] = ccell_results[p1].m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[11] = ccell_results[p1].m_astc_high_endpoint.m_c[2]; + + s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4 + 6]].m_unquant; + s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5 + 6]].m_unquant; + + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0 + 6], astc_results.m_endpoints[1 + 6]); + std::swap(astc_results.m_endpoints[2 + 6], astc_results.m_endpoints[3 + 6]); + std::swap(astc_results.m_endpoints[4 + 6], astc_results.m_endpoints[5 + 6]); + invert[1] = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t bc7_part = g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + + astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]]; + + uint32_t astc_part = bc7_part; + if (g_astc_bc7_common_partitions2[common_pattern].m_invert) + astc_part = 1 - astc_part; + + if (invert[astc_part]) + astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 4; + pResults[total_results].m_common_pattern = common_pattern; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = total_part_err; + total_results++; + } + + } // common_pattern + } + + // MODE 5 + // DualPlane: 0, WeightRange: 5 (8), Subsets: 1, CEM: 8 (RGB Direct ), EndpointRange: 20 (256) BC7 MODE 6 (or MODE 1 1-subset) + static void astc_mode5(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + const uint32_t weight_range = 5; + const uint32_t endpoint_range = 20; + + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, sizeof(ccell_params)); + + ccell_params.m_num_pixels = 16; + ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_num_selector_weights = 8; + ccell_params.m_pSelector_weights = g_bc7_weights3; + ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights3x; + ccell_params.m_astc_endpoint_range = endpoint_range; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + + 
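+ // Compress the whole 4x4 block as a single subset: endpoints quantized to ASTC range 20 (256 levels), 16 texel weights chosen from the 8-entry (3-bit) BC7 weight table.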
color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params); + + // ASTC + astc_block_desc blk; + memset(&blk, 0, sizeof(blk)); + + blk.m_dual_plane = false; + blk.m_weight_range = weight_range; + + blk.m_ccs = 0; + blk.m_subsets = 1; + blk.m_partition_seed = 0; + blk.m_cem = 8; + + blk.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + blk.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1]; + blk.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1]; + blk.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2]; + blk.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2]; + + bool invert = false; + + int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(blk.m_endpoints[0], blk.m_endpoints[1]); + std::swap(blk.m_endpoints[2], blk.m_endpoints[3]); + std::swap(blk.m_endpoints[4], blk.m_endpoints[5]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + blk.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + if (invert) + blk.m_weights[x + y * 4] = 7 - blk.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 5; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = blk; + pResults[total_results].m_astc_err = part_err; + total_results++; + } + } + + // MODE 6 + // DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 8 (RGB Direct ), EndpointRange: 18 (160) BC7 MODE5 + static void astc_mode6(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + for (uint32_t rot_comp = 0; rot_comp < 3; rot_comp++) + { + const uint32_t weight_range = 2; + const uint32_t endpoint_range = 18; + + color_quad_u8 block_rgb[16]; + color_quad_u8 block_a[16]; + for (uint32_t i = 0; i < 16; i++) + { + block_rgb[i] = ((color_quad_u8*)&block[0][0])[i]; + block_a[i] = block_rgb[i]; + + uint8_t c = block_a[i].m_c[rot_comp]; + block_a[i].m_c[0] = c; + block_a[i].m_c[1] = c; + block_a[i].m_c[2] = c; + block_a[i].m_c[3] = 255; + + block_rgb[i].m_c[rot_comp] = 255; + } + + uint8_t ccell_result_selectors_temp[16]; + + color_cell_compressor_params ccell_params_rgb; + memset(&ccell_params_rgb, 0, sizeof(ccell_params_rgb)); + + ccell_params_rgb.m_num_pixels = 16; + ccell_params_rgb.m_pPixels = block_rgb; + ccell_params_rgb.m_num_selector_weights = 4; + ccell_params_rgb.m_pSelector_weights = g_bc7_weights2; + ccell_params_rgb.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params_rgb.m_astc_endpoint_range = endpoint_range; + ccell_params_rgb.m_weights[0] = 1; + ccell_params_rgb.m_weights[1] = 1; + ccell_params_rgb.m_weights[2] = 1; + 
ccell_params_rgb.m_weights[3] = 1; + + color_cell_compressor_results ccell_results_rgb; + uint8_t ccell_result_selectors_rgb[16]; + memset(&ccell_results_rgb, 0, sizeof(ccell_results_rgb)); + ccell_results_rgb.m_pSelectors = &ccell_result_selectors_rgb[0]; + ccell_results_rgb.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err_rgb = color_cell_compression(255, &ccell_params_rgb, &ccell_results_rgb, &comp_params); + + color_cell_compressor_params ccell_params_a; + memset(&ccell_params_a, 0, sizeof(ccell_params_a)); + + ccell_params_a.m_num_pixels = 16; + ccell_params_a.m_pPixels = block_a; + ccell_params_a.m_num_selector_weights = 4; + ccell_params_a.m_pSelector_weights = g_bc7_weights2; + ccell_params_a.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params_a.m_astc_endpoint_range = endpoint_range; + ccell_params_a.m_weights[0] = 1; + ccell_params_a.m_weights[1] = 1; + ccell_params_a.m_weights[2] = 1; + ccell_params_a.m_weights[3] = 1; + + color_cell_compressor_results ccell_results_a; + uint8_t ccell_result_selectors_a[16]; + memset(&ccell_results_a, 0, sizeof(ccell_results_a)); + ccell_results_a.m_pSelectors = &ccell_result_selectors_a[0]; + ccell_results_a.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err_a = color_cell_compression(255, &ccell_params_a, &ccell_results_a, &comp_params) / 3; + + uint64_t total_err = part_err_rgb + part_err_a; + + // ASTC + astc_block_desc blk; + memset(&blk, 0, sizeof(blk)); + + blk.m_dual_plane = true; + blk.m_weight_range = weight_range; + + blk.m_ccs = rot_comp; + blk.m_subsets = 1; + blk.m_partition_seed = 0; + blk.m_cem = 8; + + blk.m_endpoints[0] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[1] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[0]; + blk.m_endpoints[2] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[1]; + blk.m_endpoints[3] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[1]; + blk.m_endpoints[4] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[2]; + blk.m_endpoints[5] = (rot_comp == 2 ? 
ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[2]; + + bool invert = false; + + int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(blk.m_endpoints[0], blk.m_endpoints[1]); + std::swap(blk.m_endpoints[2], blk.m_endpoints[3]); + std::swap(blk.m_endpoints[4], blk.m_endpoints[5]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t rgb_index = ccell_result_selectors_rgb[x + y * 4]; + uint32_t a_index = ccell_result_selectors_a[x + y * 4]; + + if (invert) + { + rgb_index = 3 - rgb_index; + a_index = 3 - a_index; + } + + blk.m_weights[(x + y * 4) * 2 + 0] = (uint8_t)rgb_index; + blk.m_weights[(x + y * 4) * 2 + 1] = (uint8_t)a_index; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 6; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = blk; + pResults[total_results].m_astc_err = total_err; + total_results++; + } + } // rot_comp + } + + // MODE 7 - 2 subset ASTC, 3 subset BC7 + // DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 8 (RGB Direct ), EndpointRange: 12 (40) MODE2 + static void astc_mode7(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition) + { + uint32_t first_common_pattern = 0; + uint32_t last_common_pattern = TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS; + + if (estimate_partition) + { + uint64_t best_err = UINT64_MAX; + uint32_t best_common_pattern = 0; + const uint32_t weights[4] = { 1, 1, 1, 1 }; + + for (uint32_t common_pattern = 0; common_pattern < TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS; common_pattern++) + { + const uint8_t* pPartition = &g_bc7_3_astc2_patterns2[common_pattern][0]; + +#ifdef _DEBUG + const uint32_t astc_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_astc2; + const uint32_t bc7_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_bc73; + const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[common_pattern].k; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t astc_part = bc7_convert_partition_index_3_to_2(g_bc7_partition3[16 * bc7_pattern + x + y * 4], common_pattern_k); + assert((int)astc_part == astc_compute_texel_partition(astc_pattern, x, y, 0, 2, true)); + assert(astc_part == pPartition[x + y * 4]); + } + } +#endif + + color_quad_u8 subset_colors[2][16]; + uint32_t subset_total_colors[2] = { 0, 0 }; + for (uint32_t index = 0; index < 16; index++) + subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index]; + + uint64_t total_subset_err = 0; + for (uint32_t subset = 0; (subset < 2) && (total_subset_err < best_err); subset++) + total_subset_err += color_cell_compression_est_astc(4, 3, g_bc7_weights2, subset_total_colors[subset], &subset_colors[subset][0], best_err, weights); + + if (total_subset_err < best_err) + { + best_err = total_subset_err; + best_common_pattern = common_pattern; + } + } + + first_common_pattern = best_common_pattern; + last_common_pattern = 
best_common_pattern + 1; + } + + //const uint32_t weight_range = 2; + const uint32_t endpoint_range = 12; + + for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++) + { + const uint32_t astc_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_astc2; + const uint32_t bc7_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_bc73; + const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[common_pattern].k; + + color_rgba part_pixels[2][16]; + uint32_t part_pixel_index[4][4]; + uint32_t num_part_pixels[2] = { 0, 0 }; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t astc_part = bc7_convert_partition_index_3_to_2(g_bc7_partition3[16 * bc7_pattern + x + y * 4], common_pattern_k); +#ifdef _DEBUG + assert((int)astc_part == astc_compute_texel_partition(astc_pattern, x, y, 0, 2, true)); +#endif + + part_pixel_index[y][x] = num_part_pixels[astc_part]; + part_pixels[astc_part][num_part_pixels[astc_part]++] = block[y][x]; + } + } + + color_cell_compressor_params ccell_params[2]; + color_cell_compressor_results ccell_results[2]; + uint8_t ccell_result_selectors[2][16]; + uint8_t ccell_result_selectors_temp[2][16]; + + uint64_t total_part_err = 0; + for (uint32_t part = 0; part < 2; part++) + { + memset(&ccell_params[part], 0, sizeof(ccell_params[part])); + + ccell_params[part].m_num_pixels = num_part_pixels[part]; + ccell_params[part].m_pPixels = (color_quad_u8*)&part_pixels[part][0]; + ccell_params[part].m_num_selector_weights = 4; + ccell_params[part].m_pSelector_weights = g_bc7_weights2; + ccell_params[part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params[part].m_astc_endpoint_range = endpoint_range; + ccell_params[part].m_weights[0] = 1; + ccell_params[part].m_weights[1] = 1; + ccell_params[part].m_weights[2] = 1; + ccell_params[part].m_weights[3] = 1; + + memset(&ccell_results[part], 0, sizeof(ccell_results[part])); + ccell_results[part].m_pSelectors = &ccell_result_selectors[part][0]; + ccell_results[part].m_pSelectors_temp = &ccell_result_selectors_temp[part][0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params[part], &ccell_results[part], &comp_params); + total_part_err += part_err; + } // part + + // ASTC + astc_block_desc blk; + memset(&blk, 0, sizeof(blk)); + + blk.m_dual_plane = false; + blk.m_weight_range = 2; + + blk.m_ccs = 0; + blk.m_subsets = 2; + blk.m_partition_seed = astc_pattern; + blk.m_cem = 8; + + const uint32_t p0 = 0; + const uint32_t p1 = 1; + + blk.m_endpoints[0] = ccell_results[p0].m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[1] = ccell_results[p0].m_astc_high_endpoint.m_c[0]; + blk.m_endpoints[2] = ccell_results[p0].m_astc_low_endpoint.m_c[1]; + blk.m_endpoints[3] = ccell_results[p0].m_astc_high_endpoint.m_c[1]; + blk.m_endpoints[4] = ccell_results[p0].m_astc_low_endpoint.m_c[2]; + blk.m_endpoints[5] = ccell_results[p0].m_astc_high_endpoint.m_c[2]; + + bool invert[2] = { false, false }; + + int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(blk.m_endpoints[0], blk.m_endpoints[1]); + std::swap(blk.m_endpoints[2], blk.m_endpoints[3]); + 
std::swap(blk.m_endpoints[4], blk.m_endpoints[5]); + invert[0] = true; + } + + blk.m_endpoints[6] = ccell_results[p1].m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[7] = ccell_results[p1].m_astc_high_endpoint.m_c[0]; + blk.m_endpoints[8] = ccell_results[p1].m_astc_low_endpoint.m_c[1]; + blk.m_endpoints[9] = ccell_results[p1].m_astc_high_endpoint.m_c[1]; + blk.m_endpoints[10] = ccell_results[p1].m_astc_low_endpoint.m_c[2]; + blk.m_endpoints[11] = ccell_results[p1].m_astc_high_endpoint.m_c[2]; + + s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4 + 6]].m_unquant; + s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5 + 6]].m_unquant; + + if (s1 < s0) + { + std::swap(blk.m_endpoints[0 + 6], blk.m_endpoints[1 + 6]); + std::swap(blk.m_endpoints[2 + 6], blk.m_endpoints[3 + 6]); + std::swap(blk.m_endpoints[4 + 6], blk.m_endpoints[5 + 6]); + invert[1] = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t astc_part = bc7_convert_partition_index_3_to_2(g_bc7_partition3[16 * bc7_pattern + x + y * 4], common_pattern_k); + + blk.m_weights[x + y * 4] = ccell_result_selectors[astc_part][part_pixel_index[y][x]]; + + if (invert[astc_part]) + blk.m_weights[x + y * 4] = 3 - blk.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 7; + pResults[total_results].m_common_pattern = common_pattern; + pResults[total_results].m_astc = blk; + pResults[total_results].m_astc_err = total_part_err; + total_results++; + } + + } // common_pattern + } + + static void estimate_partition2_list(uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeights, const color_rgba block[4][4], uint32_t* pParts, uint32_t max_parts, const uint32_t weights[4]) + { + assert(pWeights[0] == 0 && pWeights[num_weights - 1] == 64); + + const uint32_t MAX_PARTS = 8; + assert(max_parts <= MAX_PARTS); + + uint64_t part_error[MAX_PARTS]; + memset(part_error, 0xFF, sizeof(part_error)); + memset(pParts, 0, sizeof(pParts[0]) * max_parts); + + for (uint32_t common_pattern = 0; common_pattern < TOTAL_ASTC_BC7_COMMON_PARTITIONS2; common_pattern++) + { + const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7; + + const uint8_t* pPartition = &g_bc7_partition2[bc7_pattern * 16]; + + color_quad_u8 subset_colors[2][16]; + uint32_t subset_total_colors[2] = { 0, 0 }; + for (uint32_t index = 0; index < 16; index++) + subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index]; + + uint64_t total_subset_err = 0; + for (uint32_t subset = 0; subset < 2; subset++) + total_subset_err += color_cell_compression_est_astc(num_weights, num_comps, pWeights, subset_total_colors[subset], &subset_colors[subset][0], UINT64_MAX, weights); + + for (int i = 0; i < (int)max_parts; i++) + { + if (total_subset_err < part_error[i]) + { + for (int j = max_parts - 1; j > i; --j) + { + pParts[j] = pParts[j - 1]; + part_error[j] = part_error[j - 1]; + } + + pParts[i] = common_pattern; + part_error[i] = total_subset_err; + + break; + } + } + } + +#ifdef _DEBUG + for (uint32_t i = 0; i < max_parts - 1; i++) + { + assert(part_error[i] <= part_error[i + 1]); + } +#endif + } + 
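+ // estimate_partition2_list keeps an insertion-sorted list of the max_parts lowest-error common
+ // 2-subset partitions (scored with color_cell_compression_est_astc), so callers such as
+ // astc_mode9_or_16 below only run the full cell compressor on a handful of candidate patterns.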
+ // 9. DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 12 (RGBA Direct), EndpointRange: 8 (16) - BC7 MODE 7 + // 16. DualPlane: 0, WeightRange : 2 (4), Subsets : 2, CEM: 4 (LA Direct), EndpointRange : 20 (256) - BC7 MODE 7 + static void astc_mode9_or_16(uint32_t mode, const color_rgba source_block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, uint32_t estimate_partition_list_size) + { + assert(mode == 9 || mode == 16); + + const color_rgba* pBlock = &source_block[0][0]; + + color_rgba temp_block[16]; + if (mode == 16) + { + for (uint32_t i = 0; i < 16; i++) + { + if (mode == 16) + { + assert(pBlock[i].r == pBlock[i].g); + assert(pBlock[i].r == pBlock[i].b); + } + + const uint32_t l = pBlock[i].r; + const uint32_t a = pBlock[i].a; + + // Use (l,0,0,a) not (l,l,l,a) so both components are treated equally. + temp_block[i].set_noclamp_rgba(l, 0, 0, a); + } + + pBlock = temp_block; + } + + const uint32_t weights[4] = { 1, 1, 1, 1 }; + + //const uint32_t weight_range = 2; + const uint32_t endpoint_range = (mode == 16) ? 20 : 8; + + uint32_t first_common_pattern = 0; + uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS2; + bool use_part_list = false; + + const uint32_t MAX_PARTS = 8; + uint32_t parts[MAX_PARTS]; + + if (estimate_partition_list_size == 1) + { + first_common_pattern = estimate_partition2(4, 4, g_bc7_weights2, (const color_rgba(*)[4])pBlock, weights); + last_common_pattern = first_common_pattern + 1; + } + else if (estimate_partition_list_size > 0) + { + assert(estimate_partition_list_size <= MAX_PARTS); + estimate_partition_list_size = basisu::minimum(estimate_partition_list_size, MAX_PARTS); + + estimate_partition2_list(4, 4, g_bc7_weights2, (const color_rgba(*)[4])pBlock, parts, estimate_partition_list_size, weights); + + first_common_pattern = 0; + last_common_pattern = estimate_partition_list_size; + use_part_list = true; + +#ifdef _DEBUG + assert(parts[0] == estimate_partition2(4, 4, g_bc7_weights2, (const color_rgba(*)[4])pBlock, weights)); +#endif + } + + for (uint32_t common_pattern_iter = first_common_pattern; common_pattern_iter < last_common_pattern; common_pattern_iter++) + { + const uint32_t common_pattern = use_part_list ? 
parts[common_pattern_iter] : common_pattern_iter; + + const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7; + + color_rgba part_pixels[2][16]; + uint32_t part_pixel_index[4][4]; + uint32_t num_part_pixels[2] = { 0, 0 }; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t part = g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + part_pixel_index[y][x] = num_part_pixels[part]; + part_pixels[part][num_part_pixels[part]++] = pBlock[y * 4 + x]; + } + } + + color_cell_compressor_params ccell_params[2]; + color_cell_compressor_results ccell_results[2]; + uint8_t ccell_result_selectors[2][16]; + uint8_t ccell_result_selectors_temp[2][16]; + + uint64_t total_err = 0; + for (uint32_t subset = 0; subset < 2; subset++) + { + memset(&ccell_params[subset], 0, sizeof(ccell_params[subset])); + + ccell_params[subset].m_num_pixels = num_part_pixels[subset]; + ccell_params[subset].m_pPixels = (color_quad_u8*)&part_pixels[subset][0]; + ccell_params[subset].m_num_selector_weights = 4; + ccell_params[subset].m_pSelector_weights = g_bc7_weights2; + ccell_params[subset].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params[subset].m_astc_endpoint_range = endpoint_range; + ccell_params[subset].m_weights[0] = weights[0]; + ccell_params[subset].m_weights[1] = weights[1]; + ccell_params[subset].m_weights[2] = weights[2]; + ccell_params[subset].m_weights[3] = weights[3]; + ccell_params[subset].m_has_alpha = true; + + memset(&ccell_results[subset], 0, sizeof(ccell_results[subset])); + ccell_results[subset].m_pSelectors = &ccell_result_selectors[subset][0]; + ccell_results[subset].m_pSelectors_temp = &ccell_result_selectors_temp[subset][0]; + + uint64_t subset_err = color_cell_compression(255, &ccell_params[subset], &ccell_results[subset], &comp_params); + + if (mode == 16) + { + color_rgba colors[4]; + for (uint32_t c = 0; c < 4; c++) + { + colors[0].m_comps[c] = g_astc_unquant[endpoint_range][ccell_results[subset].m_astc_low_endpoint.m_c[(c < 3) ? 0 : 3]].m_unquant; + colors[3].m_comps[c] = g_astc_unquant[endpoint_range][ccell_results[subset].m_astc_high_endpoint.m_c[(c < 3) ? 0 : 3]].m_unquant; + } + + for (uint32_t i = 1; i < 4 - 1; i++) + for (uint32_t c = 0; c < 4; c++) + colors[i].m_comps[c] = (uint8_t)astc_interpolate(colors[0].m_comps[c], colors[3].m_comps[c], g_bc7_weights2[i], false); + + for (uint32_t p = 0; p < ccell_params[subset].m_num_pixels; p++) + { + color_rgba orig_pix(part_pixels[subset][p]); + orig_pix.g = orig_pix.r; + orig_pix.b = orig_pix.r; + total_err += color_distance_la(orig_pix, colors[ccell_result_selectors[subset][p]]); + } + } + else + { + total_err += subset_err; + } + } // subset + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = 2; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 2; + astc_results.m_partition_seed = g_astc_bc7_common_partitions2[common_pattern].m_astc; + astc_results.m_cem = (mode == 16) ? 
4 : 12; + + uint32_t part[2] = { 0, 1 }; + if (g_astc_bc7_common_partitions2[common_pattern].m_invert) + std::swap(part[0], part[1]); + + bool invert[2] = { false, false }; + + for (uint32_t p = 0; p < 2; p++) + { + if (mode == 16) + { + astc_results.m_endpoints[p * 4 + 0] = ccell_results[part[p]].m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[p * 4 + 1] = ccell_results[part[p]].m_astc_high_endpoint.m_c[0]; + + astc_results.m_endpoints[p * 4 + 2] = ccell_results[part[p]].m_astc_low_endpoint.m_c[3]; + astc_results.m_endpoints[p * 4 + 3] = ccell_results[part[p]].m_astc_high_endpoint.m_c[3]; + } + else + { + for (uint32_t c = 0; c < 4; c++) + { + astc_results.m_endpoints[p * 8 + c * 2] = ccell_results[part[p]].m_astc_low_endpoint.m_c[c]; + astc_results.m_endpoints[p * 8 + c * 2 + 1] = ccell_results[part[p]].m_astc_high_endpoint.m_c[c]; + } + + int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 0]].m_unquant + + g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 2]].m_unquant + + g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 4]].m_unquant; + + int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 1]].m_unquant + + g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 3]].m_unquant + + g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 5]].m_unquant; + + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[p * 8 + 0], astc_results.m_endpoints[p * 8 + 1]); + std::swap(astc_results.m_endpoints[p * 8 + 2], astc_results.m_endpoints[p * 8 + 3]); + std::swap(astc_results.m_endpoints[p * 8 + 4], astc_results.m_endpoints[p * 8 + 5]); + std::swap(astc_results.m_endpoints[p * 8 + 6], astc_results.m_endpoints[p * 8 + 7]); + invert[p] = true; + } + } + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t bc7_part = g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + + astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]]; + + uint32_t astc_part = bc7_part; + if (g_astc_bc7_common_partitions2[common_pattern].m_invert) + astc_part = 1 - astc_part; + + if (invert[astc_part]) + astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = mode; + pResults[total_results].m_common_pattern = common_pattern; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = total_err; + total_results++; + } + + } // common_pattern + } + + // MODE 10 + // DualPlane: 0, WeightRange: 8 (16), Subsets: 1, CEM: 12 (RGBA Direct ), EndpointRange: 13 (48) MODE6 + static void astc_mode10(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + const uint32_t weight_range = 8; + const uint32_t endpoint_range = 13; + + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, sizeof(ccell_params)); + + ccell_params.m_num_pixels = 16; + ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_num_selector_weights = 16; + ccell_params.m_pSelector_weights = g_astc_weights4; + ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_astc_weights4x; + ccell_params.m_astc_endpoint_range = endpoint_range; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + ccell_params.m_has_alpha = true; 
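+ // Mode 10 is a single-subset RGBA fit (CEM 12) with 4-bit (16-level) weights; m_has_alpha is set
+ // so the alpha channel is included in the fit instead of being ignored.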
+ + color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params); + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = weight_range; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 1; + astc_results.m_partition_seed = 0; + astc_results.m_cem = 12; + + astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2]; + astc_results.m_endpoints[6] = ccell_results.m_astc_low_endpoint.m_c[3]; + astc_results.m_endpoints[7] = ccell_results.m_astc_high_endpoint.m_c[3]; + + bool invert = false; + + int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + std::swap(astc_results.m_endpoints[6], astc_results.m_endpoints[7]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + if (invert) + astc_results.m_weights[x + y * 4] = 15 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 10; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = part_err; + total_results++; + } + } + + // 11. DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 12 (RGBA Direct), EndpointRange: 13 (48) MODE5 + // 17. DualPlane: 1, WeightRange : 2 (4), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256) BC7 MODE5 + static void astc_mode11_or_17(uint32_t mode, const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + assert((mode == 11) || (mode == 17)); + + const uint32_t weight_range = 2; + const uint32_t endpoint_range = (mode == 17) ? 20 : 13; + + bc7enc_compress_block_params local_comp_params(comp_params); + local_comp_params.m_perceptual = false; + local_comp_params.m_weights[0] = 1; + local_comp_params.m_weights[1] = 1; + local_comp_params.m_weights[2] = 1; + local_comp_params.m_weights[3] = 1; + + const uint32_t last_rot_comp = (mode == 17) ? 
1 : 4; + + for (uint32_t rot_comp = 0; rot_comp < last_rot_comp; rot_comp++) + { + color_quad_u8 block_rgb[16]; + color_quad_u8 block_a[16]; + for (uint32_t i = 0; i < 16; i++) + { + block_rgb[i] = ((color_quad_u8*)&block[0][0])[i]; + block_a[i] = block_rgb[i]; + + if (mode == 17) + { + assert(block_rgb[i].m_c[0] == block_rgb[i].m_c[1]); + assert(block_rgb[i].m_c[0] == block_rgb[i].m_c[2]); + + block_a[i].m_c[0] = block_rgb[i].m_c[3]; + block_a[i].m_c[1] = block_rgb[i].m_c[3]; + block_a[i].m_c[2] = block_rgb[i].m_c[3]; + block_a[i].m_c[3] = 255; + + block_rgb[i].m_c[1] = block_rgb[i].m_c[0]; + block_rgb[i].m_c[2] = block_rgb[i].m_c[0]; + block_rgb[i].m_c[3] = 255; + } + else + { + uint8_t c = block_a[i].m_c[rot_comp]; + block_a[i].m_c[0] = c; + block_a[i].m_c[1] = c; + block_a[i].m_c[2] = c; + block_a[i].m_c[3] = 255; + + block_rgb[i].m_c[rot_comp] = block_rgb[i].m_c[3]; + block_rgb[i].m_c[3] = 255; + } + } + + uint8_t ccell_result_selectors_temp[16]; + + color_cell_compressor_params ccell_params_rgb; + memset(&ccell_params_rgb, 0, sizeof(ccell_params_rgb)); + + ccell_params_rgb.m_num_pixels = 16; + ccell_params_rgb.m_pPixels = block_rgb; + ccell_params_rgb.m_num_selector_weights = 4; + ccell_params_rgb.m_pSelector_weights = g_bc7_weights2; + ccell_params_rgb.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params_rgb.m_astc_endpoint_range = endpoint_range; + ccell_params_rgb.m_weights[0] = 1; + ccell_params_rgb.m_weights[1] = 1; + ccell_params_rgb.m_weights[2] = 1; + ccell_params_rgb.m_weights[3] = 1; + + color_cell_compressor_results ccell_results_rgb; + uint8_t ccell_result_selectors_rgb[16]; + memset(&ccell_results_rgb, 0, sizeof(ccell_results_rgb)); + ccell_results_rgb.m_pSelectors = &ccell_result_selectors_rgb[0]; + ccell_results_rgb.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err_rgb = color_cell_compression(255, &ccell_params_rgb, &ccell_results_rgb, &local_comp_params); + + color_cell_compressor_params ccell_params_a; + memset(&ccell_params_a, 0, sizeof(ccell_params_a)); + + ccell_params_a.m_num_pixels = 16; + ccell_params_a.m_pPixels = block_a; + ccell_params_a.m_num_selector_weights = 4; + ccell_params_a.m_pSelector_weights = g_bc7_weights2; + ccell_params_a.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params_a.m_astc_endpoint_range = endpoint_range; + ccell_params_a.m_weights[0] = 1; + ccell_params_a.m_weights[1] = 1; + ccell_params_a.m_weights[2] = 1; + ccell_params_a.m_weights[3] = 1; + + color_cell_compressor_results ccell_results_a; + uint8_t ccell_result_selectors_a[16]; + memset(&ccell_results_a, 0, sizeof(ccell_results_a)); + ccell_results_a.m_pSelectors = &ccell_result_selectors_a[0]; + ccell_results_a.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err_a = color_cell_compression(255, &ccell_params_a, &ccell_results_a, &local_comp_params) / 3; + + uint64_t total_err = (mode == 17) ? ((part_err_rgb / 3) + part_err_a) : (part_err_rgb + part_err_a); + + // ASTC + astc_block_desc blk; + memset(&blk, 0, sizeof(blk)); + + blk.m_dual_plane = true; + blk.m_weight_range = weight_range; + + blk.m_ccs = (mode == 17) ? 3 : rot_comp; + blk.m_subsets = 1; + blk.m_partition_seed = 0; + blk.m_cem = (mode == 17) ? 
4 : 12; + + bool invert = false; + + if (mode == 17) + { + assert(ccell_results_rgb.m_astc_low_endpoint.m_c[0] == ccell_results_rgb.m_astc_low_endpoint.m_c[1]); + assert(ccell_results_rgb.m_astc_low_endpoint.m_c[0] == ccell_results_rgb.m_astc_low_endpoint.m_c[2]); + + assert(ccell_results_rgb.m_astc_high_endpoint.m_c[0] == ccell_results_rgb.m_astc_high_endpoint.m_c[1]); + assert(ccell_results_rgb.m_astc_high_endpoint.m_c[0] == ccell_results_rgb.m_astc_high_endpoint.m_c[2]); + + blk.m_endpoints[0] = ccell_results_rgb.m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[1] = ccell_results_rgb.m_astc_high_endpoint.m_c[0]; + + blk.m_endpoints[2] = ccell_results_a.m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[3] = ccell_results_a.m_astc_high_endpoint.m_c[0]; + } + else + { + blk.m_endpoints[0] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[1] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[0]; + blk.m_endpoints[2] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[1]; + blk.m_endpoints[3] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[1]; + blk.m_endpoints[4] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[2]; + blk.m_endpoints[5] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[2]; + if (rot_comp == 3) + { + blk.m_endpoints[6] = ccell_results_a.m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[7] = ccell_results_a.m_astc_high_endpoint.m_c[0]; + } + else + { + blk.m_endpoints[6] = ccell_results_rgb.m_astc_low_endpoint.m_c[rot_comp]; + blk.m_endpoints[7] = ccell_results_rgb.m_astc_high_endpoint.m_c[rot_comp]; + } + + int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(blk.m_endpoints[0], blk.m_endpoints[1]); + std::swap(blk.m_endpoints[2], blk.m_endpoints[3]); + std::swap(blk.m_endpoints[4], blk.m_endpoints[5]); + std::swap(blk.m_endpoints[6], blk.m_endpoints[7]); + invert = true; + } + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t rgb_index = ccell_result_selectors_rgb[x + y * 4]; + uint32_t a_index = ccell_result_selectors_a[x + y * 4]; + + if (invert) + { + rgb_index = 3 - rgb_index; + a_index = 3 - a_index; + } + + blk.m_weights[(x + y * 4) * 2 + 0] = (uint8_t)rgb_index; + blk.m_weights[(x + y * 4) * 2 + 1] = (uint8_t)a_index; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = mode; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = blk; + pResults[total_results].m_astc_err = total_err; + total_results++; + } + } // rot_comp + } + + // MODE 12 + // DualPlane: 0, WeightRange: 5 (8), Subsets: 1, CEM: 12 (RGBA Direct ), EndpointRange: 19 (192) MODE6 + static void astc_mode12(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + const uint32_t weight_range = 5; + const uint32_t endpoint_range = 19; + + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, 
sizeof(ccell_params)); + + ccell_params.m_num_pixels = 16; + ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_num_selector_weights = 8; + ccell_params.m_pSelector_weights = g_bc7_weights3; + ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights3x; + ccell_params.m_astc_endpoint_range = endpoint_range; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + ccell_params.m_has_alpha = true; + + color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params); + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = weight_range; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 1; + astc_results.m_partition_seed = 0; + astc_results.m_cem = 12; + + astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2]; + astc_results.m_endpoints[6] = ccell_results.m_astc_low_endpoint.m_c[3]; + astc_results.m_endpoints[7] = ccell_results.m_astc_high_endpoint.m_c[3]; + + bool invert = false; + + int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + std::swap(astc_results.m_endpoints[6], astc_results.m_endpoints[7]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + if (invert) + astc_results.m_weights[x + y * 4] = 7 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 12; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = part_err; + total_results++; + } + } + + // 13. 
DualPlane: 1, WeightRange: 0 (2), Subsets: 1, CEM: 12 (RGBA Direct ), EndpointRange: 20 (256) MODE5 + static void astc_mode13(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + bc7enc_compress_block_params local_comp_params(comp_params); + local_comp_params.m_perceptual = false; + local_comp_params.m_weights[0] = 1; + local_comp_params.m_weights[1] = 1; + local_comp_params.m_weights[2] = 1; + local_comp_params.m_weights[3] = 1; + + for (uint32_t rot_comp = 0; rot_comp < 4; rot_comp++) + { + const uint32_t weight_range = 0; + const uint32_t endpoint_range = 20; + + color_quad_u8 block_rgb[16]; + color_quad_u8 block_a[16]; + for (uint32_t i = 0; i < 16; i++) + { + block_rgb[i] = ((color_quad_u8*)&block[0][0])[i]; + block_a[i] = block_rgb[i]; + + uint8_t c = block_a[i].m_c[rot_comp]; + block_a[i].m_c[0] = c; + block_a[i].m_c[1] = c; + block_a[i].m_c[2] = c; + block_a[i].m_c[3] = 255; + + block_rgb[i].m_c[rot_comp] = block_rgb[i].m_c[3]; + block_rgb[i].m_c[3] = 255; + } + + uint8_t ccell_result_selectors_temp[16]; + + color_cell_compressor_params ccell_params_rgb; + memset(&ccell_params_rgb, 0, sizeof(ccell_params_rgb)); + + ccell_params_rgb.m_num_pixels = 16; + ccell_params_rgb.m_pPixels = block_rgb; + ccell_params_rgb.m_num_selector_weights = 2; + ccell_params_rgb.m_pSelector_weights = g_bc7_weights1; + ccell_params_rgb.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights1x; + ccell_params_rgb.m_astc_endpoint_range = endpoint_range; + ccell_params_rgb.m_weights[0] = 1; + ccell_params_rgb.m_weights[1] = 1; + ccell_params_rgb.m_weights[2] = 1; + ccell_params_rgb.m_weights[3] = 1; + + color_cell_compressor_results ccell_results_rgb; + uint8_t ccell_result_selectors_rgb[16]; + memset(&ccell_results_rgb, 0, sizeof(ccell_results_rgb)); + ccell_results_rgb.m_pSelectors = &ccell_result_selectors_rgb[0]; + ccell_results_rgb.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err_rgb = color_cell_compression(255, &ccell_params_rgb, &ccell_results_rgb, &local_comp_params); + + color_cell_compressor_params ccell_params_a; + memset(&ccell_params_a, 0, sizeof(ccell_params_a)); + + ccell_params_a.m_num_pixels = 16; + ccell_params_a.m_pPixels = block_a; + ccell_params_a.m_num_selector_weights = 2; + ccell_params_a.m_pSelector_weights = g_bc7_weights1; + ccell_params_a.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights1x; + ccell_params_a.m_astc_endpoint_range = endpoint_range; + ccell_params_a.m_weights[0] = 1; + ccell_params_a.m_weights[1] = 1; + ccell_params_a.m_weights[2] = 1; + ccell_params_a.m_weights[3] = 1; + + color_cell_compressor_results ccell_results_a; + uint8_t ccell_result_selectors_a[16]; + memset(&ccell_results_a, 0, sizeof(ccell_results_a)); + ccell_results_a.m_pSelectors = &ccell_result_selectors_a[0]; + ccell_results_a.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err_a = color_cell_compression(255, &ccell_params_a, &ccell_results_a, &local_comp_params) / 3; + + uint64_t total_err = part_err_rgb + part_err_a; + + // ASTC + astc_block_desc blk; + memset(&blk, 0, sizeof(blk)); + + blk.m_dual_plane = true; + blk.m_weight_range = weight_range; + + blk.m_ccs = rot_comp; + blk.m_subsets = 1; + blk.m_partition_seed = 0; + blk.m_cem = 12; + + blk.m_endpoints[0] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[1] = (rot_comp == 0 ? 
ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[0]; + blk.m_endpoints[2] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[1]; + blk.m_endpoints[3] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[1]; + blk.m_endpoints[4] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[2]; + blk.m_endpoints[5] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[2]; + if (rot_comp == 3) + { + blk.m_endpoints[6] = ccell_results_a.m_astc_low_endpoint.m_c[0]; + blk.m_endpoints[7] = ccell_results_a.m_astc_high_endpoint.m_c[0]; + } + else + { + blk.m_endpoints[6] = ccell_results_rgb.m_astc_low_endpoint.m_c[rot_comp]; + blk.m_endpoints[7] = ccell_results_rgb.m_astc_high_endpoint.m_c[rot_comp]; + } + + bool invert = false; + + int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(blk.m_endpoints[0], blk.m_endpoints[1]); + std::swap(blk.m_endpoints[2], blk.m_endpoints[3]); + std::swap(blk.m_endpoints[4], blk.m_endpoints[5]); + std::swap(blk.m_endpoints[6], blk.m_endpoints[7]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t rgb_index = ccell_result_selectors_rgb[x + y * 4]; + uint32_t a_index = ccell_result_selectors_a[x + y * 4]; + + if (invert) + { + rgb_index = 1 - rgb_index; + a_index = 1 - a_index; + } + + blk.m_weights[(x + y * 4) * 2 + 0] = (uint8_t)rgb_index; + blk.m_weights[(x + y * 4) * 2 + 1] = (uint8_t)a_index; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 13; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = blk; + pResults[total_results].m_astc_err = total_err; + total_results++; + } + } // rot_comp + } + + // MODE14 + // DualPlane: 0, WeightRange: 2 (4), Subsets: 1, CEM: 12 (RGBA Direct ), EndpointRange: 20 (256) MODE6 + static void astc_mode14(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + const uint32_t weight_range = 2; + const uint32_t endpoint_range = 20; + + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, sizeof(ccell_params)); + + ccell_params.m_num_pixels = 16; + ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_num_selector_weights = 4; + ccell_params.m_pSelector_weights = g_bc7_weights2; + ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x; + ccell_params.m_astc_endpoint_range = endpoint_range; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + ccell_params.m_has_alpha = true; + + color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, 
&comp_params); + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = weight_range; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 1; + astc_results.m_partition_seed = 0; + astc_results.m_cem = 12; + + astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1]; + astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1]; + astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2]; + astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2]; + astc_results.m_endpoints[6] = ccell_results.m_astc_low_endpoint.m_c[3]; + astc_results.m_endpoints[7] = ccell_results.m_astc_high_endpoint.m_c[3]; + + bool invert = false; + + int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant; + int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant; + if (s1 < s0) + { + std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]); + std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]); + std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]); + std::swap(astc_results.m_endpoints[6], astc_results.m_endpoints[7]); + invert = true; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + if (invert) + astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; + } + } + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 14; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = part_err; + total_results++; + } + } + + // MODE 15 + // DualPlane: 0, WeightRange : 8 (16), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256) BC7 MODE6 + static void astc_mode15(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params) + { + const uint32_t weight_range = 8; + const uint32_t endpoint_range = 20; + + color_cell_compressor_params ccell_params; + memset(&ccell_params, 0, sizeof(ccell_params)); + + color_rgba temp_block[16]; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t l = ((const color_rgba*)block)[i].r; + const uint32_t a = ((const color_rgba*)block)[i].a; + + // Use (l,0,0,a) not (l,l,l,a) so both components are treated equally. 
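+ // With (l,l,l,a) the luma error would be counted three times (once per color channel); zeroing
+ // G and B leaves an effective error of err(L) + err(A), weighting both channels equally.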
+ temp_block[i].set_noclamp_rgba(l, 0, 0, a); + } + + ccell_params.m_num_pixels = 16; + //ccell_params.m_pPixels = (color_quad_u8*)&block[0][0]; + ccell_params.m_pPixels = (color_quad_u8*)temp_block; + ccell_params.m_num_selector_weights = 16; + ccell_params.m_pSelector_weights = g_astc_weights4; + ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_astc_weights4x; + ccell_params.m_astc_endpoint_range = endpoint_range; + ccell_params.m_weights[0] = 1; + ccell_params.m_weights[1] = 1; + ccell_params.m_weights[2] = 1; + ccell_params.m_weights[3] = 1; + ccell_params.m_has_alpha = true; + + color_cell_compressor_results ccell_results; + uint8_t ccell_result_selectors[16]; + uint8_t ccell_result_selectors_temp[16]; + memset(&ccell_results, 0, sizeof(ccell_results)); + ccell_results.m_pSelectors = &ccell_result_selectors[0]; + ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0]; + + color_cell_compression(255, &ccell_params, &ccell_results, &comp_params); + + // ASTC + astc_block_desc astc_results; + memset(&astc_results, 0, sizeof(astc_results)); + + astc_results.m_dual_plane = false; + astc_results.m_weight_range = weight_range; + + astc_results.m_ccs = 0; + astc_results.m_subsets = 1; + astc_results.m_partition_seed = 0; + astc_results.m_cem = 4; + + astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0]; + astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0]; + + astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[3]; + astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[3]; + + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4]; + + color_rgba colors[16]; + for (uint32_t c = 0; c < 4; c++) + { + colors[0].m_comps[c] = g_astc_unquant[endpoint_range][ccell_results.m_astc_low_endpoint.m_c[(c < 3) ? 0 : 3]].m_unquant; + colors[15].m_comps[c] = g_astc_unquant[endpoint_range][ccell_results.m_astc_high_endpoint.m_c[(c < 3) ? 
0 : 3]].m_unquant; + } + + for (uint32_t i = 1; i < 16 - 1; i++) + for (uint32_t c = 0; c < 4; c++) + colors[i].m_comps[c] = (uint8_t)astc_interpolate(colors[0].m_comps[c], colors[15].m_comps[c], g_astc_weights4[i], false); + + uint64_t total_err = 0; + for (uint32_t p = 0; p < 16; p++) + total_err += color_distance_la(((const color_rgba*)block)[p], colors[ccell_result_selectors[p]]); + + assert(total_results < MAX_ENCODE_RESULTS); + if (total_results < MAX_ENCODE_RESULTS) + { + pResults[total_results].m_uastc_mode = 15; + pResults[total_results].m_common_pattern = 0; + pResults[total_results].m_astc = astc_results; + pResults[total_results].m_astc_err = total_err; + total_results++; + } + } + + static void compute_block_error(const color_rgba block[4][4], const color_rgba decoded_block[4][4], uint64_t &total_rgb_err, uint64_t &total_rgba_err, uint64_t &total_la_err) + { + uint64_t total_err_r = 0, total_err_g = 0, total_err_b = 0, total_err_a = 0; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int dr = (int)block[y][x].m_comps[0] - (int)decoded_block[y][x].m_comps[0]; + const int dg = (int)block[y][x].m_comps[1] - (int)decoded_block[y][x].m_comps[1]; + const int db = (int)block[y][x].m_comps[2] - (int)decoded_block[y][x].m_comps[2]; + const int da = (int)block[y][x].m_comps[3] - (int)decoded_block[y][x].m_comps[3]; + + total_err_r += dr * dr; + total_err_g += dg * dg; + total_err_b += db * db; + total_err_a += da * da; + } + } + + total_la_err = total_err_r + total_err_a; + total_rgb_err = total_err_r + total_err_g + total_err_b; + total_rgba_err = total_rgb_err + total_err_a; + } + + static void compute_bc1_hints(bool &bc1_hint0, bool &bc1_hint1, const uastc_encode_results &best_results, const color_rgba block[4][4], const color_rgba decoded_uastc_block[4][4]) + { + const uint32_t best_mode = best_results.m_uastc_mode; + const bool perceptual = false; + + bc1_hint0 = false; + bc1_hint1 = false; + + if (best_mode == UASTC_MODE_INDEX_SOLID_COLOR) + return; + + if (!g_uastc_mode_has_bc1_hint0[best_mode] && !g_uastc_mode_has_bc1_hint1[best_mode]) + return; + + color_rgba tblock_bc1[4][4]; + dxt1_block tbc1_block[8]; + basist::encode_bc1(tbc1_block, (const uint8_t*)&decoded_uastc_block[0][0], 0); + unpack_block(texture_format::cBC1, tbc1_block, &tblock_bc1[0][0]); + + color_rgba tblock_hint0_bc1[4][4]; + color_rgba tblock_hint1_bc1[4][4]; + + etc_block etc1_blk; + memset(&etc1_blk, 0, sizeof(etc1_blk)); + + eac_a8_block etc2_blk; + memset(&etc2_blk, 0, sizeof(etc2_blk)); + etc2_blk.m_multiplier = 1; + + // Pack to UASTC, then unpack, because the endpoints may be swapped. 
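+ // pack_uastc may reorder/canonicalize the endpoints while encoding, so round-tripping through
+ // pack/unpack below gives the hint transcoders the ASTC data exactly as a decoder will see it.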
+ + uastc_block temp_ublock; + pack_uastc(temp_ublock, best_results, etc1_blk, 0, etc2_blk, false, false); + + unpacked_uastc_block temp_ublock_unpacked; + unpack_uastc(temp_ublock, temp_ublock_unpacked, false); + + unpacked_uastc_block ublock; + memset(&ublock, 0, sizeof(ublock)); + ublock.m_mode = best_results.m_uastc_mode; + ublock.m_common_pattern = best_results.m_common_pattern; + ublock.m_astc = temp_ublock_unpacked.m_astc; + + dxt1_block b; + + // HINT1 + if (!g_uastc_mode_has_bc1_hint1[best_mode]) + { + memset(tblock_hint1_bc1, 0, sizeof(tblock_hint1_bc1)); + } + else + { + transcode_uastc_to_bc1_hint1(ublock, (color32 (*)[4]) decoded_uastc_block, &b, false); + + unpack_block(texture_format::cBC1, &b, &tblock_hint1_bc1[0][0]); + } + + // HINT0 + if (!g_uastc_mode_has_bc1_hint0[best_mode]) + { + memset(tblock_hint0_bc1, 0, sizeof(tblock_hint0_bc1)); + } + else + { + transcode_uastc_to_bc1_hint0(ublock, &b); + + unpack_block(texture_format::cBC1, &b, &tblock_hint0_bc1[0][0]); + } + + // Compute block errors + uint64_t total_t_err = 0, total_hint0_err = 0, total_hint1_err = 0; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + total_t_err += color_distance(perceptual, block[y][x], tblock_bc1[y][x], false); + total_hint0_err += color_distance(perceptual, block[y][x], tblock_hint0_bc1[y][x], false); + total_hint1_err += color_distance(perceptual, block[y][x], tblock_hint1_bc1[y][x], false); + } + } + + const float t_err = sqrtf((float)total_t_err); + const float t_err_hint0 = sqrtf((float)total_hint0_err); + const float t_err_hint1 = sqrtf((float)total_hint1_err); + + const float err_thresh0 = 1.075f; + const float err_thresh1 = 1.075f; + + if ((g_uastc_mode_has_bc1_hint0[best_mode]) && (t_err_hint0 <= t_err * err_thresh0)) + bc1_hint0 = true; + + if ((g_uastc_mode_has_bc1_hint1[best_mode]) && (t_err_hint1 <= t_err * err_thresh1)) + bc1_hint1 = true; + } + + struct ycbcr + { + int32_t m_y; + int32_t m_cb; + int32_t m_cr; + }; + + static inline void rgb_to_y_cb_cr(const color_rgba& c, ycbcr& dst) + { + const int y = c.r * 54 + c.g * 183 + c.b * 19; + dst.m_y = y; + dst.m_cb = (c.b << 8) - y; + dst.m_cr = (c.r << 8) - y; + } + + static inline uint64_t color_diff(const ycbcr& a, const ycbcr& b) + { + const int y_delta = a.m_y - b.m_y; + const int cb_delta = a.m_cb - b.m_cb; + const int cr_delta = a.m_cr - b.m_cr; + return ((int64_t)y_delta * y_delta * 4) + ((int64_t)cr_delta * cr_delta) + ((int64_t)cb_delta * cb_delta); + } + + static inline int gray_distance2(const color_rgba& c, int r, int g, int b) + { + int gray_dist = (((int)c[0] - r) + ((int)c[1] - g) + ((int)c[2] - b) + 1) / 3; + + int gray_point_r = clamp255(r + gray_dist); + int gray_point_g = clamp255(g + gray_dist); + int gray_point_b = clamp255(b + gray_dist); + + int dist_to_gray_point_r = c[0] - gray_point_r; + int dist_to_gray_point_g = c[1] - gray_point_g; + int dist_to_gray_point_b = c[2] - gray_point_b; + + return (dist_to_gray_point_r * dist_to_gray_point_r) + (dist_to_gray_point_g * dist_to_gray_point_g) + (dist_to_gray_point_b * dist_to_gray_point_b); + } + + static bool pack_etc1_estimate_flipped(const color_rgba* pSrc_pixels) + { + int sums[3][2][2]; + +#define GET_XY(x, y, c) pSrc_pixels[(x) + ((y) * 4)][c] + + for (uint32_t c = 0; c < 3; c++) + { + sums[c][0][0] = GET_XY(0, 0, c) + GET_XY(0, 1, c) + GET_XY(1, 0, c) + GET_XY(1, 1, c); + sums[c][1][0] = GET_XY(2, 0, c) + GET_XY(2, 1, c) + GET_XY(3, 0, c) + GET_XY(3, 1, c); + sums[c][0][1] = GET_XY(0, 2, c) + GET_XY(0, 3, c) + GET_XY(1, 2, 
c) + GET_XY(1, 3, c); + sums[c][1][1] = GET_XY(2, 2, c) + GET_XY(2, 3, c) + GET_XY(3, 2, c) + GET_XY(3, 3, c); + } + + int upper_avg[3], lower_avg[3], left_avg[3], right_avg[3]; + for (uint32_t c = 0; c < 3; c++) + { + upper_avg[c] = (sums[c][0][0] + sums[c][1][0] + 4) / 8; + lower_avg[c] = (sums[c][0][1] + sums[c][1][1] + 4) / 8; + left_avg[c] = (sums[c][0][0] + sums[c][0][1] + 4) / 8; + right_avg[c] = (sums[c][1][0] + sums[c][1][1] + 4) / 8; + } + +#undef GET_XY +#define GET_XY(x, y, a) gray_distance2(pSrc_pixels[(x) + ((y) * 4)], a[0], a[1], a[2]) + + int upper_gray_dist = 0, lower_gray_dist = 0, left_gray_dist = 0, right_gray_dist = 0; + for (uint32_t i = 0; i < 4; i++) + { + for (uint32_t j = 0; j < 2; j++) + { + upper_gray_dist += GET_XY(i, j, upper_avg); + lower_gray_dist += GET_XY(i, 2 + j, lower_avg); + left_gray_dist += GET_XY(j, i, left_avg); + right_gray_dist += GET_XY(2 + j, i, right_avg); + } + } + +#undef GET_XY + + int upper_lower_sum = upper_gray_dist + lower_gray_dist; + int left_right_sum = left_gray_dist + right_gray_dist; + + return upper_lower_sum < left_right_sum; + } + + static void compute_etc1_hints(etc_block& best_etc1_blk, uint32_t& best_etc1_bias, const uastc_encode_results& best_results, const color_rgba block[4][4], const color_rgba decoded_uastc_block[4][4], int level, uint32_t flags) + { + best_etc1_bias = 0; + + if (best_results.m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + pack_etc1_block_solid_color(best_etc1_blk, &best_results.m_solid_color.m_comps[0]); + return; + } + + const bool faster_etc1 = (flags & cPackUASTCETC1FasterHints) != 0; + const bool fastest_etc1 = (flags & cPackUASTCETC1FastestHints) != 0; + + const bool has_bias = g_uastc_mode_has_etc1_bias[best_results.m_uastc_mode]; + + // 0 should be at the top, but we need 13 first because it represents bias (0,0,0). + const uint8_t s_sorted_bias_modes[32] = { 13, 0, 22, 29, 27, 12, 26, 9, 30, 31, 8, 10, 25, 2, 23, 5, 15, 7, 3, 11, 6, 17, 28, 18, 1, 19, 20, 21, 24, 4, 14, 16 }; + + uint32_t last_bias = 1; + bool use_faster_bias_mode_table = false; + const bool flip_estimate = (level <= cPackUASTCLevelFaster) || (faster_etc1) || (fastest_etc1); + if (has_bias) + { + switch (level) + { + case cPackUASTCLevelFastest: + { + last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 1 : 2); + use_faster_bias_mode_table = true; + break; + } + case cPackUASTCLevelFaster: + { + last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 3 : 5); + use_faster_bias_mode_table = true; + break; + } + case cPackUASTCLevelDefault: + { + last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 10 : 20); + use_faster_bias_mode_table = true; + break; + } + case cPackUASTCLevelSlower: + { + last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 
16 : 32); + use_faster_bias_mode_table = true; + break; + } + default: + { + last_bias = 32; + break; + } + } + } + + memset(&best_etc1_blk, 0, sizeof(best_etc1_blk)); + uint64_t best_err = UINT64_MAX; + + etc_block trial_block; + memset(&trial_block, 0, sizeof(trial_block)); + + ycbcr block_ycbcr[4][4], decoded_uastc_block_ycbcr[4][4]; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + rgb_to_y_cb_cr(block[y][x], block_ycbcr[y][x]); + rgb_to_y_cb_cr(decoded_uastc_block[y][x], decoded_uastc_block_ycbcr[y][x]); + } + } + + uint32_t first_flip = 0, last_flip = 2; + uint32_t first_individ = 0, last_individ = 2; + + if (flags & cPackUASTCETC1DisableFlipAndIndividual) + { + last_flip = 1; + last_individ = 1; + } + else if (flip_estimate) + { + if (pack_etc1_estimate_flipped(&decoded_uastc_block[0][0])) + first_flip = 1; + last_flip = first_flip + 1; + } + + for (uint32_t flip = first_flip; flip < last_flip; flip++) + { + trial_block.set_flip_bit(flip != 0); + + for (uint32_t individ = first_individ; individ < last_individ; individ++) + { + const uint32_t mul = individ ? 15 : 31; + + trial_block.set_diff_bit(individ == 0); + + color_rgba unbiased_block_colors[2]; + + int min_r[2] = { 255, 255 }, min_g[2] = { 255, 255 }, min_b[2] = { 255, 255 }, max_r[2] = { 0, 0 }, max_g[2] = { 0, 0 }, max_b[2] = { 0, 0 }; + + for (uint32_t subset = 0; subset < 2; subset++) + { + uint32_t avg_color[3]; + memset(avg_color, 0, sizeof(avg_color)); + + for (uint32_t j = 0; j < 8; j++) + { + const etc_coord2 &c = g_etc1_pixel_coords[flip][subset][j]; + const color_rgba& p = decoded_uastc_block[c.m_y][c.m_x]; + + avg_color[0] += p.r; + avg_color[1] += p.g; + avg_color[2] += p.b; + + min_r[subset] = basisu::minimum<uint32_t>(min_r[subset], p.r); + min_g[subset] = basisu::minimum<uint32_t>(min_g[subset], p.g); + min_b[subset] = basisu::minimum<uint32_t>(min_b[subset], p.b); + + max_r[subset] = basisu::maximum<uint32_t>(max_r[subset], p.r); + max_g[subset] = basisu::maximum<uint32_t>(max_g[subset], p.g); + max_b[subset] = basisu::maximum<uint32_t>(max_b[subset], p.b); + } // j + + unbiased_block_colors[subset][0] = (uint8_t)((avg_color[0] * mul + 1020) / (8 * 255)); + unbiased_block_colors[subset][1] = (uint8_t)((avg_color[1] * mul + 1020) / (8 * 255)); + unbiased_block_colors[subset][2] = (uint8_t)((avg_color[2] * mul + 1020) / (8 * 255)); + unbiased_block_colors[subset][3] = 0; + + } // subset + + for (uint32_t bias_iter = 0; bias_iter < last_bias; bias_iter++) + { + const uint32_t bias = use_faster_bias_mode_table ? s_sorted_bias_modes[bias_iter] : bias_iter; + + color_rgba block_colors[2]; + for (uint32_t subset = 0; subset < 2; subset++) + block_colors[subset] = has_bias ? 
apply_etc1_bias((color32&)unbiased_block_colors[subset], bias, mul, subset) : unbiased_block_colors[subset]; + + if (individ) + trial_block.set_block_color4(block_colors[0], block_colors[1]); + else + trial_block.set_block_color5_clamp(block_colors[0], block_colors[1]); + + uint32_t range[2]; + for (uint32_t subset = 0; subset < 2; subset++) + { + const color_rgba base_c(trial_block.get_block_color(subset, true)); + + const int pos_r = iabs(max_r[subset] - base_c.r); + const int neg_r = iabs(base_c.r - min_r[subset]); + + const int pos_g = iabs(max_g[subset] - base_c.g); + const int neg_g = iabs(base_c.g - min_g[subset]); + + const int pos_b = iabs(max_b[subset] - base_c.b); + const int neg_b = iabs(base_c.b - min_b[subset]); + + range[subset] = maximum(maximum(pos_r, neg_r, pos_g, neg_g), pos_b, neg_b); + } + + uint32_t best_inten_table[2] = { 0, 0 }; + + for (uint32_t subset = 0; subset < 2; subset++) + { + uint64_t best_subset_err = UINT64_MAX; + + const uint32_t inten_table_limit = (level == cPackUASTCLevelVerySlow) ? 8 : ((range[subset] > 51) ? 8 : (range[subset] >= 7 ? 4 : 2)); + + for (uint32_t inten_table = 0; inten_table < inten_table_limit; inten_table++) + { + trial_block.set_inten_table(subset, inten_table); + + color_rgba color_table[4]; + trial_block.get_block_colors(color_table, subset); + + ycbcr color_table_ycbcr[4]; + for (uint32_t i = 0; i < 4; i++) + rgb_to_y_cb_cr(color_table[i], color_table_ycbcr[i]); + + uint64_t total_error = 0; + if (flip) + { + for (uint32_t y = 0; y < 2; y++) + { + { + const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][0]; + total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c)); + } + { + const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][1]; + total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c)); + } + { + const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][2]; + total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c)); + } + { + const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][3]; + total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c)); + } + if (total_error >= best_subset_err) + break; + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + { + const ycbcr& c = decoded_uastc_block_ycbcr[y][subset * 2 + 0]; + total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c)); + } + { + const ycbcr& c = decoded_uastc_block_ycbcr[y][subset * 2 + 1]; + total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c)); + } + } + if (total_error >= best_subset_err) + break; + } + + if (total_error < best_subset_err) + { + best_subset_err = total_error; + best_inten_table[subset] = inten_table; + } + + } // inten_table + + } // subset + + trial_block.set_inten_table(0, best_inten_table[0]); + trial_block.set_inten_table(1, best_inten_table[1]); + + // Compute error against the ORIGINAL block. 
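+					// The selector used for this error estimate is chosen against the *decoded UASTC* block
+					// (modeling what a transcoder working from the UASTC data would pick), while the resulting
+					// table color is compared against the original pixels. Packing the 2-bit selector index into
+					// the low bits of the shifted distance lets a single minimum() call return both at once.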
+ uint64_t err = 0; + + for (uint32_t subset = 0; subset < 2; subset++) + { + color_rgba color_table[4]; + trial_block.get_block_colors(color_table, subset); + + ycbcr color_table_ycbcr[4]; + for (uint32_t i = 0; i < 4; i++) + rgb_to_y_cb_cr(color_table[i], color_table_ycbcr[i]); + + if (flip) + { + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][x]; + const uint64_t best_index_err = minimum(color_diff(color_table_ycbcr[0], c) << 2, (color_diff(color_table_ycbcr[1], c) << 2) + 1, (color_diff(color_table_ycbcr[2], c) << 2) + 2, (color_diff(color_table_ycbcr[3], c) << 2) + 3); + + const uint32_t best_index = (uint32_t)best_index_err & 3; + err += color_diff(block_ycbcr[subset * 2 + y][x], color_table_ycbcr[best_index]); + } + if (err >= best_err) + break; + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const ycbcr& c = decoded_uastc_block_ycbcr[y][subset * 2 + x]; + const uint64_t best_index_err = minimum(color_diff(color_table_ycbcr[0], c) << 2, (color_diff(color_table_ycbcr[1], c) << 2) + 1, (color_diff(color_table_ycbcr[2], c) << 2) + 2, (color_diff(color_table_ycbcr[3], c) << 2) + 3); + + const uint32_t best_index = (uint32_t)best_index_err & 3; + err += color_diff(block_ycbcr[y][subset * 2 + x], color_table_ycbcr[best_index]); + } + if (err >= best_err) + break; + } + } + + } // subset + + if (err < best_err) + { + best_err = err; + + best_etc1_blk = trial_block; + best_etc1_bias = bias; + } + + } // bias_iter + + } // individ + + } // flip + } + + struct uastc_pack_eac_a8_results + { + uint32_t m_base; + uint32_t m_table; + uint32_t m_multiplier; + }; + + static uint64_t uastc_pack_eac_a8(uastc_pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask) + { + assert(num_pixels <= 16); + + uint32_t min_alpha = 255, max_alpha = 0; + for (uint32_t i = 0; i < num_pixels; i++) + { + const uint32_t a = pPixels[i]; + if (a < min_alpha) min_alpha = a; + if (a > max_alpha) max_alpha = a; + } + + if (min_alpha == max_alpha) + { + results.m_base = min_alpha; + results.m_table = 13; + results.m_multiplier = 1; + return 0; + } + + const uint32_t alpha_range = max_alpha - min_alpha; + + uint64_t best_err = UINT64_MAX; + + for (uint32_t table = 0; table < 16; table++) + { + if ((table_mask & (1U << table)) == 0) + continue; + + const float range = (float)(g_etc2_eac_tables[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_etc2_eac_tables[table][ETC2_EAC_MIN_VALUE_SELECTOR]); + const int center = (int)roundf(lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_etc2_eac_tables[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range)); + + const int base_min = clamp255(center - base_search_rad); + const int base_max = clamp255(center + base_search_rad); + + const int mul = (int)roundf(alpha_range / range); + const int mul_low = clamp<int>(mul - mul_search_rad, 1, 15); + const int mul_high = clamp<int>(mul + mul_search_rad, 1, 15); + + for (int base = base_min; base <= base_max; base++) + { + for (int multiplier = mul_low; multiplier <= mul_high; multiplier++) + { + uint64_t total_err = 0; + + for (uint32_t i = 0; i < num_pixels; i++) + { + const int a = pPixels[i]; + + uint32_t best_s_err = UINT32_MAX; + //uint32_t best_s = 0; + for (uint32_t s = 0; s < 8; s++) + { + const int v = clamp255((int)multiplier * g_etc2_eac_tables[table][s] + (int)base); + + uint32_t err = iabs(a - v); + if (err < 
best_s_err) + { + best_s_err = err; + //best_s = s; + } + } + + total_err += best_s_err * best_s_err; + if (total_err >= best_err) + break; + } + + if (total_err < best_err) + { + best_err = total_err; + results.m_base = base; + results.m_multiplier = multiplier; + results.m_table = table; + if (!best_err) + return best_err; + } + + } // table + + } // multiplier + + } // base + + return best_err; + } + + const int32_t DEFAULT_BC7_ERROR_WEIGHT = 50; + const float UASTC_ERROR_THRESH = 1.3f; + + // TODO: This is a quick hack to favor certain modes when we know we'll be followed up with an RDO postprocess. + static inline float get_uastc_mode_weight(uint32_t mode) + { + const float FAVORED_MODE_WEIGHT = .8f; + + switch (mode) + { + case 0: + case 10: + return FAVORED_MODE_WEIGHT; + default: + break; + } + + return 1.0f; + } + + void encode_uastc(const uint8_t* pRGBAPixels, uastc_block& output_block, uint32_t flags) + { +// printf("encode_uastc: \n"); +// for (int i = 0; i < 16; i++) +// printf("[%u %u %u %u] ", pRGBAPixels[i * 4 + 0], pRGBAPixels[i * 4 + 1], pRGBAPixels[i * 4 + 2], pRGBAPixels[i * 4 + 3]); +// printf("\n"); + + const color_rgba(*block)[4] = reinterpret_cast<const color_rgba(*)[4]>(pRGBAPixels); + + bool solid_color = true, has_alpha = false, is_la = true; + + const color_rgba first_color(block[0][0]); + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + if (block[y][x].a < 255) + has_alpha = true; + + if (block[y][x] != first_color) + solid_color = false; + + if ((block[y][x].r != block[y][x].g) || (block[y][x].r != block[y][x].b)) + is_la = false; + } + } + + if (solid_color) + { + // Solid color blocks are so common that we handle them specially and as quickly as we can. + uastc_encode_results solid_results; + solid_results.m_uastc_mode = UASTC_MODE_INDEX_SOLID_COLOR; + solid_results.m_astc_err = 0; + solid_results.m_common_pattern = 0; + solid_results.m_solid_color = first_color; + memset(&solid_results.m_astc, 0, sizeof(solid_results.m_astc)); + + etc_block etc1_blk; + uint32_t etc1_bias = 0; + + pack_etc1_block_solid_color(etc1_blk, &first_color.m_comps[0]); + + eac_a8_block eac_a8_blk; + eac_a8_blk.m_table = 0; + eac_a8_blk.m_multiplier = 1; + + pack_uastc(output_block, solid_results, etc1_blk, etc1_bias, eac_a8_blk, false, false); + +// printf(" Solid\n"); + + return; + } + + int level = flags & 7; + const bool favor_uastc_error = (flags & cPackUASTCFavorUASTCError) != 0; + const bool favor_bc7_error = !favor_uastc_error && ((flags & cPackUASTCFavorBC7Error) != 0); + //const bool etc1_perceptual = true; + + uastc_encode_results results[MAX_ENCODE_RESULTS]; + + level = clampi(level, cPackUASTCLevelFastest, cPackUASTCLevelVerySlow); + + // Set all options to slowest, then configure from there depending on the selected level. 
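+		// mode_mask gates which astc_mode*() encoders are tried below, uber_level and least_squares_passes
+		// are handed to the BC7 encoder through bc7enc_compress_block_params, and the eac_a8_* values bound
+		// the ETC2 EAC A8 hint search. The lower levels shrink all of these; cPackUASTCLevelVerySlow keeps
+		// the exhaustive defaults set here.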
+ uint32_t mode_mask = UINT32_MAX; + uint32_t uber_level = 6; + bool estimate_partition = false; + bool always_try_alpha_modes = true; + uint32_t eac_a8_mul_search_rad = 3; + uint32_t eac_a8_table_mask = UINT32_MAX; + uint32_t least_squares_passes = 2; + bool bc1_hints = true; + bool only_use_la_on_transparent_blocks = false; + + switch (level) + { + case cPackUASTCLevelFastest: + { + mode_mask = (1 << 0) | (1 << 8) | + (1 << 11) | (1 << 12) | + (1 << 15); + always_try_alpha_modes = false; + eac_a8_mul_search_rad = 0; + eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13); + uber_level = 0; + least_squares_passes = 1; + bc1_hints = false; + estimate_partition = true; + only_use_la_on_transparent_blocks = true; + break; + } + case cPackUASTCLevelFaster: + { + mode_mask = (1 << 0) | (1 << 4) | (1 << 6) | (1 << 8) | + (1 << 9) | (1 << 11) | (1 << 12) | + (1 << 15) | (1 << 17); + always_try_alpha_modes = false; + eac_a8_mul_search_rad = 0; + eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13); + uber_level = 0; + least_squares_passes = 1; + estimate_partition = true; + break; + } + case cPackUASTCLevelDefault: + { + mode_mask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 6) | (1 << 8) | + (1 << 9) | (1 << 10) | (1 << 11) | (1 << 12) | (1 << 13) | + (1 << 15) | (1 << 16) | (1 << 17); + always_try_alpha_modes = false; + eac_a8_mul_search_rad = 1; + eac_a8_table_mask = (1 << 0) | (1 << 2) | (1 << 6) | (1 << 7) | (1 << 8) | (1 << 10) | (1 << 11) | (1 << 13); + uber_level = 1; + least_squares_passes = 1; + estimate_partition = true; + break; + } + case cPackUASTCLevelSlower: + { + always_try_alpha_modes = false; + eac_a8_mul_search_rad = 2; + uber_level = 3; + estimate_partition = true; + break; + } + case cPackUASTCLevelVerySlow: + { + break; + } + } + +#if BASISU_SUPPORT_FORCE_MODE + static int force_mode = -1; + force_mode = (force_mode + 1) % TOTAL_UASTC_MODES; + mode_mask = UINT32_MAX; + always_try_alpha_modes = true; + only_use_la_on_transparent_blocks = false; +#endif + + // HACK HACK + //mode_mask &= ~(1 << 18); + //mode_mask = (1 << 18)| (1 << 10); + + uint32_t total_results = 0; + + if (only_use_la_on_transparent_blocks) + { + if ((is_la) && (!has_alpha)) + is_la = false; + } + + const bool try_alpha_modes = has_alpha || always_try_alpha_modes; + + bc7enc_compress_block_params comp_params; + memset(&comp_params, 0, sizeof(comp_params)); + comp_params.m_max_partitions_mode1 = 64; + comp_params.m_least_squares_passes = least_squares_passes; + comp_params.m_weights[0] = 1; + comp_params.m_weights[1] = 1; + comp_params.m_weights[2] = 1; + comp_params.m_weights[3] = 1; + comp_params.m_uber_level = uber_level; + + if (is_la) + { + if (mode_mask & (1U << 15)) + astc_mode15(block, results, total_results, comp_params); + + if (mode_mask & (1U << 16)) + astc_mode9_or_16(16, block, results, total_results, comp_params, estimate_partition ? 
4 : 0); + + if (mode_mask & (1U << 17)) + astc_mode11_or_17(17, block, results, total_results, comp_params); + } + + if (!has_alpha) + { + if (mode_mask & (1U << 0)) + astc_mode0_or_18(0, block, results, total_results, comp_params); + + if (mode_mask & (1U << 1)) + astc_mode1(block, results, total_results, comp_params); + + if (mode_mask & (1U << 2)) + astc_mode2(block, results, total_results, comp_params, estimate_partition); + + if (mode_mask & (1U << 3)) + astc_mode3(block, results, total_results, comp_params, estimate_partition); + + if (mode_mask & (1U << 4)) + astc_mode4(block, results, total_results, comp_params, estimate_partition); + + if (mode_mask & (1U << 5)) + astc_mode5(block, results, total_results, comp_params); + + if (mode_mask & (1U << 6)) + astc_mode6(block, results, total_results, comp_params); + + if (mode_mask & (1U << 7)) + astc_mode7(block, results, total_results, comp_params, estimate_partition); + + if (mode_mask & (1U << 18)) + astc_mode0_or_18(18, block, results, total_results, comp_params); + } + + if (try_alpha_modes) + { + if (mode_mask & (1U << 9)) + astc_mode9_or_16(9, block, results, total_results, comp_params, estimate_partition ? 4 : 0); + + if (mode_mask & (1U << 10)) + astc_mode10(block, results, total_results, comp_params); + + if (mode_mask & (1U << 11)) + astc_mode11_or_17(11, block, results, total_results, comp_params); + + if (mode_mask & (1U << 12)) + astc_mode12(block, results, total_results, comp_params); + + if (mode_mask & (1U << 13)) + astc_mode13(block, results, total_results, comp_params); + + if (mode_mask & (1U << 14)) + astc_mode14(block, results, total_results, comp_params); + } + + assert(total_results); + + // Fix up the errors so we consistently have LA, RGB, or RGBA error. + for (uint32_t i = 0; i < total_results; i++) + { + uastc_encode_results& r = results[i]; + if (!is_la) + { + if (g_uastc_mode_is_la[r.m_uastc_mode]) + { + color_rgba unpacked_block[16]; + unpack_uastc(r.m_uastc_mode, r.m_common_pattern, r.m_solid_color.get_color32(), r.m_astc, (basist::color32 *)unpacked_block, false); + + uint64_t total_err = 0; + for (uint32_t j = 0; j < 16; j++) + total_err += color_distance(unpacked_block[j], ((const color_rgba*)block)[j], true); + + r.m_astc_err = total_err; + } + } + else + { + if (!g_uastc_mode_is_la[r.m_uastc_mode]) + { + color_rgba unpacked_block[16]; + unpack_uastc(r.m_uastc_mode, r.m_common_pattern, r.m_solid_color.get_color32(), r.m_astc, (basist::color32 *)unpacked_block, false); + + uint64_t total_err = 0; + for (uint32_t j = 0; j < 16; j++) + total_err += color_distance_la(unpacked_block[j], ((const color_rgba*)block)[j]); + + r.m_astc_err = total_err; + } + } + } + + unpacked_uastc_block unpacked_ublock; + memset(&unpacked_ublock, 0, sizeof(unpacked_ublock)); + + uint64_t total_overall_err[MAX_ENCODE_RESULTS]; + float uastc_err_f[MAX_ENCODE_RESULTS]; + double best_uastc_err_f = 1e+20f; + + int best_index = -1; + + if (total_results == 1) + { + best_index = 0; + } + else + { + const uint32_t bc7_err_weight = favor_bc7_error ? 100 : ((favor_uastc_error ? 0 : DEFAULT_BC7_ERROR_WEIGHT)); + const uint32_t uastc_err_weight = favor_bc7_error ? 0 : 100; + + // Find best overall results, balancing UASTC and UASTC->BC7 error. + // We purposely allow UASTC error to increase a little, if doing so lowers the BC7 error. 
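+			// Each candidate is scored below as bc7_err * bc7_err_weight / 100 + uastc_err * uastc_err_weight / 100.
+			// With the defaults (bc7_err_weight = DEFAULT_BC7_ERROR_WEIGHT = 50, uastc_err_weight = 100), a block
+			// with uastc_err = 1000 and bc7_err = 2000 scores 1000 + 1000 = 2000; the cPackUASTCFavorUASTCError /
+			// cPackUASTCFavorBC7Error flags zero out one side of the sum.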
+ for (uint32_t i = 0; i < total_results; i++) + { +#if BASISU_SUPPORT_FORCE_MODE + if (results[i].m_uastc_mode == force_mode) + { + best_index = i; + break; + } +#endif + + unpacked_ublock.m_mode = results[i].m_uastc_mode; + unpacked_ublock.m_astc = results[i].m_astc; + unpacked_ublock.m_common_pattern = results[i].m_common_pattern; + unpacked_ublock.m_solid_color = results[i].m_solid_color.get_color32(); + + color_rgba decoded_uastc_block[4][4]; + bool success = unpack_uastc(results[i].m_uastc_mode, results[i].m_common_pattern, results[i].m_solid_color.get_color32(), results[i].m_astc, (basist::color32 *)&decoded_uastc_block[0][0], false); + (void)success; + VALIDATE(success); + + uint64_t total_uastc_rgb_err, total_uastc_rgba_err, total_uastc_la_err; + compute_block_error(block, decoded_uastc_block, total_uastc_rgb_err, total_uastc_rgba_err, total_uastc_la_err); + + // Validate the computed error, or we're go mad if it's inaccurate. + if (results[i].m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + VALIDATE(total_uastc_rgba_err == 0); + } + else if (is_la) + { + VALIDATE(total_uastc_la_err == results[i].m_astc_err); + } + else if (g_uastc_mode_has_alpha[results[i].m_uastc_mode]) + { + VALIDATE(total_uastc_rgba_err == results[i].m_astc_err); + } + else + { + VALIDATE(total_uastc_rgb_err == results[i].m_astc_err); + } + + // Transcode to BC7 + bc7_optimization_results bc7_results; + transcode_uastc_to_bc7(unpacked_ublock, bc7_results); + + bc7_block bc7_data; + encode_bc7_block(&bc7_data, &bc7_results); + + color_rgba decoded_bc7_block[4][4]; + unpack_block(texture_format::cBC7, &bc7_data, &decoded_bc7_block[0][0]); + + // Compute BC7 error + uint64_t total_bc7_la_err, total_bc7_rgb_err, total_bc7_rgba_err; + compute_block_error(block, decoded_bc7_block, total_bc7_rgb_err, total_bc7_rgba_err, total_bc7_la_err); + + if (results[i].m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + VALIDATE(total_bc7_rgba_err == 0); + + best_index = i; + break; + } + + uint64_t total_uastc_err = 0, total_bc7_err = 0; + if (is_la) + { + total_bc7_err = total_bc7_la_err; + total_uastc_err = total_uastc_la_err; + } + else if (has_alpha) + { + total_bc7_err = total_bc7_rgba_err; + total_uastc_err = total_uastc_rgba_err; + } + else + { + total_bc7_err = total_bc7_rgb_err; + total_uastc_err = total_uastc_rgb_err; + } + + total_overall_err[i] = ((total_bc7_err * bc7_err_weight) / 100) + ((total_uastc_err * uastc_err_weight) / 100); + if (!total_overall_err[i]) + { + best_index = i; + break; + } + + uastc_err_f[i] = sqrtf((float)total_uastc_err); + + if (uastc_err_f[i] < best_uastc_err_f) + { + best_uastc_err_f = uastc_err_f[i]; + } + + } // total_results + + if (best_index < 0) + { + uint64_t best_err = UINT64_MAX; + + if ((best_uastc_err_f == 0.0f) || (favor_bc7_error)) + { + for (uint32_t i = 0; i < total_results; i++) + { + // TODO: This is a quick hack to favor modes 0 or 10 for better RDO compression. + const float err_weight = (flags & cPackUASTCFavorSimplerModes) ? get_uastc_mode_weight(results[i].m_uastc_mode) : 1.0f; + + const uint64_t w = (uint64_t)(total_overall_err[i] * err_weight); + if (w < best_err) + { + best_err = w; + best_index = i; + if (!best_err) + break; + } + } // i + } + else + { + // Scan the UASTC results, and consider all results within a window that has the best UASTC+BC7 error. 
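+					// A result qualifies if sqrt(its UASTC error) is within UASTC_ERROR_THRESH (1.3x) of the best
+					// value found above; among the qualifiers the lowest weighted UASTC+BC7 score wins, optionally
+					// scaled by get_uastc_mode_weight() when cPackUASTCFavorSimplerModes is set.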
+ for (uint32_t i = 0; i < total_results; i++) + { + double err_delta = uastc_err_f[i] / best_uastc_err_f; + + if (err_delta <= UASTC_ERROR_THRESH) + { + // TODO: This is a quick hack to favor modes 0 or 10 for better RDO compression. + const float err_weight = (flags & cPackUASTCFavorSimplerModes) ? get_uastc_mode_weight(results[i].m_uastc_mode) : 1.0f; + + const uint64_t w = (uint64_t)(total_overall_err[i] * err_weight); + if (w < best_err) + { + best_err = w; + best_index = i; + if (!best_err) + break; + } + } + } // i + } + } + } + + const uastc_encode_results& best_results = results[best_index]; + const uint32_t best_mode = best_results.m_uastc_mode; + const astc_block_desc& best_astc_results = best_results.m_astc; + + color_rgba decoded_uastc_block[4][4]; + bool success = unpack_uastc(best_mode, best_results.m_common_pattern, best_results.m_solid_color.get_color32(), best_astc_results, (basist::color32 *)&decoded_uastc_block[0][0], false); + (void)success; + VALIDATE(success); + +#if BASISU_VALIDATE_UASTC_ENC + // Make sure that the UASTC block unpacks to the same exact pixels as the ASTC block does, using two different decoders. + { + // Round trip to packed UASTC and back, then decode to pixels. + etc_block etc1_blk; + memset(&etc1_blk, 0, sizeof(etc1_blk)); + eac_a8_block etc_eac_a8_blk; + memset(&etc_eac_a8_blk, 0, sizeof(etc_eac_a8_blk)); + etc_eac_a8_blk.m_multiplier = 1; + + basist::uastc_block temp_block; + pack_uastc(temp_block, best_results, etc1_blk, 0, etc_eac_a8_blk, false, false); + + basist::color32 temp_block_unpacked[4][4]; + success = basist::unpack_uastc(temp_block, (basist::color32 *)temp_block_unpacked, false); + VALIDATE(success); + + // Now round trip to packed ASTC and back, then decode to pixels. + uint32_t astc_data[4]; + + if (best_results.m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR) + pack_astc_solid_block(astc_data, (color32 &)best_results.m_solid_color); + else + { + success = pack_astc_block(astc_data, &best_astc_results, best_results.m_uastc_mode); + VALIDATE(success); + } + + color_rgba decoded_astc_block[4][4]; + success = basisu_astc::astc::decompress((uint8_t*)decoded_astc_block, (uint8_t*)&astc_data, false, 4, 4); + VALIDATE(success); + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + VALIDATE(decoded_astc_block[y][x] == decoded_uastc_block[y][x]); + + VALIDATE(temp_block_unpacked[y][x].c[0] == decoded_uastc_block[y][x].r); + VALIDATE(temp_block_unpacked[y][x].c[1] == decoded_uastc_block[y][x].g); + VALIDATE(temp_block_unpacked[y][x].c[2] == decoded_uastc_block[y][x].b); + VALIDATE(temp_block_unpacked[y][x].c[3] == decoded_uastc_block[y][x].a); + } + } + } +#endif + + // Compute BC1 hints + bool bc1_hint0 = false, bc1_hint1 = false; + if (bc1_hints) + compute_bc1_hints(bc1_hint0, bc1_hint1, best_results, block, decoded_uastc_block); + + eac_a8_block eac_a8_blk; + if ((g_uastc_mode_has_alpha[best_mode]) && (best_mode != UASTC_MODE_INDEX_SOLID_COLOR)) + { + // Compute ETC2 hints + uint8_t decoded_uastc_block_alpha[16]; + for (uint32_t i = 0; i < 16; i++) + decoded_uastc_block_alpha[i] = decoded_uastc_block[i >> 2][i & 3].a; + + uastc_pack_eac_a8_results eac8_a8_results; + memset(&eac8_a8_results, 0, sizeof(eac8_a8_results)); + uastc_pack_eac_a8(eac8_a8_results, decoded_uastc_block_alpha, 16, 0, eac_a8_mul_search_rad, eac_a8_table_mask); + + // All we care about for hinting is the table and multiplier. 
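+			// Only m_table and m_multiplier from the EAC A8 search are carried into pack_uastc(); the base is
+			// not part of the hint, which is also why the base search radius passed to uastc_pack_eac_a8()
+			// above is 0.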
+ eac_a8_blk.m_table = eac8_a8_results.m_table; + eac_a8_blk.m_multiplier = eac8_a8_results.m_multiplier; + } + else + { + memset(&eac_a8_blk, 0, sizeof(eac_a8_blk)); + } + + // Compute ETC1 hints + etc_block etc1_blk; + uint32_t etc1_bias = 0; + compute_etc1_hints(etc1_blk, etc1_bias, best_results, block, decoded_uastc_block, level, flags); + + // Finally, pack the UASTC block with its hints and we're done. + pack_uastc(output_block, best_results, etc1_blk, etc1_bias, eac_a8_blk, bc1_hint0, bc1_hint1); + +// printf(" Packed: "); +// for (int i = 0; i < 16; i++) +// printf("%X ", output_block.m_bytes[i]); +// printf("\n"); + } + + static bool uastc_recompute_hints(basist::uastc_block* pBlock, const color_rgba* pBlock_pixels, uint32_t flags, const unpacked_uastc_block *pUnpacked_blk) + { + unpacked_uastc_block unpacked_blk; + + if (pUnpacked_blk) + unpacked_blk = *pUnpacked_blk; + else + { + if (!unpack_uastc(*pBlock, unpacked_blk, false, true)) + return false; + } + color_rgba decoded_uastc_block[4][4]; + if (!unpack_uastc(unpacked_blk, (basist::color32 *)decoded_uastc_block, false)) + return false; + uastc_encode_results results; + results.m_uastc_mode = unpacked_blk.m_mode; + results.m_common_pattern = unpacked_blk.m_common_pattern; + results.m_astc = unpacked_blk.m_astc; + results.m_solid_color = unpacked_blk.m_solid_color; + results.m_astc_err = 0; + bool bc1_hints = true; + uint32_t eac_a8_mul_search_rad = 3; + uint32_t eac_a8_table_mask = UINT32_MAX; + const uint32_t level = flags & cPackUASTCLevelMask; + switch (level) + { + case cPackUASTCLevelFastest: + { + eac_a8_mul_search_rad = 0; + eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13); + bc1_hints = false; + break; + } + case cPackUASTCLevelFaster: + { + eac_a8_mul_search_rad = 0; + eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13); + break; + } + case cPackUASTCLevelDefault: + { + eac_a8_mul_search_rad = 1; + eac_a8_table_mask = (1 << 0) | (1 << 2) | (1 << 6) | (1 << 7) | (1 << 8) | (1 << 10) | (1 << 11) | (1 << 13); + break; + } + case cPackUASTCLevelSlower: + { + eac_a8_mul_search_rad = 2; + break; + } + case cPackUASTCLevelVerySlow: + { + break; + } + } + bool bc1_hint0 = false, bc1_hint1 = false; + if (bc1_hints) + compute_bc1_hints(bc1_hint0, bc1_hint1, results, (color_rgba (*)[4])pBlock_pixels, decoded_uastc_block); + const uint32_t best_mode = unpacked_blk.m_mode; + eac_a8_block eac_a8_blk; + if ((g_uastc_mode_has_alpha[best_mode]) && (best_mode != UASTC_MODE_INDEX_SOLID_COLOR)) + { + uint8_t decoded_uastc_block_alpha[16]; + for (uint32_t i = 0; i < 16; i++) + decoded_uastc_block_alpha[i] = decoded_uastc_block[i >> 2][i & 3].a; + uastc_pack_eac_a8_results eac8_a8_results; + memset(&eac8_a8_results, 0, sizeof(eac8_a8_results)); + uastc_pack_eac_a8(eac8_a8_results, decoded_uastc_block_alpha, 16, 0, eac_a8_mul_search_rad, eac_a8_table_mask); + eac_a8_blk.m_table = eac8_a8_results.m_table; + eac_a8_blk.m_multiplier = eac8_a8_results.m_multiplier; + } + else + { + memset(&eac_a8_blk, 0, sizeof(eac_a8_blk)); + } + etc_block etc1_blk; + uint32_t etc1_bias = 0; + compute_etc1_hints(etc1_blk, etc1_bias, results, (color_rgba (*)[4])pBlock_pixels, decoded_uastc_block, level, flags); + pack_uastc(*pBlock, results, etc1_blk, etc1_bias, eac_a8_blk, bc1_hint0, bc1_hint1); + return true; + } + + static const uint8_t g_uastc_mode_selector_bits[TOTAL_UASTC_MODES][2] = + { + { 65, 63 }, { 69, 31 }, { 73, 46 }, { 89, 29 }, + { 89, 30 }, { 68, 47 }, { 66, 62 }, { 89, 30 }, + { 0, 0 }, { 97, 30 }, { 65, 63 }, { 66, 
62 }, + { 81, 47 }, { 94, 30 }, { 92, 31 }, { 62, 63 }, + { 98, 30 }, { 61, 62 }, { 49, 79 } + }; + + static inline uint32_t set_block_bits(uint8_t* pBytes, uint64_t val, uint32_t num_bits, uint32_t cur_ofs) + { + assert(num_bits <= 64); + assert((num_bits == 64) || (val < (1ULL << num_bits))); + uint64_t mask = (num_bits == 64) ? UINT64_MAX : ((1ULL << num_bits) - 1); + while (num_bits) + { + const uint32_t n = basisu::minimum<uint32_t>(8U - (cur_ofs & 7U), num_bits); + pBytes[cur_ofs >> 3] &= ~static_cast<uint8_t>(mask << (cur_ofs & 7U)); + pBytes[cur_ofs >> 3] |= static_cast<uint8_t>(val << (cur_ofs & 7U)); + val >>= n; + mask >>= n; + num_bits -= n; + cur_ofs += n; + } + return cur_ofs; + } + + static const uint8_t g_tdefl_small_dist_extra[512] = + { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7 + }; + + static const uint8_t g_tdefl_large_dist_extra[128] = + { + 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 + }; + + static inline uint32_t compute_match_cost_estimate(uint32_t dist) + { + uint32_t len_cost = 7; + uint32_t dist_cost = 5; + if (dist < 512) + dist_cost += g_tdefl_small_dist_extra[dist & 511]; + else + { + dist_cost += g_tdefl_large_dist_extra[basisu::minimum<uint32_t>(dist, 32767) >> 8]; + while (dist >= 32768) + { + dist_cost++; + dist >>= 1; + } + } + return len_cost + dist_cost; + } + + struct selector_bitsequence + { + uint64_t m_sel; + uint32_t m_ofs; + selector_bitsequence() { } + selector_bitsequence(uint32_t bit_ofs, uint64_t sel) : m_sel(sel), m_ofs(bit_ofs) { } + bool operator== (const selector_bitsequence& other) const + { + return 
(m_ofs == other.m_ofs) && (m_sel == other.m_sel); + } + + bool operator< (const selector_bitsequence& other) const + { + if (m_ofs < other.m_ofs) + return true; + else if (m_ofs == other.m_ofs) + return m_sel < other.m_sel; + + return false; + } + }; + + struct selector_bitsequence_hash + { + std::size_t operator()(selector_bitsequence const& s) const noexcept + { + return static_cast<std::size_t>(hash_hsieh((uint8_t *)&s, sizeof(s)) ^ s.m_sel); + } + }; + + class tracked_stat + { + public: + tracked_stat() { clear(); } + + void clear() { m_num = 0; m_total = 0; m_total2 = 0; } + + void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; } + + tracked_stat& operator += (uint32_t val) { update(val); return *this; } + + uint32_t get_number_of_values() { return m_num; } + uint64_t get_total() const { return m_total; } + uint64_t get_total2() const { return m_total2; } + + float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }; + float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } + float get_variance() const { float s = get_std_dev(); return s * s; } + + private: + uint32_t m_num; + uint64_t m_total; + uint64_t m_total2; + }; + + static bool uastc_rdo_blocks(uint32_t first_index, uint32_t last_index, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, + uint32_t &total_skipped, uint32_t &total_refined, uint32_t &total_modified, uint32_t &total_smooth) + { + debug_printf("uastc_rdo_blocks: Processing blocks %u to %u\n", first_index, last_index); + + const int total_blocks_to_check = basisu::maximum<uint32_t>(1U, params.m_lz_dict_size / sizeof(basist::uastc_block)); + const bool perceptual = false; + + std::unordered_map<selector_bitsequence, uint32_t, selector_bitsequence_hash> selector_history; + + for (uint32_t block_index = first_index; block_index < last_index; block_index++) + { + const basist::uastc_block& blk = pBlocks[block_index]; + const color_rgba* pPixels = &pBlock_pixels[16 * block_index]; + + unpacked_uastc_block unpacked_blk; + if (!unpack_uastc(blk, unpacked_blk, false, true)) + return false; + + const uint32_t block_mode = unpacked_blk.m_mode; + if (block_mode == UASTC_MODE_INDEX_SOLID_COLOR) + continue; + + tracked_stat r_stats, g_stats, b_stats, a_stats; + + for (uint32_t i = 0; i < 16; i++) + { + r_stats.update(pPixels[i].r); + g_stats.update(pPixels[i].g); + b_stats.update(pPixels[i].b); + a_stats.update(pPixels[i].a); + } + + const float max_std_dev = basisu::maximum<float>(basisu::maximum<float>(basisu::maximum(r_stats.get_std_dev(), g_stats.get_std_dev()), b_stats.get_std_dev()), a_stats.get_std_dev()); + + float yl = clamp<float>(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f); + yl = yl * yl; + const float smooth_block_error_scale = lerp<float>(params.m_smooth_block_max_error_scale, 1.0f, yl); + if (smooth_block_error_scale > 1.0f) + total_smooth++; + + color_rgba decoded_uastc_block[4][4]; + if (!unpack_uastc(unpacked_blk, (basist::color32*)decoded_uastc_block, false)) + return false; + + uint64_t uastc_err = 0; + for (uint32_t i = 0; i < 16; i++) + uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_uastc_block)[i], true); + + // Transcode to BC7 + bc7_optimization_results b7_results; + if (!transcode_uastc_to_bc7(unpacked_blk, b7_results)) + return false; + + basist::bc7_block b7_block; + basist::encode_bc7_block(&b7_block, &b7_results); + + color_rgba 
decoded_b7_blk[4][4]; + unpack_block(texture_format::cBC7, &b7_block, &decoded_b7_blk[0][0]); + + uint64_t bc7_err = 0; + for (uint32_t i = 0; i < 16; i++) + bc7_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_b7_blk)[i], true); + + uint64_t cur_err = (uastc_err + bc7_err) / 2; + + // Divide by 16*4 to compute RMS error + const float cur_ms_err = (float)cur_err * (1.0f / 64.0f); + const float cur_rms_err = sqrt(cur_ms_err); + + const uint32_t first_sel_bit = g_uastc_mode_selector_bits[block_mode][0]; + const uint32_t total_sel_bits = g_uastc_mode_selector_bits[block_mode][1]; + assert(first_sel_bit + total_sel_bits <= 128); + assert(total_sel_bits > 0); + + uint32_t cur_bit_offset = first_sel_bit; + uint64_t cur_sel_bits = read_bits((const uint8_t*)&blk, cur_bit_offset, basisu::minimum(64U, total_sel_bits)); + + if (cur_rms_err >= params.m_skip_block_rms_thresh) + { + auto cur_search_res = selector_history.insert(std::make_pair(selector_bitsequence(first_sel_bit, cur_sel_bits), block_index)); + + // Block already has too much error, so don't mess with it. + if (!cur_search_res.second) + (*cur_search_res.first).second = block_index; + + total_skipped++; + continue; + } + + int cur_bits; + auto cur_find_res = selector_history.find(selector_bitsequence(first_sel_bit, cur_sel_bits)); + if (cur_find_res == selector_history.end()) + { + // Wasn't found - wildly estimate literal cost + //cur_bits = (total_sel_bits * 5) / 4; + cur_bits = (total_sel_bits * params.m_lz_literal_cost) / 100; + } + else + { + // Was found - wildly estimate match cost + uint32_t match_block_index = cur_find_res->second; + const int block_dist_in_bytes = (block_index - match_block_index) * 16; + cur_bits = compute_match_cost_estimate(block_dist_in_bytes); + } + + int first_block_to_check = basisu::maximum<int>(first_index, block_index - total_blocks_to_check); + int last_block_to_check = block_index - 1; + + basist::uastc_block best_block(blk); + uint32_t best_block_index = block_index; + + float best_t = cur_ms_err * smooth_block_error_scale + cur_bits * params.m_lambda; + + // Now scan through previous blocks, insert their selector bit patterns into the current block, and find + // selector bit patterns which don't increase the overall block error too much. + for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index) + { + const basist::uastc_block& prev_blk = pBlocks[prev_block_index]; + + uint32_t bit_offset = first_sel_bit; + uint64_t sel_bits = read_bits((const uint8_t*)&prev_blk, bit_offset, basisu::minimum(64U, total_sel_bits)); + + int match_block_index = prev_block_index; + auto res = selector_history.find(selector_bitsequence(first_sel_bit, sel_bits)); + if (res != selector_history.end()) + match_block_index = res->second; + // Have we already checked this bit pattern? If so then skip this block. 
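+				// selector_history maps a selector bit pattern to the most recent block that used it, so a
+				// pattern already tried via a closer block is skipped here, and when a pattern is evaluated its
+				// LZ match distance is estimated against that nearest occurrence.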
+ if (match_block_index > prev_block_index) + continue; + + unpacked_uastc_block unpacked_prev_blk; + if (!unpack_uastc(prev_blk, unpacked_prev_blk, false, true)) + return false; + + basist::uastc_block trial_blk(blk); + + set_block_bits((uint8_t*)&trial_blk, sel_bits, basisu::minimum(64U, total_sel_bits), first_sel_bit); + + if (total_sel_bits > 64) + { + sel_bits = read_bits((const uint8_t*)&prev_blk, bit_offset, total_sel_bits - 64U); + + set_block_bits((uint8_t*)&trial_blk, sel_bits, total_sel_bits - 64U, first_sel_bit + basisu::minimum(64U, total_sel_bits)); + } + + unpacked_uastc_block unpacked_trial_blk; + if (!unpack_uastc(trial_blk, unpacked_trial_blk, false, true)) + continue; + + color_rgba decoded_trial_uastc_block[4][4]; + if (!unpack_uastc(unpacked_trial_blk, (basist::color32*)decoded_trial_uastc_block, false)) + continue; + + uint64_t trial_uastc_err = 0; + for (uint32_t i = 0; i < 16; i++) + trial_uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_trial_uastc_block)[i], true); + + // Transcode trial to BC7, compute error + bc7_optimization_results trial_b7_results; + if (!transcode_uastc_to_bc7(unpacked_trial_blk, trial_b7_results)) + return false; + + basist::bc7_block trial_b7_block; + basist::encode_bc7_block(&trial_b7_block, &trial_b7_results); + + color_rgba decoded_trial_b7_blk[4][4]; + unpack_block(texture_format::cBC7, &trial_b7_block, &decoded_trial_b7_blk[0][0]); + + uint64_t trial_bc7_err = 0; + for (uint32_t i = 0; i < 16; i++) + trial_bc7_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_trial_b7_blk)[i], true); + + uint64_t trial_err = (trial_uastc_err + trial_bc7_err) / 2; + + const float trial_ms_err = (float)trial_err * (1.0f / 64.0f); + const float trial_rms_err = sqrtf(trial_ms_err); + + if (trial_rms_err > cur_rms_err * params.m_max_allowed_rms_increase_ratio) + continue; + + const int block_dist_in_bytes = (block_index - match_block_index) * 16; + const int match_bits = compute_match_cost_estimate(block_dist_in_bytes); + + float t = trial_ms_err * smooth_block_error_scale + match_bits * params.m_lambda; + if (t < best_t) + { + best_t = t; + best_block_index = prev_block_index; + + best_block = trial_blk; + } + + } // prev_block_index + + if (best_block_index != block_index) + { + total_modified++; + + unpacked_uastc_block unpacked_best_blk; + if (!unpack_uastc(best_block, unpacked_best_blk, false, false)) + return false; + + if ((params.m_endpoint_refinement) && (block_mode == 0)) + { + // Attempt to refine mode 0 block's endpoints, using the new selectors. This doesn't help much, but it does help. + // TODO: We could do this with the other modes too. + color_rgba decoded_best_uastc_block[4][4]; + if (!unpack_uastc(unpacked_best_blk, (basist::color32*)decoded_best_uastc_block, false)) + return false; + + // Compute the block's current error (with the modified selectors). 
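+					// This baseline error (with the borrowed selectors) is what the refined endpoints below must
+					// beat. astc_mode0_or_18() is re-run with the modified selectors passed in (as the weights
+					// argument), so effectively only the endpoints can move; the asserts afterwards verify the
+					// selectors were left untouched, since changing them would invalidate the LZ matches found above.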
+ uint64_t best_uastc_err = 0; + for (uint32_t i = 0; i < 16; i++) + best_uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_best_uastc_block)[i], true); + + bc7enc_compress_block_params comp_params; + memset(&comp_params, 0, sizeof(comp_params)); + comp_params.m_max_partitions_mode1 = 64; + comp_params.m_least_squares_passes = 1; + comp_params.m_weights[0] = 1; + comp_params.m_weights[1] = 1; + comp_params.m_weights[2] = 1; + comp_params.m_weights[3] = 1; + comp_params.m_uber_level = 0; + + uastc_encode_results results; + uint32_t total_results = 0; + astc_mode0_or_18(0, (color_rgba(*)[4])pPixels, &results, total_results, comp_params, unpacked_best_blk.m_astc.m_weights); + assert(total_results == 1); + + // See if the overall error has actually gone done. + + color_rgba decoded_trial_uastc_block[4][4]; + bool success = unpack_uastc(results.m_uastc_mode, results.m_common_pattern, results.m_solid_color.get_color32(), results.m_astc, (basist::color32*) & decoded_trial_uastc_block[0][0], false); + assert(success); + + BASISU_NOTE_UNUSED(success); + + uint64_t trial_uastc_err = 0; + for (uint32_t i = 0; i < 16; i++) + trial_uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_trial_uastc_block)[i], true); + + if (trial_uastc_err < best_uastc_err) + { + // The error went down, so accept the new endpoints. + + // Ensure the selectors haven't changed, otherwise we'll invalidate the LZ matches. + for (uint32_t i = 0; i < 16; i++) + assert(unpacked_best_blk.m_astc.m_weights[i] == results.m_astc.m_weights[i]); + + unpacked_best_blk.m_astc = results.m_astc; + + total_refined++; + } + } // if ((params.m_endpoint_refinement) && (block_mode == 0)) + + // The selectors have changed, so go recompute the block hints. + if (!uastc_recompute_hints(&best_block, pPixels, flags, &unpacked_best_blk)) + return false; + + // Write the modified block + pBlocks[block_index] = best_block; + + } // if (best_block_index != block_index) + + { + uint32_t bit_offset = first_sel_bit; + uint64_t sel_bits = read_bits((const uint8_t*)&best_block, bit_offset, basisu::minimum(64U, total_sel_bits)); + + auto res = selector_history.insert(std::make_pair(selector_bitsequence(first_sel_bit, sel_bits), block_index)); + if (!res.second) + (*res.first).second = block_index; + } + + } // block_index + + return true; + } + + // This function implements a basic form of rate distortion optimization (RDO) for UASTC. + // It only changes selectors and then updates the hints. It uses very approximate LZ bitprice estimation. + // There's A LOT that can be done better in here, but it's a start. + // One nice advantage of the method used here is that it works for any input, no matter which or how many modes it uses. + bool uastc_rdo(uint32_t num_blocks, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, job_pool* pJob_pool, uint32_t total_jobs) + { + assert(params.m_max_allowed_rms_increase_ratio > 1.0f); + assert(params.m_lz_dict_size > 0); + assert(params.m_lambda > 0.0f); + + uint32_t total_skipped = 0, total_modified = 0, total_refined = 0, total_smooth = 0; + + uint32_t blocks_per_job = total_jobs ? 
(num_blocks / total_jobs) : 0; + + std::mutex stat_mutex; + + bool status = false; + + if ((!pJob_pool) || (total_jobs <= 1) || (blocks_per_job <= 8)) + { + status = uastc_rdo_blocks(0, num_blocks, pBlocks, pBlock_pixels, params, flags, total_skipped, total_refined, total_modified, total_smooth); + } + else + { + bool all_succeeded = true; + + for (uint32_t block_index_iter = 0; block_index_iter < num_blocks; block_index_iter += blocks_per_job) + { + const uint32_t first_index = block_index_iter; + const uint32_t last_index = minimum<uint32_t>(num_blocks, block_index_iter + blocks_per_job); + +#ifndef __EMSCRIPTEN__ + pJob_pool->add_job([first_index, last_index, pBlocks, pBlock_pixels, ¶ms, flags, &total_skipped, &total_modified, &total_refined, &total_smooth, &all_succeeded, &stat_mutex] { +#endif + + uint32_t job_skipped = 0, job_modified = 0, job_refined = 0, job_smooth = 0; + + bool status = uastc_rdo_blocks(first_index, last_index, pBlocks, pBlock_pixels, params, flags, job_skipped, job_refined, job_modified, job_smooth); + + { + std::lock_guard<std::mutex> lck(stat_mutex); + + all_succeeded = all_succeeded && status; + total_skipped += job_skipped; + total_modified += job_modified; + total_refined += job_refined; + total_smooth += job_smooth; + } + +#ifndef __EMSCRIPTEN__ + } + ); +#endif + + } // block_index_iter + +#ifndef __EMSCRIPTEN__ + pJob_pool->wait_for_all(); +#endif + + status = all_succeeded; + } + + debug_printf("uastc_rdo: Total modified: %3.2f%%, total skipped: %3.2f%%, total refined: %3.2f%%, total smooth: %3.2f%%\n", total_modified * 100.0f / num_blocks, total_skipped * 100.0f / num_blocks, total_refined * 100.0f / num_blocks, total_smooth * 100.0f / num_blocks); + + return status; + } +} // namespace basisu + + + + + diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h new file mode 100644 index 0000000000..ba39a558b3 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h @@ -0,0 +1,140 @@ +// basisu_uastc_enc.h +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include "basisu_etc.h" + +#include "../transcoder/basisu_transcoder_uastc.h" + +namespace basisu +{ + const uint32_t TOTAL_PACK_UASTC_LEVELS = 5; + + enum + { + // Fastest is the lowest quality, although it's stil substantially higher quality vs. BC1/ETC1. It supports 5 modes. + // The output may be somewhat blocky because this setting doesn't support 2/3-subset UASTC modes, but it should be less blocky vs. BC1/ETC1. + // This setting doesn't write BC1 hints, so BC1 transcoding will be slower. + // Transcoded ETC1 quality will be lower because it only considers 2 hints out of 32. + // Avg. 43.45 dB + cPackUASTCLevelFastest = 0, + + // Faster is ~3x slower than fastest. It supports 9 modes. + // Avg. 46.49 dB + cPackUASTCLevelFaster = 1, + + // Default is ~5.5x slower than fastest. It supports 14 modes. + // Avg. 
47.47 dB + cPackUASTCLevelDefault = 2, + + // Slower is ~14.5x slower than fastest. It supports all 18 modes. + // Avg. 48.01 dB + cPackUASTCLevelSlower = 3, + + // VerySlow is ~200x slower than fastest. + // The best quality the codec is capable of, but you'll need to be patient or have a lot of cores. + // Avg. 48.24 dB + cPackUASTCLevelVerySlow = 4, + + cPackUASTCLevelMask = 0xF, + + // By default the encoder tries to strike a balance between UASTC and transcoded BC7 quality. + // These flags allow you to favor only optimizing for lowest UASTC error, or lowest BC7 error. + cPackUASTCFavorUASTCError = 8, + cPackUASTCFavorBC7Error = 16, + + cPackUASTCETC1FasterHints = 64, + cPackUASTCETC1FastestHints = 128, + cPackUASTCETC1DisableFlipAndIndividual = 256, + + // Favor UASTC modes 0 and 10 more than the others (this is experimental, it's useful for RDO compression) + cPackUASTCFavorSimplerModes = 512, + }; + + // pRGBAPixels: Pointer to source 4x4 block of RGBA pixels (R first in memory). + // block: Reference to destination UASTC block. + // level: Controls compression speed vs. performance tradeoff. + void encode_uastc(const uint8_t* pRGBAPixels, basist::uastc_block& output_block, uint32_t flags = cPackUASTCLevelDefault); + + struct uastc_encode_results + { + uint32_t m_uastc_mode; + uint32_t m_common_pattern; + basist::astc_block_desc m_astc; + color_rgba m_solid_color; + uint64_t m_astc_err; + }; + + void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1); + + const uint32_t UASCT_RDO_DEFAULT_LZ_DICT_SIZE = 4096; + + const float UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO = 10.0f; + const float UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH = 8.0f; + + // The RDO encoder computes a smoothness factor, from [0,1], for each block. To do this it computes each block's maximum component variance, then it divides this by this factor and clamps the result. + // Larger values will result in more blocks being protected from too much distortion. + const float UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV = 18.0f; + + // The RDO encoder can artifically boost the error of smooth blocks, in order to suppress distortions on smooth areas of the texture. + // The encoder will use this value as the maximum error scale to use on smooth blocks. The larger this value, the better smooth bocks will look. Set to 1.0 to disable this completely. + const float UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE = 10.0f; + + struct uastc_rdo_params + { + uastc_rdo_params() + { + clear(); + } + + void clear() + { + m_lz_dict_size = UASCT_RDO_DEFAULT_LZ_DICT_SIZE; + m_lambda = 0.5f; + m_max_allowed_rms_increase_ratio = UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO; + m_skip_block_rms_thresh = UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH; + m_endpoint_refinement = true; + m_lz_literal_cost = 100; + + m_max_smooth_block_std_dev = UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV; + m_smooth_block_max_error_scale = UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE; + } + + // m_lz_dict_size: Size of LZ dictionary to simulate in bytes. The larger this value, the slower the encoder but the higher the quality per LZ compressed bit. + uint32_t m_lz_dict_size; + + // m_lambda: The post-processor tries to reduce distortion+rate*lambda (rate is approximate LZ bits and distortion is scaled MS error). 
+ // Larger values push the postprocessor towards optimizing more for lower rate, and smaller values more for distortion. 0=minimal distortion. + float m_lambda; + + // m_max_allowed_rms_increase_ratio: How much the RMS error of a block is allowed to increase before a trial is rejected. 1.0=no increase allowed, 1.05=5% increase allowed, etc. + float m_max_allowed_rms_increase_ratio; + + // m_skip_block_rms_thresh: Blocks with this much RMS error or more are completely skipped by the RDO encoder. + float m_skip_block_rms_thresh; + + // m_endpoint_refinement: If true, the post-process will attempt to refine the endpoints of blocks with modified selectors. + bool m_endpoint_refinement; + + float m_max_smooth_block_std_dev; + float m_smooth_block_max_error_scale; + + uint32_t m_lz_literal_cost; + }; + + // num_blocks, pBlocks: Number of blocks and pointer to UASTC blocks to process. + // pBlock_pixels: Pointer to an array of 4x4 blocks containing the original texture pixels. This is NOT a raster image, but a pointer to individual 4x4 blocks. + // flags: Pass in the same flags used to encode the UASTC blocks. The flags are used to reencode the transcode hints in the same way. + bool uastc_rdo(uint32_t num_blocks, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params ¶ms, uint32_t flags = cPackUASTCLevelDefault, job_pool* pJob_pool = nullptr, uint32_t total_jobs = 0); +} // namespace basisu diff --git a/thirdparty/basis_universal/encoder/cppspmd_flow.h b/thirdparty/basis_universal/encoder/cppspmd_flow.h new file mode 100644 index 0000000000..f6930476aa --- /dev/null +++ b/thirdparty/basis_universal/encoder/cppspmd_flow.h @@ -0,0 +1,590 @@ +// Do not include this header directly. +// Control flow functionality in common between all the headers. +// +// Copyright 2020-2021 Binomial LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef _DEBUG +CPPSPMD_FORCE_INLINE void spmd_kernel::check_masks() +{ + assert(!any(andnot(m_kernel_exec, m_exec))); +} +#endif + +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_break() +{ +#ifdef _DEBUG + assert(m_in_loop); +#endif + + m_exec = exec_mask::all_off(); +} + +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_continue() +{ +#ifdef _DEBUG + assert(m_in_loop); +#endif + + // Kill any active lanes, and remember which lanes were active so we can re-enable them at the end of the loop body. 
+ m_continue_mask = m_continue_mask | m_exec; + m_exec = exec_mask::all_off(); +} + +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_return() +{ + // Permenantly kill all active lanes + m_kernel_exec = andnot(m_exec, m_kernel_exec); + m_exec = exec_mask::all_off(); +} + +template<typename UnmaskedBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_unmasked(const UnmaskedBody& unmaskedBody) +{ + exec_mask orig_exec = m_exec, orig_kernel_exec = m_kernel_exec; + + m_kernel_exec = exec_mask::all_on(); + m_exec = exec_mask::all_on(); + + unmaskedBody(); + + m_kernel_exec = m_kernel_exec & orig_kernel_exec; + m_exec = m_exec & orig_exec; + + check_masks(); +} + +struct scoped_unmasked_restorer +{ + spmd_kernel *m_pKernel; + exec_mask m_orig_exec, m_orig_kernel_exec; + + CPPSPMD_FORCE_INLINE scoped_unmasked_restorer(spmd_kernel *pKernel) : + m_pKernel(pKernel), + m_orig_exec(pKernel->m_exec), + m_orig_kernel_exec(pKernel->m_kernel_exec) + { + pKernel->m_kernel_exec = exec_mask::all_on(); + pKernel->m_exec = exec_mask::all_on(); + } + + CPPSPMD_FORCE_INLINE ~scoped_unmasked_restorer() + { + m_pKernel->m_kernel_exec = m_pKernel->m_kernel_exec & m_orig_kernel_exec; + m_pKernel->m_exec = m_pKernel->m_exec & m_orig_exec; + m_pKernel->check_masks(); + } +}; + +#define SPMD_UNMASKED_BEGIN { scoped_unmasked_restorer _unmasked_restorer(this); +#define SPMD_UNMASKED_END } + +#if 0 +template<typename SPMDKernel, typename... Args> +CPPSPMD_FORCE_INLINE decltype(auto) spmd_kernel::spmd_call(Args&&... args) +{ + SPMDKernel kernel; + kernel.init(m_exec); + return kernel._call(std::forward<Args>(args)...); +} +#else +template<typename SPMDKernel, typename... Args> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_call(Args&&... args) +{ + SPMDKernel kernel; + kernel.init(m_exec); + kernel._call(std::forward<Args>(args)...); +} +#endif + +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if_break(const vbool& cond) +{ +#ifdef _DEBUG + assert(m_in_loop); +#endif + + exec_mask cond_exec(cond); + + m_exec = andnot(m_exec & cond_exec, m_exec); + + check_masks(); +} + +// No SPMD breaks, continues, etc. allowed +template<typename IfBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sif(const vbool& cond, const IfBody& ifBody) +{ + exec_mask im = m_exec & exec_mask(cond); + + if (any(im)) + { + const exec_mask orig_exec = m_exec; + m_exec = im; + ifBody(); + m_exec = orig_exec; + } +} + +// No SPMD breaks, continues, etc. allowed +template<typename IfBody, typename ElseBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody) +{ + const exec_mask orig_exec = m_exec; + + exec_mask im = m_exec & exec_mask(cond); + + if (any(im)) + { + m_exec = im; + ifBody(); + } + + exec_mask em = orig_exec & exec_mask(!cond); + + if (any(em)) + { + m_exec = em; + elseBody(); + } + + m_exec = orig_exec; +} + +template<typename IfBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if(const vbool& cond, const IfBody& ifBody) +{ + exec_mask cond_exec(cond); + + exec_mask pre_if_exec = cond_exec & m_exec; + + if (any(pre_if_exec)) + { + exec_mask unexecuted_lanes = andnot(cond_exec, m_exec); + m_exec = pre_if_exec; + + ifBody(); + + // Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body. 
+ m_exec = m_exec | unexecuted_lanes; + + check_masks(); + } +} + +template<typename IfBody, typename ElseBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody) +{ + bool all_flag = false; + + exec_mask cond_exec(cond); + + { + exec_mask pre_if_exec = cond_exec & m_exec; + + int mask = pre_if_exec.get_movemask(); + if (mask != 0) + { + all_flag = ((uint32_t)mask == m_exec.get_movemask()); + + exec_mask unexecuted_lanes = andnot(cond_exec, m_exec); + m_exec = pre_if_exec; + + ifBody(); + + // Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body. + m_exec = m_exec | unexecuted_lanes; + + check_masks(); + } + } + + if (!all_flag) + { + exec_mask pre_if_exec = andnot(cond_exec, m_exec); + + if (any(pre_if_exec)) + { + exec_mask unexecuted_lanes = cond_exec & m_exec; + m_exec = pre_if_exec; + + ifBody(); + + // Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body. + m_exec = m_exec | unexecuted_lanes; + + check_masks(); + } + } +} + +struct scoped_exec_restorer +{ + exec_mask *m_pMask; + exec_mask m_prev_mask; + CPPSPMD_FORCE_INLINE scoped_exec_restorer(exec_mask *pExec_mask) : m_pMask(pExec_mask), m_prev_mask(*pExec_mask) { } + CPPSPMD_FORCE_INLINE ~scoped_exec_restorer() { *m_pMask = m_prev_mask; } +}; + +// Cannot use SPMD break, continue, or return inside "simple" if/else +#define SPMD_SIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \ + { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); + +#define SPMD_SELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \ + { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); + +#define SPMD_SENDIF } + +// Same as SPMD_SIF, except doesn't use a scoped object +#define SPMD_SIF2(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \ + { exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); + +#define SPMD_SELSE2(cond) m_exec = _orig_exec; } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \ + { exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); + +#define SPMD_SEND_IF2 m_exec = _orig_exec; } + +// Same as SPMD_SIF(), except the if/else blocks are always executed +#define SPMD_SAIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \ + m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); + +#define SPMD_SAELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \ + m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); + +#define SPMD_SAENDIF } + +// Cannot use SPMD break, continue, or return inside sselect +#define SPMD_SSELECT(var) do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask 
_select_executed(exec_mask::all_off()); +#define SPMD_SCASE(value) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \ + { m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); _select_executed = _select_executed | m_exec; + +//#define SPMD_SCASE_END if (_select_executed.get_movemask() == _orig_exec.m_prev_mask.get_movemask()) break; } +#define SPMD_SCASE_END if (!any(_select_executed ^ _orig_exec.m_prev_mask)) break; } +#define SPMD_SDEFAULT exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); if (any(_all_other_lanes)) { m_exec = _all_other_lanes; +#define SPMD_SDEFAULT_END } +#define SPMD_SSELECT_END } while(0); + +// Same as SPMD_SSELECT, except all cases are executed. +// Cannot use SPMD break, continue, or return inside sselect +#define SPMD_SASELECT(var) do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off()); + +#define SPMD_SACASE(value) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); { m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); \ + _select_executed = _select_executed | m_exec; + +#define SPMD_SACASE_END } +#define SPMD_SADEFAULT exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); { m_exec = _all_other_lanes; +#define SPMD_SADEFAULT_END } +#define SPMD_SASELECT_END } while(0); + +struct scoped_exec_restorer2 +{ + spmd_kernel *m_pKernel; + exec_mask m_unexecuted_lanes; + + CPPSPMD_FORCE_INLINE scoped_exec_restorer2(spmd_kernel *pKernel, const vbool &cond) : + m_pKernel(pKernel) + { + exec_mask cond_exec(cond); + m_unexecuted_lanes = andnot(cond_exec, pKernel->m_exec); + pKernel->m_exec = cond_exec & pKernel->m_exec; + } + + CPPSPMD_FORCE_INLINE ~scoped_exec_restorer2() + { + m_pKernel->m_exec = m_pKernel->m_exec | m_unexecuted_lanes; + m_pKernel->check_masks(); + } +}; + +#define SPMD_IF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); if (any(m_exec)) { +#define SPMD_ELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); if (any(m_exec)) { +#define SPMD_END_IF } } + +// Same as SPMD_IF, except the conditional block is always executed. 
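+// In practice that means no any() test is done first: the block always runs, possibly with an
+// all-off exec mask, in which case every masked store inside it is a no-op. This avoids a
+// movemask test and branch per if, which can be a win when the body is cheap.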
+#define SPMD_AIF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); { +#define SPMD_AELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); { +#define SPMD_AEND_IF } } + +class scoped_exec_saver +{ + exec_mask m_exec, m_kernel_exec, m_continue_mask; + spmd_kernel *m_pKernel; +#ifdef _DEBUG + bool m_in_loop; +#endif + +public: + inline scoped_exec_saver(spmd_kernel *pKernel) : + m_exec(pKernel->m_exec), m_kernel_exec(pKernel->m_kernel_exec), m_continue_mask(pKernel->m_continue_mask), + m_pKernel(pKernel) + { +#ifdef _DEBUG + m_in_loop = pKernel->m_in_loop; +#endif + } + + inline ~scoped_exec_saver() + { + m_pKernel->m_exec = m_exec; + m_pKernel->m_continue_mask = m_continue_mask; + m_pKernel->m_kernel_exec = m_kernel_exec; +#ifdef _DEBUG + m_pKernel->m_in_loop = m_in_loop; + m_pKernel->check_masks(); +#endif + } +}; + +#define SPMD_BEGIN_CALL scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_continue_mask = exec_mask::all_off(); +#define SPMD_BEGIN_CALL_ALL_LANES scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_exec = exec_mask::all_on(); m_continue_mask = exec_mask::all_off(); + +template<typename ForeachBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_foreach(int begin, int end, const ForeachBody& foreachBody) +{ + if (begin == end) + return; + + if (!any(m_exec)) + return; + + // We don't support iterating backwards. + if (begin > end) + std::swap(begin, end); + + exec_mask prev_continue_mask = m_continue_mask, prev_exec = m_exec; + + int total_full = (end - begin) / PROGRAM_COUNT; + int total_partial = (end - begin) % PROGRAM_COUNT; + + lint_t loop_index = begin + program_index; + + const int total_loops = total_full + (total_partial ? 
1 : 0); + + m_continue_mask = exec_mask::all_off(); + + for (int i = 0; i < total_loops; i++) + { + int n = PROGRAM_COUNT; + if ((i == (total_loops - 1)) && (total_partial)) + { + exec_mask partial_mask = exec_mask(vint_t(total_partial) > vint_t(program_index)); + m_exec = m_exec & partial_mask; + n = total_partial; + } + + foreachBody(loop_index, n); + + m_exec = m_exec | m_continue_mask; + if (!any(m_exec)) + break; + + m_continue_mask = exec_mask::all_off(); + check_masks(); + + store_all(loop_index, loop_index + PROGRAM_COUNT); + } + + m_exec = prev_exec & m_kernel_exec; + m_continue_mask = prev_continue_mask; + check_masks(); +} + +template<typename WhileCondBody, typename WhileBody> +CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody) +{ + exec_mask orig_exec = m_exec; + + exec_mask orig_continue_mask = m_continue_mask; + m_continue_mask = exec_mask::all_off(); + +#ifdef _DEBUG + const bool prev_in_loop = m_in_loop; + m_in_loop = true; +#endif + + while(true) + { + exec_mask cond_exec = exec_mask(whileCondBody()); + m_exec = m_exec & cond_exec; + + if (!any(m_exec)) + break; + + whileBody(); + + m_exec = m_exec | m_continue_mask; + m_continue_mask = exec_mask::all_off(); + check_masks(); + } + +#ifdef _DEBUG + m_in_loop = prev_in_loop; +#endif + + m_exec = orig_exec & m_kernel_exec; + m_continue_mask = orig_continue_mask; + check_masks(); +} + +struct scoped_while_restorer +{ + spmd_kernel *m_pKernel; + exec_mask m_orig_exec, m_orig_continue_mask; +#ifdef _DEBUG + bool m_prev_in_loop; +#endif + + CPPSPMD_FORCE_INLINE scoped_while_restorer(spmd_kernel *pKernel) : + m_pKernel(pKernel), + m_orig_exec(pKernel->m_exec), + m_orig_continue_mask(pKernel->m_continue_mask) + { + pKernel->m_continue_mask.all_off(); + +#ifdef _DEBUG + m_prev_in_loop = pKernel->m_in_loop; + pKernel->m_in_loop = true; +#endif + } + + CPPSPMD_FORCE_INLINE ~scoped_while_restorer() + { + m_pKernel->m_exec = m_orig_exec & m_pKernel->m_kernel_exec; + m_pKernel->m_continue_mask = m_orig_continue_mask; +#ifdef _DEBUG + m_pKernel->m_in_loop = m_prev_in_loop; + m_pKernel->check_masks(); +#endif + } +}; + +#undef SPMD_WHILE +#undef SPMD_WEND +#define SPMD_WHILE(cond) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); \ + m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; + +#define SPMD_WEND m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); } } + +// Nesting is not supported (although it will compile, but the results won't make much sense). +#define SPMD_FOREACH(loop_var, bi, ei) if (((bi) != (ei)) && (any(m_exec))) { \ + scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \ + uint32_t b = (uint32_t)(bi), e = (uint32_t)(ei); if ((b) > (e)) { std::swap(b, e); } const uint32_t total_full = ((e) - (b)) >> PROGRAM_COUNT_SHIFT, total_partial = ((e) - (b)) & (PROGRAM_COUNT - 1); \ + lint_t loop_var = program_index + (int)b; const uint32_t total_loops = total_full + (total_partial ? 
1U : 0U); \ + for (uint32_t CPPSPMD_GLUER2(_foreach_counter, __LINE__) = 0; CPPSPMD_GLUER2(_foreach_counter, __LINE__) < total_loops; ++CPPSPMD_GLUER2(_foreach_counter, __LINE__)) { \ + if ((CPPSPMD_GLUER2(_foreach_counter, __LINE__) == (total_loops - 1)) && (total_partial)) { exec_mask partial_mask = exec_mask(vint_t((int)total_partial) > vint_t(program_index)); m_exec = m_exec & partial_mask; } + +#define SPMD_FOREACH_END(loop_var) m_exec = m_exec | m_continue_mask; if (!any(m_exec)) break; m_continue_mask = exec_mask::all_off(); check_masks(); store_all(loop_var, loop_var + PROGRAM_COUNT); } } + +// Okay to use spmd_continue or spmd_return, but not spmd_break +#define SPMD_FOREACH_ACTIVE(index_var) int64_t index_var; { uint64_t _movemask = m_exec.get_movemask(); if (_movemask) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \ + for (uint32_t _i = 0; _i < PROGRAM_COUNT; ++_i) { \ + if (_movemask & (1U << _i)) { \ + m_exec.enable_lane(_i); m_exec = m_exec & m_kernel_exec; \ + (index_var) = _i; \ + +#define SPMD_FOREACH_ACTIVE_END } } } } + +// Okay to use spmd_continue, but not spmd_break/spmd_continue +#define SPMD_FOREACH_UNIQUE_INT(index_var, var) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \ + CPPSPMD_DECL(int_t, _vals[PROGRAM_COUNT]); store_linear_all(_vals, var); std::sort(_vals, _vals + PROGRAM_COUNT); \ + const int _n = (int)(std::unique(_vals, _vals + PROGRAM_COUNT) - _vals); \ + for (int _i = 0; _i < _n; ++_i) { int index_var = _vals[_i]; vbool cond = (vint_t(var) == vint_t(index_var)); m_exec = exec_mask(cond); + +#define SPMD_FOREACH_UNIQUE_INT_END } } + +struct scoped_simple_while_restorer +{ + spmd_kernel* m_pKernel; + exec_mask m_orig_exec; +#ifdef _DEBUG + bool m_prev_in_loop; +#endif + + CPPSPMD_FORCE_INLINE scoped_simple_while_restorer(spmd_kernel* pKernel) : + m_pKernel(pKernel), + m_orig_exec(pKernel->m_exec) + { + +#ifdef _DEBUG + m_prev_in_loop = pKernel->m_in_loop; + pKernel->m_in_loop = true; +#endif + } + + CPPSPMD_FORCE_INLINE ~scoped_simple_while_restorer() + { + m_pKernel->m_exec = m_orig_exec; +#ifdef _DEBUG + m_pKernel->m_in_loop = m_prev_in_loop; + m_pKernel->check_masks(); +#endif + } +}; + +// Cannot use SPMD break, continue, or return inside simple while + +#define SPMD_SWHILE(cond) { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \ + while(true) { \ + exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; +#define SPMD_SWEND } } + +// Cannot use SPMD break, continue, or return inside simple do +#define SPMD_SDO { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { +#define SPMD_SEND_DO(cond) exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; } } + +#undef SPMD_FOR +#undef SPMD_END_FOR +#define SPMD_FOR(for_init, for_cond) { for_init; scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(for_cond)); \ + m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; +#define SPMD_END_FOR(for_inc) m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); for_inc; } } + +template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody> +CPPSPMD_FORCE_INLINE void 
spmd_kernel::spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody) +{ + exec_mask orig_exec = m_exec; + + forInitBody(); + + exec_mask orig_continue_mask = m_continue_mask; + m_continue_mask = exec_mask::all_off(); + +#ifdef _DEBUG + const bool prev_in_loop = m_in_loop; + m_in_loop = true; +#endif + + while(true) + { + exec_mask cond_exec = exec_mask(forCondBody()); + m_exec = m_exec & cond_exec; + + if (!any(m_exec)) + break; + + forBody(); + + m_exec = m_exec | m_continue_mask; + m_continue_mask = exec_mask::all_off(); + check_masks(); + + forIncrBody(); + } + + m_exec = orig_exec & m_kernel_exec; + m_continue_mask = orig_continue_mask; + +#ifdef _DEBUG + m_in_loop = prev_in_loop; + check_masks(); +#endif +} diff --git a/thirdparty/basis_universal/encoder/cppspmd_math.h b/thirdparty/basis_universal/encoder/cppspmd_math.h new file mode 100644 index 0000000000..e7b3202b8e --- /dev/null +++ b/thirdparty/basis_universal/encoder/cppspmd_math.h @@ -0,0 +1,725 @@ +// Do not include this header directly. +// +// Copyright 2020-2021 Binomial LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The general goal of these vectorized estimated math functions is scalability/performance. +// There are explictly no checks NaN's/Inf's on the input arguments. There are no assertions either. +// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper +// engineering analysis before relying on them. +// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors. +// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance. + +CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv) +{ + vfloat c = frac(abs(a * b_inv)) * abs(b); + return spmd_ternaryf(a < 0, -c, c); +} + +CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv) +{ + return frac(a * b_inv) * b; +} + +// Avoids dividing by zero or very small values. 
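+// Concretely: returns a/b when |b| > fDivThresh, and otherwise divides by +/-fDivThresh (sign
+// chosen by the b < 0.0f test), so the result stays finite instead of overflowing to an infinity.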
+CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f) +{ + return a / spmd_ternaryf( abs(b) > fDivThresh, b, spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh) ); +} + +/* + clang 9.0.0 for win /fp:precise release + f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824 + + log2_est(): + max abs err: 0.0000023076808731 + max rel err: 0.0000000756678881 + avg abs err: 0.0000007535452724 + avg rel err: 0.0000000235117843 + + XMVectorLog2(): + max abs err: 0.0000023329709933 + max rel err: 0.0000000826961046 + avg abs err: 0.0000007564889684 + avg rel err: 0.0000000236051899 + + std::log2f(): + max abs err: 0.0000020265979401 + max rel err: 0.0000000626647654 + avg abs err: 0.0000007494445227 + avg rel err: 0.0000000233800985 +*/ + +// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/ +inline vfloat spmd_kernel::log2_est(vfloat v) +{ + vfloat signif, fexp; + + // Just clamp to a very small value, instead of checking for invalid inputs. + vfloat x = max(v, 2.2e-38f); + + /* + * Assume IEEE representation, which is sgn(1):exp(8):frac(23) + * representing (1+frac)*2^(exp-127). Call 1+frac the significand + */ + + // get exponent + vint ux1_i = cast_vfloat_to_vint(x); + + vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23); + + // actual exponent is exp-127, will subtract 127 later + + vint ux2_i; + vfloat ux2_f; + + vint greater = ux1_i & 0x00400000; // true if signif > 1.5 + SPMD_SIF(greater != 0) + { + // signif >= 1.5 so need to divide by 2. Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1 + store_all(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f000000); + + store_all(ux2_f, cast_vint_to_vfloat(ux2_i)); + + // 126 instead of 127 compensates for division by 2 + store_all(fexp, vfloat(exp - 126)); + } + SPMD_SELSE(greater != 0) + { + // get signif by stuffing exp = 127 which corresponds to an exponent of 0 + store(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f800000); + + store(ux2_f, cast_vint_to_vfloat(ux2_i)); + + store(fexp, vfloat(exp - 127)); + } + SPMD_SENDIF + + store_all(signif, ux2_f); + store_all(signif, signif - 1.0f); + + const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f; + + vfloat xm1 = signif; + vfloat xm1sqr = xm1 * xm1; + + return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e)); + + // fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?) + //return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e))); +} + +// Uses log2_est(), so this function must be <= the precision of that. +inline vfloat spmd_kernel::log_est(vfloat v) +{ + return log2_est(v) * 0.693147181f; +} + +CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment) +{ + // Assume we're using equation (2) + store_all(adjustment, 0); + + // integer part of the input argument + vint int_arg = (vint)arg; + + // if frac(arg) is in [0.5, 1.0]... + SPMD_SIF((arg - int_arg) > 0.5f) + { + store(adjustment, 1); + + // then change it to [0.0, 0.5] + store(arg, arg - 0.5f); + } + SPMD_SENDIF + + // arg == just the fractional part + store_all(arg, arg - (vfloat)int_arg); + + // Now compute 2** (int) arg. 
+ store_all(int_arg, min(int_arg + 127, 254)); + + store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23))); +} + +/* + clang 9.0.0 for win /fp:precise release + f range : -50.0000000000000000 49.9999940395355225, vals : 16777216 + + exp2_est(): + Total passed near - zero check : 16777216 + Total sign diffs : 0 + max abs err: 1668910609.7500000000000000 + max rel err: 0.0000015642030031 + avg abs err: 10793794.4007573910057545 + avg rel err: 0.0000003890893282 + + XMVectorExp2(): + Total passed near-zero check: 16777216 + Total sign diffs: 0 + max abs err: 1665552836.8750000000000000 + max rel err: 0.0000114674862370 + avg abs err: 10771868.2627860084176064 + avg rel err: 0.0000011218880770 + + std::exp2f(): + Total passed near-zero check: 16777216 + Total sign diffs: 0 + max abs err: 1591636585.6250000000000000 + max rel err: 0.0000014849731018 + avg abs err: 10775800.3204844966530800 + avg rel err: 0.0000003851496422 +*/ + +// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm +inline vfloat spmd_kernel::exp2_est(vfloat arg) +{ + SPMD_BEGIN_CALL + + const vfloat P00 = +7.2152891521493f; + const vfloat P01 = +0.0576900723731f; + const vfloat Q00 = +20.8189237930062f; + const vfloat Q01 = +1.0f; + const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling + + vfloat result = 0.0f; + + // Return 0 if arg is too large. + // We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values. + SPMD_IF(abs(arg) > 126.0f) + { + spmd_return(); + } + SPMD_END_IF + + // 2**(int(a)) + vfloat two_int_a; + + // set to 1 by reduce_expb + vint adjustment; + + // 0 if arg is +; 1 if negative + vint negative = 0; + + // If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-1) = 1/(n**x). 
+ SPMD_SIF(arg < 0.0f) + { + store(arg, -arg); + store(negative, 1); + } + SPMD_SENDIF + + store_all(arg, min(arg, 126.0f)); + + // reduce to [0.0, 0.5] + reduce_expb(arg, two_int_a, adjustment); + + // The format of the polynomial is: + // answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2)) + // + // The following computes the polynomial in several steps: + + // Q(x**2) + vfloat Q = vfma(Q01, (arg * arg), Q00); + + // x*P(x**2) + vfloat x_P = arg * (vfma(P01, arg * arg, P00)); + + vfloat answer = (Q + x_P) / (Q - x_P); + + // Now correct for the scaling factor of 2**(int(a)) + store_all(answer, answer * two_int_a); + + // If the result had a fractional part > 0.5, correct for that + store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer)); + + // Correct for a negative input + SPMD_SIF(negative != 0) + { + store(answer, 1.0f / answer); + } + SPMD_SENDIF + + store(result, answer); + + return result; +} + +inline vfloat spmd_kernel::exp_est(vfloat arg) +{ + // e^x = exp2(x / log_base_e(2)) + // constant is 1.0/(log(2)/log(e)) or 1/log(2) + return exp2_est(arg * 1.44269504f); +} + +inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2) +{ + return exp_est(log_est(arg1) * arg2); +} + +/* + clang 9.0.0 for win /fp:precise release + Total near-zero: 144, output above near-zero tresh: 30 + Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192 + Total near-zero sign diffs: 5 + Total passed near-zero check: 16777072 + Total sign diffs: 5 + max abs err: 0.0000031375306036 + max rel err: 0.1140846017075028 + avg abs err: 0.0000003026226621 + avg rel err: 0.0000033564977623 +*/ + +// Math from this web page: http://developer.download.nvidia.com/cg/sin.html +// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est(). +inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag) +{ + const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f; + const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f; + const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f; + const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f; + const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f; + + vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z; + + store_all(r1_x, sin_flag ? 
vfms(c1_w, a, c1_x) : c1_w * a); + + store_all(r1_y, frac(r1_x)); + + store_all(r2_x, (vfloat)(r1_y < c1_x)); + + store_all(r2_y, (vfloat)(r1_y >= c1_y)); + store_all(r2_z, (vfloat)(r1_y >= c1_z)); + + store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z))); + + store_all(r0_x, c0_x - r1_y); + store_all(r0_y, c0_y - r1_y); + store_all(r0_z, c0_z - r1_y); + + store_all(r0_x, r0_x * r0_x); + store_all(r0_y, r0_y * r0_y); + store_all(r0_z, r0_z * r0_z); + + store_all(r1_x, vfma(c2_x, r0_x, c2_z)); + store_all(r1_y, vfma(c2_y, r0_y, c2_w)); + store_all(r1_z, vfma(c2_x, r0_z, c2_z)); + + store_all(r1_x, vfma(r1_x, r0_x, c3_x)); + store_all(r1_y, vfma(r1_y, r0_y, c3_y)); + store_all(r1_z, vfma(r1_z, r0_z, c3_x)); + + store_all(r1_x, vfma(r1_x, r0_x, c3_z)); + store_all(r1_y, vfma(r1_y, r0_y, c3_w)); + store_all(r1_z, vfma(r1_z, r0_z, c3_z)); + + store_all(r1_x, vfma(r1_x, r0_x, c4_x)); + store_all(r1_y, vfma(r1_y, r0_y, c4_y)); + store_all(r1_z, vfma(r1_z, r0_z, c4_x)); + + store_all(r1_x, vfma(r1_x, r0_x, c4_z)); + store_all(r1_y, vfma(r1_y, r0_y, c4_w)); + store_all(r1_z, vfma(r1_z, r0_z, c4_z)); + + store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z))); + + return r0_x; +} + +// positive values only +CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q) +{ + //const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3 + const int mag = 0x7EF311C3; + const float fMinThresh = .0000125f; + + vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag))); + + vint x_l = vint(mag) - cast_vfloat_to_vint(l); + + vfloat rcp_l = cast_vint_to_vfloat(x_l); + + return rcp_l * vfnma(rcp_l, q, 2.0f); +} + +CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t) +{ + //const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3 + const int mag = 0x7EF311C3; + const float fMinThresh = .0000125f; + + vfloat s = sign(t); + vfloat q = abs(t); + + vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag))); + + vint x_l = vint(mag) - cast_vfloat_to_vint(l); + + vfloat rcp_l = cast_vint_to_vfloat(x_l); + + return rcp_l * vfnma(rcp_l, q, 2.0f) * s; +} + +// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf +// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt +CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0) +{ + vfloat xhalf = 0.5f * x0; + vfloat x = cast_vint_to_vfloat(vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1))); + return x * vfnma(xhalf * x, x, 1.5008909f); +} + +CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0) +{ + vfloat xhalf = 0.5f * x0; + vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1))); + vfloat x1 = x * vfnma(xhalf * x, x, 1.5); + vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5); + return x2; +} + +// Math from: http://developer.download.nvidia.com/cg/atan2.html +// TODO: Needs more validation, parameter checking. 
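+// Note there is no special case for x == y == 0: the t1/t0 division below is then 0/0 and the
+// result is NaN, in line with the "no NaN/Inf checks" policy stated at the top of this file.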
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x) +{ + vfloat t1 = abs(y); + vfloat t3 = abs(x); + + vfloat t0 = max(t3, t1); + store_all(t1, min(t3, t1)); + + store_all(t3, t1 / t0); + + vfloat t4 = t3 * t3; + store_all(t0, vfma(-0.013480470f, t4, 0.057477314f)); + store_all(t0, vfms(t0, t4, 0.121239071f)); + store_all(t0, vfma(t0, t4, 0.195635925f)); + store_all(t0, vfms(t0, t4, 0.332994597f)); + store_all(t0, vfma(t0, t4, 0.999995630f)); + store_all(t3, t0 * t3); + + store_all(t3, spmd_ternaryf(abs(y) > abs(x), vfloat(1.570796327f) - t3, t3)); + + store_all(t3, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - t3, t3)); + store_all(t3, spmd_ternaryf(y < 0.0f, -t3, t3)); + + return t3; +} + +/* + clang 9.0.0 for win /fp:precise release + Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216 + Skipped angles near 90/270 within +- .001 radians. + Near-zero threshold: .0000125f + Near-zero output above check threshold: 1e-6f + + Total near-zero: 144, output above near-zero tresh: 20 + Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297 + Total near-zero sign diffs: 5 + Total passed near-zero check: 16766400 + Total sign diffs: 5 + max abs err: 1.4982600811139264 + max rel err: 0.1459155900188041 + avg rel err: 0.0000054659502568 + + XMVectorTan() precise: + Total near-zero: 144, output above near-zero tresh: 18 + Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795 + Total near-zero sign diffs: 0 + Total passed near-zero check: 16766400 + Total sign diffs: 0 + max abs err: 1.9883573246424930 + max rel err: 0.1459724171926864 + avg rel err: 0.0000054965766843 + + std::tanf(): + Total near-zero: 144, output above near-zero tresh: 0 + Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107 + Total near-zero sign diffs: 11 + Total passed near-zero check: 16766400 + Total sign diffs: 11 + max abs err: 0.8989131818294709 + max rel err: 0.0573181403173166 + avg rel err: 0.0000030791301203 + + Originally from: + http://www.ganssle.com/approx.htm +*/ + +CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x) +{ + // Original double version was 8.2 digits + //double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f; + // Tuned float constants for lower avg rel error (without using FMA3): + const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f; + vfloat x2 = x * x; + return (x * (vfma(c2, x2, c1)) / (vfma(x2, (c4 + x2), c3))); +} + +// Don't call this for angles close to 90/270!. 
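+// (near those angles the divide below uses a clamped divisor of +/-fDivThresh, so the result is
+// finite but can be far from the true, near-infinite tangent)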
+inline vfloat spmd_kernel::tan_est(vfloat x) +{ + const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f; + CPPSPMD_DECL(const uint8_t, s_table0[16]) = { 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4 }; + + vint table = init_lookup4(s_table0); // a load + vint sgn = cast_vfloat_to_vint(x) & 0x80000000; + + store_all(x, abs(x)); + vfloat orig_x = x; + + vfloat q = x * fOneOverPi; + store_all(x, q - floor(q)); + + vfloat x4 = x * 4.0f; + vint octant = (vint)(x4); + + vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4); + + vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle + + vfloat bias = (vfloat)k + -128.0f; + vfloat y = x0 + bias; + + vfloat z = tan82(y); + + vfloat r; + + vbool octant_one_or_two = (octant == 1) || (octant == 2); + + // SPMD optimization - skip costly divide if we can + if (spmd_any(octant_one_or_two)) + { + const float fDivThresh = .4371e-7f; + vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh)); + + vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z); + store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b)); + } + else + { + store_all(r, spmd_ternaryf(octant == 0, z, -z)); + } + + // Small angle approximation, to decrease the max rel error near Pi. + SPMD_SIF(x >= (1.0f - .0003125f*4.0f)) + { + store(r, vfnma(floor(q) + 1.0f, fPi, orig_x)); + } + SPMD_SENDIF + + return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn); +} + +inline void spmd_kernel::seed_rand(rand_context& x, vint seed) +{ + store(x.a, 0xf1ea5eed); + store(x.b, seed ^ 0xd8487b1f); + store(x.c, seed ^ 0xdbadef9a); + store(x.d, seed); + for (int i = 0; i < 20; ++i) + (void)get_randu(x); +} + +// https://burtleburtle.net/bob/rand/smallprng.html +// Returns 32-bit unsigned random numbers. 
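+// Usage sketch from inside a kernel method (illustrative only; ctx/die/u are arbitrary names):
+//   rand_context ctx;
+//   seed_rand(ctx, vint(program_index));     // distinct seed per lane
+//   vint   die = get_randi(ctx, 1, 7);       // per-lane integers in [1, 7)
+//   vfloat u   = get_randf(ctx, 0.0f, 1.0f); // per-lane floats in [0, 1)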
+inline vint spmd_kernel::get_randu(rand_context& x) +{ + vint e = x.a - VINT_ROT(x.b, 27); + store(x.a, x.b ^ VINT_ROT(x.c, 17)); + store(x.b, x.c + x.d); + store(x.c, x.d + e); + store(x.d, e + x.a); + return x.d; +} + +// Returns random numbers between [low, high), or low if low >= high +inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high) +{ + vint rnd = get_randu(x); + + vint range = high - low; + + vint rnd_range = mulhiu(rnd, range); + + return spmd_ternaryi(low < high, low + rnd_range, low); +} + +// Returns random numbers between [low, high), or low if low >= high +inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high) +{ + vint rndi = get_randu(x) & 0x7fffff; + + vfloat rnd = (vfloat)(rndi) * (1.0f / 8388608.0f); + + return spmd_ternaryf(low < high, vfma(high - low, rnd, low), low); +} + +CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2) +{ + const uint8_t tab1_bytes[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + const uint8_t tab2_bytes[16] = { 0, 8 << 4, 4 << 4, 12 << 4, 2 << 4, 10 << 4, 6 << 4, 14 << 4, 1 << 4, 9 << 4, 5 << 4, 13 << 4, 3 << 4, 11 << 4, 7 << 4, 15 << 4 }; + store_all(tab1, init_lookup4(tab1_bytes)); + store_all(tab2, init_lookup4(tab2_bytes)); +} + +CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2) +{ + vint r0 = table_lookup4_8(k & 0x7F7F7F7F, tab2); + vint r1 = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1); + vint r3 = r0 | r1; + return byteswap(r3); +} + +CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x) +{ + CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + + vint tab = init_lookup4(s_tab); + + //x <= 0x0000ffff + vbool c0 = (x & 0xFFFF0000) == 0; + vint n0 = spmd_ternaryi(c0, 16, 0); + vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x); + + //x <= 0x00ffffff + vbool c1 = (x0 & 0xFF000000) == 0; + vint n1 = spmd_ternaryi(c1, n0 + 8, n0); + vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0); + + //x <= 0x0fffffff + vbool c2 = (x1 & 0xF0000000) == 0; + vint n2 = spmd_ternaryi(c2, n1 + 4, n1); + vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1); + + return table_lookup4_8(VUINT_SHIFT_RIGHT(x2, 28), tab) + n2; +} + +CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x) +{ + //x <= 0x0000ffff + vbool c0 = (x & 0xFFFF0000) == 0; + vint n0 = spmd_ternaryi(c0, 16, 0); + vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x); + + //x <= 0x00ffffff + vbool c1 = (x0 & 0xFF000000) == 0; + vint n1 = spmd_ternaryi(c1, n0 + 8, n0); + vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0); + + //x <= 0x0fffffff + vbool c2 = (x1 & 0xF0000000) == 0; + vint n2 = spmd_ternaryi(c2, n1 + 4, n1); + vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1); + + // x <= 0x3fffffff + vbool c3 = (x2 & 0xC0000000) == 0; + vint n3 = spmd_ternaryi(c3, n2 + 2, n2); + vint x3 = spmd_ternaryi(c3, VINT_SHIFT_LEFT(x2, 2), x2); + + // x <= 0x7fffffff + vbool c4 = (x3 & 0x80000000) == 0; + return spmd_ternaryi(c4, n3 + 1, n3); +} + +CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x) +{ + // cast the least significant bit in v to a float + vfloat f = (vfloat)(x & -x); + + // extract exponent and adjust + return VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) - 0x7F; +} + +CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x) +{ + vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555); + vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 
0x33333333); + return VUINT_SHIFT_RIGHT(((v1 + VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F) * 0x1010101), 24); +} + +CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b) +{ + return cmpeq_epi16(subs_epu16(a, b), vint(0)); +} + +CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b) +{ + return cmple_epu16(b, a); +} + +CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b) +{ + return andnot(cmpeq_epi16(a, b), cmple_epu16(b, a)); +} + +CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b) +{ + return cmpgt_epu16(b, a); +} + +CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b) +{ + return cmpeq_epi16(a, b) | cmpgt_epi16(a, b); +} + +CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b) +{ + return cmpge_epi16(b, a); +} + +void spmd_kernel::print_vint(vint v) +{ + for (uint32_t i = 0; i < PROGRAM_COUNT; i++) + printf("%i ", extract(v, i)); + printf("\n"); +} + +void spmd_kernel::print_vbool(vbool v) +{ + for (uint32_t i = 0; i < PROGRAM_COUNT; i++) + printf("%i ", extract(v, i) ? 1 : 0); + printf("\n"); +} + +void spmd_kernel::print_vint_hex(vint v) +{ + for (uint32_t i = 0; i < PROGRAM_COUNT; i++) + printf("0x%X ", extract(v, i)); + printf("\n"); +} + +void spmd_kernel::print_active_lanes(const char *pPrefix) +{ + CPPSPMD_DECL(int, flags[PROGRAM_COUNT]); + memset(flags, 0, sizeof(flags)); + storeu_linear(flags, vint(1)); + + if (pPrefix) + printf("%s", pPrefix); + + for (uint32_t i = 0; i < PROGRAM_COUNT; i++) + { + if (flags[i]) + printf("%u ", i); + } + printf("\n"); +} + +void spmd_kernel::print_vfloat(vfloat v) +{ + for (uint32_t i = 0; i < PROGRAM_COUNT; i++) + printf("%f ", extract(v, i)); + printf("\n"); +} diff --git a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h new file mode 100644 index 0000000000..cdb6447b62 --- /dev/null +++ b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h @@ -0,0 +1,89 @@ +// Do not include this header directly. +// This header defines shared struct spmd_kernel helpers. +// +// Copyright 2020-2021 Binomial LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// See cppspmd_math.h for detailed error statistics. 
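+// These declarations are meant to be textually included inside struct spmd_kernel by each
+// SIMD-specific header; the matching out-of-class definitions live in cppspmd_math.h (note the
+// spmd_kernel:: qualifiers there).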
+ +CPPSPMD_FORCE_INLINE void reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment); +CPPSPMD_FORCE_INLINE vfloat tan56(vfloat x); +CPPSPMD_FORCE_INLINE vfloat tan82(vfloat x); + +inline vfloat log2_est(vfloat v); + +inline vfloat log_est(vfloat v); + +inline vfloat exp2_est(vfloat arg); + +inline vfloat exp_est(vfloat arg); + +inline vfloat pow_est(vfloat arg1, vfloat arg2); + +CPPSPMD_FORCE_INLINE vfloat recip_est1(const vfloat& q); +CPPSPMD_FORCE_INLINE vfloat recip_est1_pn(const vfloat& q); + +inline vfloat mod_angles(vfloat a); + +inline vfloat sincos_est_a(vfloat a, bool sin_flag); +CPPSPMD_FORCE_INLINE vfloat sin_est_a(vfloat a) { return sincos_est_a(a, true); } +CPPSPMD_FORCE_INLINE vfloat cos_est_a(vfloat a) { return sincos_est_a(a, false); } + +inline vfloat sin_est(vfloat a); + +inline vfloat cos_est(vfloat a); + +// Don't call with values <= 0. +CPPSPMD_FORCE_INLINE vfloat rsqrt_est1(vfloat x0); + +// Don't call with values <= 0. +CPPSPMD_FORCE_INLINE vfloat rsqrt_est2(vfloat x0); + +CPPSPMD_FORCE_INLINE vfloat atan2_est(vfloat y, vfloat x); + +CPPSPMD_FORCE_INLINE vfloat atan_est(vfloat x) { return atan2_est(x, vfloat(1.0f)); } + +// Don't call this for angles close to 90/270! +inline vfloat tan_est(vfloat x); + +// https://burtleburtle.net/bob/rand/smallprng.html +struct rand_context { vint a, b, c, d; }; + +inline void seed_rand(rand_context& x, vint seed); + +// Returns 32-bit unsigned random numbers. +inline vint get_randu(rand_context& x); + +// Returns random numbers between [low, high), or low if low >= high +inline vint get_randi(rand_context& x, vint low, vint high); + +// Returns random numbers between [low, high), or low if low >= high +inline vfloat get_randf(rand_context& x, vfloat low, vfloat high); + +CPPSPMD_FORCE_INLINE void init_reverse_bits(vint& tab1, vint& tab2); +CPPSPMD_FORCE_INLINE vint reverse_bits(vint k, vint tab1, vint tab2); + +CPPSPMD_FORCE_INLINE vint count_leading_zeros(vint x); +CPPSPMD_FORCE_INLINE vint count_leading_zeros_alt(vint x); + +CPPSPMD_FORCE_INLINE vint count_trailing_zeros(vint x); + +CPPSPMD_FORCE_INLINE vint count_set_bits(vint x); + +void print_vint(vint v); +void print_vbool(vbool v); +void print_vint_hex(vint v); +void print_active_lanes(const char *pPrefix); +void print_vfloat(vfloat v); + diff --git a/thirdparty/basis_universal/encoder/cppspmd_sse.h b/thirdparty/basis_universal/encoder/cppspmd_sse.h new file mode 100644 index 0000000000..b39cb82a5f --- /dev/null +++ b/thirdparty/basis_universal/encoder/cppspmd_sse.h @@ -0,0 +1,2118 @@ +// cppspmd_sse.h +// Note for Basis Universal: All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. if BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation. +// SSE 2 or 4.1 +// Originally written by Nicolas Guillemot, Jefferson Amstutz in the "CppSPMD" project. +// 4/20: Richard Geldreich: Macro control flow, more SIMD instruction sets, optimizations, supports using multiple SIMD instruction sets in same executable. Still a work in progress! +// +// Originally Copyright 2016 Nicolas Guillemot +// Changed from the MIT license to Apache 2.0 with permission from the author. +// +// Modifications/enhancements Copyright 2020-2021 Binomial LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdlib.h> +#include <stdint.h> +#include <assert.h> +#include <math.h> +#include <utility> +#include <algorithm> + +#if CPPSPMD_SSE2 +#include <xmmintrin.h> // SSE +#include <emmintrin.h> // SSE2 +#else +#include <xmmintrin.h> // SSE +#include <emmintrin.h> // SSE2 +#include <pmmintrin.h> // SSE3 +#include <tmmintrin.h> // SSSE3 +#include <smmintrin.h> // SSE4.1 +//#include <nmmintrin.h> // SSE4.2 +#endif + +#undef CPPSPMD_SSE +#undef CPPSPMD_AVX1 +#undef CPPSPMD_AVX2 +#undef CPPSPMD_AVX +#undef CPPSPMD_FLOAT4 +#undef CPPSPMD_INT16 + +#define CPPSPMD_SSE 1 +#define CPPSPMD_AVX 0 +#define CPPSPMD_AVX1 0 +#define CPPSPMD_AVX2 0 +#define CPPSPMD_FLOAT4 0 +#define CPPSPMD_INT16 0 + +#ifdef _MSC_VER + #ifndef CPPSPMD_DECL + #define CPPSPMD_DECL(type, name) __declspec(align(16)) type name + #endif + + #ifndef CPPSPMD_ALIGN + #define CPPSPMD_ALIGN(v) __declspec(align(v)) + #endif + + #define _mm_undefined_si128 _mm_setzero_si128 + #define _mm_undefined_ps _mm_setzero_ps +#else + #ifndef CPPSPMD_DECL + #define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32))) + #endif + + #ifndef CPPSPMD_ALIGN + #define CPPSPMD_ALIGN(v) __attribute__((aligned(v))) + #endif +#endif + +#ifndef CPPSPMD_FORCE_INLINE +#ifdef _DEBUG +#define CPPSPMD_FORCE_INLINE inline +#else + #ifdef _MSC_VER + #define CPPSPMD_FORCE_INLINE __forceinline + #else + #define CPPSPMD_FORCE_INLINE inline + #endif +#endif +#endif + +#undef CPPSPMD +#undef CPPSPMD_ARCH + +#if CPPSPMD_SSE2 + #define CPPSPMD_SSE41 0 + #define CPPSPMD cppspmd_sse2 + #define CPPSPMD_ARCH _sse2 +#else + #define CPPSPMD_SSE41 1 + #define CPPSPMD cppspmd_sse41 + #define CPPSPMD_ARCH _sse41 +#endif + +#ifndef CPPSPMD_GLUER + #define CPPSPMD_GLUER(a, b) a##b +#endif + +#ifndef CPPSPMD_GLUER2 + #define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b) +#endif + +#ifndef CPPSPMD_NAME +#define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH) +#endif + +#undef VASSERT +#define VCOND(cond) ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask()) +#define VASSERT(cond) assert( VCOND(cond) ) + +#define CPPSPMD_ALIGNMENT (16) + +#define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a))) + +namespace CPPSPMD +{ + +const int PROGRAM_COUNT_SHIFT = 2; +const int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT; + +template <typename N> inline N* aligned_new() { void* p = _mm_malloc(sizeof(N), 64); new (p) N; return static_cast<N*>(p); } +template <typename N> void aligned_delete(N* p) { if (p) { p->~N(); _mm_free(p); } } + +CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX }; +CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 }; +CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f }; +CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 }; + +CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) = +{ + { UINT32_MAX, 0, 0, 0 }, + { 0, UINT32_MAX, 0, 0 }, + { 0, 0, UINT32_MAX, 0 }, + { 0, 0, 0, UINT32_MAX }, +}; + +#if CPPSPMD_SSE41 +CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c) { return 
_mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(c))); } +#endif + +CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask) +{ +#if CPPSPMD_SSE2 + return _mm_castps_si128(_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(b)), _mm_andnot_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(a)))); +#else + return _mm_blendv_epi8(a, b, mask); +#endif +} + +CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask) +{ +#if CPPSPMD_SSE2 + // We know it's a mask, so we can just emulate the blend. + return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a)); +#else + return _mm_blendv_ps(a, b, mask); +#endif +} + +CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask) +{ +#if CPPSPMD_SSE2 + // Input is not a mask, but MSB bits - so emulate _mm_blendv_ps() by replicating bit 31. + mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mask), 31)); + return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a)); +#else + return _mm_blendv_ps(a, b, mask); +#endif +} + +CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask) +{ + return _mm_castps_si128(blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask))); +} + +CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask) +{ + return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask))); +} + +#if CPPSPMD_SSE2 +CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); } +CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); } +CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); } +CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); } + +// Returns float bits as int, to emulate _mm_extract_ps() +CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { float f = _mm_cvtss_f32(vec); return *(const int*)&f; } +CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); return *(const int*)&f; } +CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); return *(const int*)&f; } +CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); return *(const int*)&f; } + +// Returns floats +CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); } +CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); } +CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); } +CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); } +#else +CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); } +CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); } +CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); } +CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); } + +// Returns float bits as int +CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); } +CPPSPMD_FORCE_INLINE int extract_ps_y(const 
__m128& vec) { return _mm_extract_ps(vec, 1); } +CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); } +CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); } + +// Returns floats +CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { int v = extract_ps_x(vec); return *(const float*)&v; } +CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { int v = extract_ps_y(vec); return *(const float*)&v; } +CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { int v = extract_ps_z(vec); return *(const float*)&v; } +CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { int v = extract_ps_w(vec); return *(const float*)&v; } +#endif + +#if CPPSPMD_SSE2 +CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 0), (uint32_t)v >> 16U, 1); } +CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 2), (uint32_t)v >> 16U, 3); } +CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 4), (uint32_t)v >> 16U, 5); } +CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 6), (uint32_t)v >> 16U, 7); } +#else +CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); } +CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); } +CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); } +CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); } +#endif + +#if CPPSPMD_SSE2 +inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b) +{ + // Just emulate _mm_shuffle_epi8. This is very slow, but what else can we do? 
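+ // (PSHUFB needs SSSE3, so spill both vectors to memory, do 16 scalar byte lookups using the low
+ // 4 bits of each index, then zero any byte whose index had its MSB set, which matches PSHUFB's rules)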
+ CPPSPMD_ALIGN(16) uint8_t av[16]; + _mm_store_si128((__m128i*)av, a); + + CPPSPMD_ALIGN(16) uint8_t bvi[16]; + _mm_store_ps((float*)bvi, _mm_and_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(_mm_set1_epi32(0x0F0F0F0F)))); + + CPPSPMD_ALIGN(16) uint8_t result[16]; + + result[0] = av[bvi[0]]; + result[1] = av[bvi[1]]; + result[2] = av[bvi[2]]; + result[3] = av[bvi[3]]; + + result[4] = av[bvi[4]]; + result[5] = av[bvi[5]]; + result[6] = av[bvi[6]]; + result[7] = av[bvi[7]]; + + result[8] = av[bvi[8]]; + result[9] = av[bvi[9]]; + result[10] = av[bvi[10]]; + result[11] = av[bvi[11]]; + + result[12] = av[bvi[12]]; + result[13] = av[bvi[13]]; + result[14] = av[bvi[14]]; + result[15] = av[bvi[15]]; + + return _mm_andnot_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), _mm_load_si128((__m128i*)result)); +} +#else +CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b) +{ + return _mm_shuffle_epi8(a, b); +} +#endif + +#if CPPSPMD_SSE2 +CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b) +{ + return blendv_mask_epi32(b, a, _mm_cmplt_epi32(a, b)); +} +CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b) +{ + return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(a, b)); +} +CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b) +{ + __m128i n = _mm_set1_epi32(0x80000000); + __m128i ac = _mm_add_epi32(a, n); + __m128i bc = _mm_add_epi32(b, n); + return blendv_mask_epi32(b, a, _mm_cmplt_epi32(ac, bc)); +} +CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b) +{ + __m128i n = _mm_set1_epi32(0x80000000); + __m128i ac = _mm_add_epi32(a, n); + __m128i bc = _mm_add_epi32(b, n); + return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(ac, bc)); +} +#else +CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b) +{ + return _mm_min_epi32(a, b); +} +CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b) +{ + return _mm_max_epi32(a, b); +} +CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b) +{ + return _mm_min_epu32(a, b); +} +CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b) +{ + return _mm_max_epu32(a, b); +} +#endif + +#if CPPSPMD_SSE2 +CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a) +{ + __m128i sign_mask = _mm_srai_epi32(a, 31); + return _mm_sub_epi32(_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(sign_mask))), sign_mask); +} +#else +CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a) +{ + return _mm_abs_epi32(a); +} +#endif + +#if CPPSPMD_SSE2 +CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b) +{ + __m128i tmp1 = _mm_mul_epu32(a, b); + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); +} +#else +CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b) +{ + return _mm_mullo_epi32(a, b); +} +#endif + +CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b) +{ + __m128i tmp1 = _mm_mul_epu32(a, b); + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 3, 1)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 3, 1))); +} + +#if CPPSPMD_SSE2 +inline __m128i load_rgba32(const void* p) +{ + __m128i xmm = _mm_cvtsi32_si128(*(const int*)p); + xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128()); + xmm = _mm_unpacklo_epi16(xmm, _mm_setzero_si128()); + return xmm; +} +#else +inline __m128i load_rgba32(const void* p) +{ + return _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((const 
float*)p))); +} +#endif + +inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3) +{ + __m128i t0 = _mm_unpacklo_epi32(r0, r1); + __m128i t1 = _mm_unpacklo_epi32(r2, r3); + __m128i t2 = _mm_unpackhi_epi32(r0, r1); + __m128i t3 = _mm_unpackhi_epi32(r2, r3); + x = _mm_unpacklo_epi64(t0, t1); + y = _mm_unpackhi_epi64(t0, t1); + z = _mm_unpacklo_epi64(t2, t3); + w = _mm_unpackhi_epi64(t2, t3); +} + +const uint32_t ALL_ON_MOVEMASK = 0xF; + +struct spmd_kernel +{ + struct vint; + struct lint; + struct vbool; + struct vfloat; + + typedef int int_t; + typedef vint vint_t; + typedef lint lint_t; + + // Exec mask + struct exec_mask + { + __m128i m_mask; + + exec_mask() = default; + + CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b); + CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { } + + CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane) { m_mask = _mm_load_si128((const __m128i *)&g_lane_masks_128[lane][0]); } + + static CPPSPMD_FORCE_INLINE exec_mask all_on() { return exec_mask{ _mm_load_si128((const __m128i*)g_allones_128) }; } + static CPPSPMD_FORCE_INLINE exec_mask all_off() { return exec_mask{ _mm_setzero_si128() }; } + + CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(m_mask)); } + }; + + friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e); + friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e); + + CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); } + CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); } + CPPSPMD_FORCE_INLINE bool spmd_none() { return !any(m_exec); } + + // true if cond is true for all active lanes - false if no active lanes + CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); } + // true if cond is true for any active lanes + CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) { return (exec_mask(e) & m_exec).get_movemask() != 0; } + CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) { return !spmd_any(e); } + + friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b); + friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b); + friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b); + + exec_mask m_exec; + exec_mask m_kernel_exec; + exec_mask m_continue_mask; +#ifdef _DEBUG + bool m_in_loop; +#endif + + CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); } + + void init(const exec_mask& kernel_exec); + + // Varying bool + + struct vbool + { + __m128i m_value; + + vbool() = default; + + CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(_mm_set1_epi32(value ? 
UINT32_MAX : 0)) { } + + CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { } + + CPPSPMD_FORCE_INLINE explicit operator vfloat() const; + CPPSPMD_FORCE_INLINE explicit operator vint() const; + + private: + vbool& operator=(const vbool&); + }; + + friend vbool operator!(const vbool& v); + + CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src) + { + dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask); + return dst; + } + + CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src) + { + dst.m_value = src.m_value; + return dst; + } + + // Varying float + struct vfloat + { + __m128 m_value; + + vfloat() = default; + + CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { } + + CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { } + + CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { } + + private: + vfloat& operator=(const vfloat&); + }; + + CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src) + { + dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask)); + return dst; + } + + CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src) + { + dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask)); + return dst; + } + + CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src) + { + dst.m_value = src.m_value; + return dst; + } + + CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src) + { + dst.m_value = src.m_value; + return dst; + } + + // Linear ref to floats + struct float_lref + { + float* m_pValue; + + private: + float_lref& operator=(const float_lref&); + }; + + CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + if (mask == ALL_ON_MOVEMASK) + _mm_storeu_ps(dst.m_pValue, src.m_value); + else + _mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask))); + return dst; + } + + CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + if (mask == ALL_ON_MOVEMASK) + _mm_storeu_ps(dst.m_pValue, src.m_value); + else + _mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask))); + return dst; + } + + CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src) + { + _mm_storeu_ps(dst.m_pValue, src.m_value); + return dst; + } + + CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src) + { + _mm_storeu_ps(dst.m_pValue, src.m_value); + return dst; + } + + CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src) + { + return vfloat{ _mm_and_ps(_mm_loadu_ps(src.m_pValue), _mm_castsi128_ps(m_exec.m_mask)) }; + } + + // Varying ref to floats + struct float_vref + { + __m128i m_vindex; + float* m_pValue; + + private: + float_vref& operator=(const float_vref&); + }; + + // Varying ref to varying float + struct vfloat_vref + { + __m128i m_vindex; + vfloat* m_pValue; + + private: + vfloat_vref& operator=(const vfloat_vref&); + }; + + // Varying ref to varying int + struct vint_vref + { + __m128i m_vindex; + vint* m_pValue; + + private: + vint_vref& operator=(const vint_vref&); + }; + + CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src); + 
CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src); + + CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src); + CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src); + + CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src) + { + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i *)vindex, src.m_vindex); + + CPPSPMD_ALIGN(16) float loaded[4]; + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + loaded[i] = src.m_pValue[vindex[i]]; + } + return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) }; + } + + CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src) + { + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i *)vindex, src.m_vindex); + + CPPSPMD_ALIGN(16) float loaded[4]; + + for (int i = 0; i < 4; i++) + loaded[i] = src.m_pValue[vindex[i]]; + return vfloat{ _mm_load_ps((const float*)loaded) }; + } + + // Linear ref to ints + struct int_lref + { + int* m_pValue; + + private: + int_lref& operator=(const int_lref&); + }; + + CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + if (mask == ALL_ON_MOVEMASK) + { + _mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value); + } + else + { + CPPSPMD_ALIGN(16) int stored[4]; + _mm_store_si128((__m128i *)stored, src.m_value); + + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + dst.m_pValue[i] = stored[i]; + } + } + return dst; + } + + CPPSPMD_FORCE_INLINE vint load(const int_lref& src) + { + __m128i v = _mm_loadu_si128((const __m128i*)src.m_pValue); + + v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask))); + + return vint{ v }; + } + + // Linear ref to int16's + struct int16_lref + { + int16_t* m_pValue; + + private: + int16_lref& operator=(const int16_lref&); + }; + + CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src) + { + CPPSPMD_ALIGN(16) int stored[4]; + _mm_store_si128((__m128i *)stored, src.m_value); + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + dst.m_pValue[i] = static_cast<int16_t>(stored[i]); + } + return dst; + } + + CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src) + { + CPPSPMD_ALIGN(16) int stored[4]; + _mm_store_si128((__m128i *)stored, src.m_value); + + for (int i = 0; i < 4; i++) + dst.m_pValue[i] = static_cast<int16_t>(stored[i]); + return dst; + } + + CPPSPMD_FORCE_INLINE vint load(const int16_lref& src) + { + CPPSPMD_ALIGN(16) int values[4]; + + for (int i = 0; i < 4; i++) + values[i] = static_cast<int16_t>(src.m_pValue[i]); + + __m128i t = _mm_load_si128( (const __m128i *)values ); + + return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps( t ), _mm_castsi128_ps(m_exec.m_mask))) }; + } + + CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src) + { + CPPSPMD_ALIGN(16) int values[4]; + + for (int i = 0; i < 4; i++) + values[i] = static_cast<int16_t>(src.m_pValue[i]); + + __m128i t = _mm_load_si128( (const __m128i *)values ); + + return vint{ t }; + } + + // Linear ref to constant ints + struct cint_lref + { + const int* m_pValue; + + private: + cint_lref& operator=(const cint_lref&); + }; + + CPPSPMD_FORCE_INLINE vint load(const cint_lref& src) + { + __m128i v = _mm_loadu_si128((const __m128i 
*)src.m_pValue); + v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask))); + return vint{ v }; + } + + CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src) + { + return vint{ _mm_loadu_si128((const __m128i *)src.m_pValue) }; + } + + // Varying ref to ints + struct int_vref + { + __m128i m_vindex; + int* m_pValue; + + private: + int_vref& operator=(const int_vref&); + }; + + // Varying ref to constant ints + struct cint_vref + { + __m128i m_vindex; + const int* m_pValue; + + private: + cint_vref& operator=(const cint_vref&); + }; + + // Varying int + struct vint + { + __m128i m_value; + + vint() = default; + + CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value) { } + + CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { } + + CPPSPMD_FORCE_INLINE vint& operator=(const lint& other) { m_value = other.m_value; return *this; } + + CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { } + + CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32((int)value)) { } + + CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { } + + CPPSPMD_FORCE_INLINE explicit operator vbool() const + { + return vbool{ _mm_xor_si128( _mm_load_si128((const __m128i*)g_allones_128), _mm_cmpeq_epi32(m_value, _mm_setzero_si128())) }; + } + + CPPSPMD_FORCE_INLINE explicit operator vfloat() const + { + return vfloat{ _mm_cvtepi32_ps(m_value) }; + } + + CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const + { + return int_vref{ m_value, ptr }; + } + + CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const + { + return cint_vref{ m_value, ptr }; + } + + CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const + { + return float_vref{ m_value, ptr }; + } + + CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const + { + return vfloat_vref{ m_value, ptr }; + } + + CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const + { + return vint_vref{ m_value, ptr }; + } + + private: + vint& operator=(const vint&); + }; + + // Load/store linear int + CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + if (mask == ALL_ON_MOVEMASK) + _mm_storeu_si128((__m128i *)pDst, src.m_value); + else + { + if (mask & 1) pDst[0] = extract_x(src.m_value); + if (mask & 2) pDst[1] = extract_y(src.m_value); + if (mask & 4) pDst[2] = extract_z(src.m_value); + if (mask & 8) pDst[3] = extract_w(src.m_value); + } + } + + CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src) + { + _mm_storeu_si128((__m128i*)pDst, src.m_value); + } + + CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src) + { + _mm_store_si128((__m128i*)pDst, src.m_value); + } + + CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc) + { + __m128i v = _mm_loadu_si128((const __m128i*)pSrc); + + v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask))); + + return vint{ v }; + } + + CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc) + { + return vint{ _mm_loadu_si128((__m128i*)pSrc) }; + } + + CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc) + { + return vint{ _mm_load_si128((__m128i*)pSrc) }; + } + + // Load/store linear float + CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + if (mask == ALL_ON_MOVEMASK) + _mm_storeu_ps((float*)pDst, src.m_value); 
+ else + { + int *pDstI = (int *)pDst; + if (mask & 1) pDstI[0] = extract_ps_x(src.m_value); + if (mask & 2) pDstI[1] = extract_ps_y(src.m_value); + if (mask & 4) pDstI[2] = extract_ps_z(src.m_value); + if (mask & 8) pDstI[3] = extract_ps_w(src.m_value); + } + } + + CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src) + { + _mm_storeu_ps((float*)pDst, src.m_value); + } + + CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src) + { + _mm_store_ps((float*)pDst, src.m_value); + } + + CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc) + { + __m128 v = _mm_loadu_ps((const float*)pSrc); + + v = _mm_and_ps(v, _mm_castsi128_ps(m_exec.m_mask)); + + return vfloat{ v }; + } + + CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc) + { + return vfloat{ _mm_loadu_ps((float*)pSrc) }; + } + + CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc) + { + return vfloat{ _mm_load_ps((float*)pSrc) }; + } + + CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src) + { + dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask); + return dst; + } + + CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src) + { + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i*)vindex, dst.m_vindex); + + CPPSPMD_ALIGN(16) int stored[4]; + _mm_store_si128((__m128i*)stored, src.m_value); + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + dst.m_pValue[vindex[i]] = stored[i]; + } + return dst; + } + + CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src) + { + dst.m_value = src.m_value; + return dst; + } + + CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src) + { + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i*)vindex, dst.m_vindex); + + CPPSPMD_ALIGN(16) int stored[4]; + _mm_store_si128((__m128i*)stored, src.m_value); + + for (int i = 0; i < 4; i++) + dst.m_pValue[vindex[i]] = stored[i]; + + return dst; + } + + CPPSPMD_FORCE_INLINE vint load(const int_vref& src) + { + CPPSPMD_ALIGN(16) int values[4]; + + CPPSPMD_ALIGN(16) int indices[4]; + _mm_store_si128((__m128i *)indices, src.m_vindex); + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + values[i] = src.m_pValue[indices[i]]; + } + + return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) }; + } + + CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src) + { + CPPSPMD_ALIGN(16) int values[4]; + + CPPSPMD_ALIGN(16) int indices[4]; + _mm_store_si128((__m128i *)indices, src.m_vindex); + + for (int i = 0; i < 4; i++) + values[i] = src.m_pValue[indices[i]]; + + return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) }; + } + + CPPSPMD_FORCE_INLINE vint load(const cint_vref& src) + { + CPPSPMD_ALIGN(16) int values[4]; + + CPPSPMD_ALIGN(16) int indices[4]; + _mm_store_si128((__m128i *)indices, src.m_vindex); + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + values[i] = src.m_pValue[indices[i]]; + } + + return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) }; + } + + CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src) + { + CPPSPMD_ALIGN(16) int values[4]; + + CPPSPMD_ALIGN(16) int indices[4]; + _mm_store_si128((__m128i *)indices, src.m_vindex); + + for (int i = 0; i < 
4; i++) + values[i] = src.m_pValue[indices[i]]; + + return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) }; + } + + CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src) + { + __m128i v0_l; + + const uint8_t* pSrc = (const uint8_t*)src.m_pValue; + v0_l = insert_x(_mm_undefined_si128(), ((int*)(pSrc + extract_x(src.m_vindex)))[0]); + v0_l = insert_y(v0_l, ((int*)(pSrc + extract_y(src.m_vindex)))[0]); + v0_l = insert_z(v0_l, ((int*)(pSrc + extract_z(src.m_vindex)))[0]); + v0_l = insert_w(v0_l, ((int*)(pSrc + extract_w(src.m_vindex)))[0]); + + return vint{ v0_l }; + } + + CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src) + { + __m128i v0_l; + + const uint8_t* pSrc = (const uint8_t*)src.m_pValue; + v0_l = insert_x(_mm_undefined_si128(), ((int16_t*)(pSrc + 2 * extract_x(src.m_vindex)))[0]); + v0_l = insert_y(v0_l, ((int16_t*)(pSrc + 2 * extract_y(src.m_vindex)))[0]); + v0_l = insert_z(v0_l, ((int16_t*)(pSrc + 2 * extract_z(src.m_vindex)))[0]); + v0_l = insert_w(v0_l, ((int16_t*)(pSrc + 2 * extract_w(src.m_vindex)))[0]); + + return vint{ v0_l }; + } + + CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + + if (mask & 1) pDst[0] = extract_x(v.m_value); + if (mask & 2) pDst[stride] = extract_y(v.m_value); + if (mask & 4) pDst[stride*2] = extract_z(v.m_value); + if (mask & 8) pDst[stride*3] = extract_w(v.m_value); + } + + CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + + if (mask & 1) ((int *)pDstF)[0] = extract_ps_x(v.m_value); + if (mask & 2) ((int *)pDstF)[stride] = extract_ps_y(v.m_value); + if (mask & 4) ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value); + if (mask & 8) ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value); + } + + CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v) + { + pDst[0] = extract_x(v.m_value); + pDst[stride] = extract_y(v.m_value); + pDst[stride*2] = extract_z(v.m_value); + pDst[stride*3] = extract_w(v.m_value); + } + + CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v) + { + ((int *)pDstF)[0] = extract_ps_x(v.m_value); + ((int *)pDstF)[stride] = extract_ps_y(v.m_value); + ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value); + ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value); + } + + CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + +#if CPPSPMD_SSE2 + CPPSPMD_ALIGN(16) int vals[4] = { 0, 0, 0, 0 }; + if (mask & 1) vals[0] = pSrc[0]; + if (mask & 2) vals[1] = pSrc[stride]; + if (mask & 4) vals[2] = pSrc[stride * 2]; + if (mask & 8) vals[3] = pSrc[stride * 3]; + return vint{ _mm_load_si128((__m128i*)vals) }; +#else + const float* pSrcF = (const float*)pSrc; + __m128 v = _mm_setzero_ps(); + if (mask & 1) v = _mm_load_ss(pSrcF); + if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10); + if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20); + if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30); + return vint{ _mm_castps_si128(v) }; +#endif + } + + CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride) + { + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + +#if CPPSPMD_SSE2 + CPPSPMD_ALIGN(16) float vals[4] = { 0, 0, 0, 0 }; + if (mask & 1) vals[0] = pSrc[0]; + if (mask & 2) 
vals[1] = pSrc[stride]; + if (mask & 4) vals[2] = pSrc[stride * 2]; + if (mask & 8) vals[3] = pSrc[stride * 3]; + return vfloat{ _mm_load_ps(vals) }; +#else + __m128 v = _mm_setzero_ps(); + if (mask & 1) v = _mm_load_ss(pSrc); + if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10); + if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20); + if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30); + return vfloat{ v }; +#endif + } + + CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride) + { +#if CPPSPMD_SSE2 + CPPSPMD_ALIGN(16) int vals[4]; + vals[0] = pSrc[0]; + vals[1] = pSrc[stride]; + vals[2] = pSrc[stride * 2]; + vals[3] = pSrc[stride * 3]; + return vint{ _mm_load_si128((__m128i*)vals) }; +#else + const float* pSrcF = (const float*)pSrc; + __m128 v = _mm_load_ss(pSrcF); + v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10); + v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20); + v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30); + return vint{ _mm_castps_si128(v) }; +#endif + } + + CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride) + { +#if CPPSPMD_SSE2 + CPPSPMD_ALIGN(16) float vals[4]; + vals[0] = pSrc[0]; + vals[1] = pSrc[stride]; + vals[2] = pSrc[stride * 2]; + vals[3] = pSrc[stride * 3]; + return vfloat{ _mm_load_ps(vals) }; +#else + __m128 v = _mm_load_ss(pSrc); + v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10); + v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20); + v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30); + return vfloat{ v }; +#endif + } + + CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src) + { + // TODO: There's surely a better way + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + + if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(_mm_castps_si128(src.m_value)); + if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(_mm_castps_si128(src.m_value)); + if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(_mm_castps_si128(src.m_value)); + if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(_mm_castps_si128(src.m_value)); + + return dst; + } + + CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src) + { + // TODO: There's surely a better way + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + + __m128i k = _mm_setzero_si128(); + + if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]); + if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]); + if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]); + if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]); + + return vfloat{ _mm_castsi128_ps(k) }; + } + + CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src) + { + // TODO: There's surely a better way + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + + if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value); + if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value); + if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value); + if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value); + + return dst; + } + + CPPSPMD_FORCE_INLINE vint load(const 
vint_vref& src) + { + // TODO: There's surely a better way + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + + __m128i k = _mm_setzero_si128(); + + if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]); + if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]); + if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]); + if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]); + + return vint{ k }; + } + + CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src) + { + // TODO: There's surely a better way + __m128i k; + + k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]); + k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]); + k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]); + k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]); + + return vint{ k }; + } + + // Linear integer + struct lint + { + __m128i m_value; + + CPPSPMD_FORCE_INLINE explicit lint(__m128i value) + : m_value(value) + { } + + CPPSPMD_FORCE_INLINE explicit operator vfloat() const + { + return vfloat{ _mm_cvtepi32_ps(m_value) }; + } + + CPPSPMD_FORCE_INLINE explicit operator vint() const + { + return vint{ m_value }; + } + + CPPSPMD_FORCE_INLINE int get_first_value() const + { + return _mm_cvtsi128_si32(m_value); + } + + CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const + { + return float_lref{ ptr + get_first_value() }; + } + + CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const + { + return int_lref{ ptr + get_first_value() }; + } + + CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const + { + return int16_lref{ ptr + get_first_value() }; + } + + CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const + { + return cint_lref{ ptr + get_first_value() }; + } + + private: + lint& operator=(const lint&); + }; + + CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src) + { + dst.m_value = src.m_value; + return dst; + } + + const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) }; + + // SPMD condition helpers + + template<typename IfBody> + CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody); + + CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond); + + // No breaks, continues, etc. allowed + template<typename IfBody> + CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody); + + // No breaks, continues, etc. 
allowed + template<typename IfBody, typename ElseBody> + CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody); + + template<typename IfBody, typename ElseBody> + CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody); + + template<typename WhileCondBody, typename WhileBody> + CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody); + + template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody> + CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody); + + template<typename ForeachBody> + CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody); + +#ifdef _DEBUG + CPPSPMD_FORCE_INLINE void check_masks(); +#else + CPPSPMD_FORCE_INLINE void check_masks() { } +#endif + + CPPSPMD_FORCE_INLINE void spmd_break(); + CPPSPMD_FORCE_INLINE void spmd_continue(); + + CPPSPMD_FORCE_INLINE void spmd_return(); + + template<typename UnmaskedBody> + CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody); + + template<typename SPMDKernel, typename... Args> + //CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args); + CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args); + + CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b) { vint temp = a; store(a, b); store(b, temp); } + CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b) { vfloat temp = a; store(a, b); store(b, temp); } + CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b) { vbool temp = a; store(a, b); store(b, temp); } + + CPPSPMD_FORCE_INLINE float reduce_add(vfloat v) + { + __m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask)); + +//#if CPPSPMD_SSE2 +#if 1 + // See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026 + __m128 shuf = _mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(2, 3, 0, 1)); + __m128 sums = _mm_add_ps(k3210, shuf); + shuf = _mm_movehl_ps(shuf, sums); + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +#else + // This is pretty slow. 
+ __m128 a = _mm_hadd_ps(k3210, k3210); + __m128 b = _mm_hadd_ps(a, a); + return extractf_ps_x(b); +#endif + } + + CPPSPMD_FORCE_INLINE int reduce_add(vint v) + { + __m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask); + + // See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026 + __m128i shuf = _mm_shuffle_epi32(k3210, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sums = _mm_add_epi32(k3210, shuf); + shuf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(shuf), _mm_castsi128_ps(sums))); + sums = _mm_add_epi32(sums, shuf); + return extract_x(sums); + } + + #include "cppspmd_math_declares.h" + +}; // struct spmd_kernel + +using exec_mask = spmd_kernel::exec_mask; +using vint = spmd_kernel::vint; +using int_lref = spmd_kernel::int_lref; +using cint_vref = spmd_kernel::cint_vref; +using cint_lref = spmd_kernel::cint_lref; +using int_vref = spmd_kernel::int_vref; +using lint = spmd_kernel::lint; +using vbool = spmd_kernel::vbool; +using vfloat = spmd_kernel::vfloat; +using float_lref = spmd_kernel::float_lref; +using float_vref = spmd_kernel::float_vref; +using vfloat_vref = spmd_kernel::vfloat_vref; +using vint_vref = spmd_kernel::vint_vref; + +CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const +{ + return vfloat { _mm_and_ps( _mm_castsi128_ps(m_value), *(const __m128 *)g_onef_128 ) }; +} + +// Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?) +CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const +{ + return vint { m_value }; +} + +CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v) +{ + return vbool{ _mm_castps_si128(_mm_xor_ps(_mm_load_ps((const float*)g_allones_128), _mm_castsi128_ps(v.m_value))) }; +} + +CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; } + +CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; } +CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; } +CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; } + +CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; } +CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; } + +// Bad pattern - doesn't factor in the current exec mask. Prefer spmd_any() instead. 
+CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; } +CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; } + +CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; } +CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; } + +CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) { return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; } +CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; } +CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; } +CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; } + +CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; } +CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; } + +CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) { return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; } +CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; } +CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; } + +CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; } +CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); } + +CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; } +CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); } + +CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; } +CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); } + +CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ 
_mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; } +CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); } + +CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; } +CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); } + +CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; } +CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); } + +CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; } +CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; } + +CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) { return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; } + +#if CPPSPMD_SSE2 +CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a) +{ + __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) ); + __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f))); + + __m128i ai = _mm_cvttps_epi32(a.m_value); + + __m128 af = _mm_cvtepi32_ps(ai); + return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) }; +} + +CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a) +{ + __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU)); + __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f))); + + __m128i ai = _mm_cvtps_epi32(a.m_value); + __m128 af = _mm_cvtepi32_ps(ai); + __m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(af, a.m_value))); + + af = _mm_add_ps(af, changed); + + return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) }; +} + +CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) +{ + __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU)); + __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f))); + + __m128i ai = _mm_cvtps_epi32(a.m_value); + __m128 af = _mm_cvtepi32_ps(ai); + __m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(af, a.m_value))); + + af = _mm_sub_ps(af, changed); + + return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) }; +} + +// We need to disable unsafe math optimizations for the key operations used for rounding to nearest. +// I wish there was a better way. 
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optimize("-fno-unsafe-math-optimizations"))) +#elif defined(__clang__) +inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optnone)) +#elif defined (_MSC_VER) +#pragma float_control(push) +#pragma float_control(precise, on) +inline __m128 add_sub(__m128 a, __m128 b) +#else +inline __m128 add_sub(__m128 a, __m128 b) +#endif +{ + return _mm_sub_ps(_mm_add_ps(a, b), b); +} + +#if defined (_MSC_VER) +#pragma float_control(pop) +#endif + +CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a) +{ + __m128i no_fract_fp_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f)); + + __m128i sign_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U)); + __m128 force_int = _mm_castsi128_ps(_mm_or_si128(no_fract_fp_bits, sign_a)); + + // Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers. + //__m128 temp1 = _mm_add_ps(a.m_value, force_int); + //__m128 temp2 = _mm_sub_ps(temp1, force_int); + __m128 temp2 = add_sub(a.m_value, force_int); + + __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU)); + __m128i has_fractional = _mm_cmplt_epi32(abs_a, no_fract_fp_bits); + return vfloat{ blendv_mask_ps(a.m_value, temp2, _mm_castsi128_ps(has_fractional)) }; +} + +#else +CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; } +CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; } +#endif + +CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a) { return a - floor(a); } +CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b) { vfloat c = frac(abs(a / b)) * abs(b); return spmd_ternaryf(a < 0, -c, c); } +CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, 1.0f, 1.0f); } + +CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) { return vint{ min_epi32(a.m_value, b.m_value) }; } + +CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; } + +CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; } + +CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) { return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)) }; } + +CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; } +CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; } + +CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b) +{ + return vfloat{ _mm_min_ps(b.m_value, _mm_max_ps(v.m_value, a.m_value) ) }; +} + +CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b) +{ + return vint{ min_epi32(b.m_value, max_epi32(v.m_value, a.m_value) ) }; +} + +CPPSPMD_FORCE_INLINE vfloat vfma(const 
vfloat& a, const vfloat& b, const vfloat& c) +{ + return vfloat{ _mm_add_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) }; +} + +CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c) +{ + return vfloat{ _mm_sub_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) }; +} + +CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c) +{ + return vfloat{ _mm_sub_ps(c.m_value, _mm_mul_ps(a.m_value, b.m_value)) }; +} + +CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c) +{ + return vfloat{ _mm_sub_ps(_mm_sub_ps(_mm_xor_ps(a.m_value, a.m_value), _mm_mul_ps(a.m_value, b.m_value)), c.m_value) }; +} + +CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); } + +CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; } +CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; } +CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); } +CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); } + +CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); } +CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); } +CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); } +CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); } +CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); } +CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; } +CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; } +CPPSPMD_FORCE_INLINE vint 
operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); } +CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; } + +CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; } + +CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; } + +CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; } + +// A few of these break the lane-based abstraction model. They are supported in SSE2, so it makes sense to support them and let the user figure it out. +CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) { return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; } + +CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint{ _mm_add_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint{ _mm_adds_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint{ _mm_sub_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint{ _mm_subs_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint{ _mm_unpacklo_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint{ _mm_unpackhi_epi8(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { return _mm_movemask_epi8(a.m_value); } +CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { return _mm_movemask_ps(_mm_castsi128_ps(a.m_value)); } + +CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(_mm_min_epu8(a.m_value, b.m_value), a.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return vint{ cmple_epu8(b, a) }; } +CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(_mm_cmpeq_epi8(a.m_value, b.m_value), _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value)) }; } +CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return vint{ cmpgt_epu8(b, a) }; } +CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b) { return vint{ _mm_or_si128(_mm_subs_epu8(a.m_value, b.m_value), _mm_subs_epu8(b.m_value, a.m_value)) }; } + 
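The unsigned byte helpers above lean on two SSE2 identities: `a <= b` holds exactly when `min_epu8(a, b) == a`, and `|a - b|` equals `subs_epu8(a, b) | subs_epu8(b, a)` because one of the two saturating differences is always zero. A small standalone cross-check of those identities (not part of the upstream diff; the test values are arbitrary):

```
#include <emmintrin.h>
#include <cassert>
#include <cstdint>

int main()
{
    const uint8_t a_bytes[16] = { 0, 1, 127, 128, 200, 255, 3,  77, 10, 20, 30, 40, 50, 60, 70, 80 };
    const uint8_t b_bytes[16] = { 0, 2, 128, 127, 100,   0, 3, 200, 15,  5, 33, 44, 49, 66, 70, 90 };

    const __m128i a = _mm_loadu_si128((const __m128i*)a_bytes);
    const __m128i b = _mm_loadu_si128((const __m128i*)b_bytes);

    // cmple_epu8: a <= b  <=>  min_epu8(a, b) == a
    const __m128i le = _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
    // absdiff_epu8: one saturating difference is always zero, OR keeps the other
    const __m128i ad = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));

    uint8_t le_bytes[16], ad_bytes[16];
    _mm_storeu_si128((__m128i*)le_bytes, le);
    _mm_storeu_si128((__m128i*)ad_bytes, ad);

    for (int i = 0; i < 16; i++)
    {
        assert((le_bytes[i] == 0xFF) == (a_bytes[i] <= b_bytes[i]));
        const int ref = a_bytes[i] > b_bytes[i] ? a_bytes[i] - b_bytes[i] : b_bytes[i] - a_bytes[i];
        assert(ad_bytes[i] == ref);
    }
    return 0;
}
```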
+CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi8(a.m_value, b.m_value, _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128())) }; } +CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi32(a.m_value, b.m_value, mask.m_value) }; } + +CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint{ _mm_add_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint{ _mm_adds_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint{ _mm_adds_epu16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint{ _mm_avg_epu16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint{ _mm_sub_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint{ _mm_subs_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint{ _mm_subs_epu16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint{ _mm_mullo_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epu16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint{ _mm_min_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint{ _mm_max_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint{ _mm_madd_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint{ _mm_packs_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint{ _mm_packus_epi16(a.m_value, b.m_value) }; } + +CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint{ _mm_sll_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_sra_epi16(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_srl_epi16(a.m_value, b.m_value) }; } + +#define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, b)) +#define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, b)) +#define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, b)) + +CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; } +CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; } + +// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane. 
+#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control)) + +// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane. +#define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control)) +#define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control)) + +#define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6)) +#define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6)) + +#define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l)) +#define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l)) + +// Unpack and interleave 8-bit integers from the low or high half of a and b +CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); } +CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); } + +// Unpack and interleave 16-bit integers from the low or high half of a and b +CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi16(a.m_value, b.m_value)); } +CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi16(a.m_value, b.m_value)); } + +// Unpack and interleave 32-bit integers from the low or high half of a and b +CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi32(a.m_value, b.m_value)); } +CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi32(a.m_value, b.m_value)); } + +// Unpack and interleave 64-bit integers from the low or high half of a and b +CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi64(a.m_value, b.m_value)); } +CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi64(a.m_value, b.m_value)); } + +CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { return vint(_mm_set1_epi8(a)); } +CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { return vint(_mm_set1_epi16(a)); } +CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { return vint(_mm_set1_epi32(a)); } +CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { return vint(_mm_set1_epi64x(a)); } + +CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { return vint(_mm_mul_epu32(a.m_value, b.m_value)); } + +CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b) +{ + __m128d al = _mm_cvtepi32_pd(a.m_value); + __m128d ah = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value)); + + __m128d bl = _mm_cvtepi32_pd(b.m_value); + __m128d bh = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value)); + + __m128d rl = _mm_div_pd(al, bl); + __m128d rh = _mm_div_pd(ah, bh); + + __m128i rli = _mm_cvttpd_epi32(rl); + __m128i rhi = _mm_cvttpd_epi32(rh); + + return vint(_mm_unpacklo_epi64(rli, rhi)); +} + +CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b) +{ + vint aa = abs(a), ab = abs(b); + vint q = div_epi32(aa, ab); + vint r = aa - q * ab; + return spmd_ternaryi(a < 0, -r, r); +} + +CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b) +{ + return div_epi32(a, b); +} + +CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, 
int b) +{ + return div_epi32(a, vint(b)); +} + +CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b) +{ + return mod_epi32(a, b); +} + +CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b) +{ + return mod_epi32(a, vint(b)); +} + +CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b) +{ +#if 0 + CPPSPMD_ALIGN(32) int result[4]; + result[0] = extract_x(a.m_value) << extract_x(b.m_value); + result[1] = extract_y(a.m_value) << extract_y(b.m_value); + result[2] = extract_z(a.m_value) << extract_z(b.m_value); + result[3] = extract_w(a.m_value) << extract_w(b.m_value); + + return vint{ _mm_load_si128((__m128i*)result) }; +#elif 0 + int x = extract_x(a.m_value) << extract_x(b.m_value); + int y = extract_y(a.m_value) << extract_y(b.m_value); + int z = extract_z(a.m_value) << extract_z(b.m_value); + int w = extract_w(a.m_value) << extract_w(b.m_value); + + __m128i v = insert_x(_mm_undefined_si128(), x); + v = insert_y(v, y); + v = insert_z(v, z); + return vint{ insert_w(v, w) }; +#else + // What this does: shift left each b lane by 23 bits (to move the shift amount into the FP exponent position), then epi32 add to the integer rep of 1.0f, then cast that to float, then convert that to int to get fast 2^x. + return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)))); +#endif +} + +// uniform shift left +CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b) +{ + __m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128)))); + return vint{ _mm_sll_epi32(a.m_value, bv) }; +} + +// uniform arithmetic shift right +CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b) +{ + __m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128)))); + return vint{ _mm_sra_epi32(a.m_value, bv) }; +} + +// uniform shift right +CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b) +{ + __m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128)))); + return vint{ _mm_srl_epi32(a.m_value, bv) }; +} + +CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b) +{ +#if 0 + CPPSPMD_ALIGN(32) int result[4]; + result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value); + result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value); + result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value); + result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value); + + return vint{ _mm_load_si128((__m128i*)result) }; +#elif 0 + uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value)); + uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value)); + uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value)); + uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value)); + + __m128i v = insert_x(_mm_undefined_si128(), x); + v = insert_y(v, y); + v = insert_z(v, z); + return vint{ insert_w(v, w) }; +#else + //vint inv_shift = 32 - b; + //vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))); + + // Take float rep of 1.0f (0x3f800000), subtract (32<<23), subtract (shift<<23), cast to float. 
+ vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23)))); + + // Now convert scale factor to integer. + vint r = vint(f); + + // mulhi_epu32 (using two _mm_mul_epu32), to emulate varying shift left. + vint q(mulhi_epu32(a.m_value, r.m_value)); + + // Handle shift amounts of 0. + return spmd_ternaryi(b > 0, q, a); +#endif +} + +CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b) +{ + //vint inv_shift = 32 - b; + //vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))); + + // Take float rep of 1.0f (0x3f800000), subtract (32<<23), subtract (shift<<23), cast to float. + vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23)))); + + // Now convert scale factor to integer. + vint r = vint(f); + + // mulhi_epu32 (using two _mm_mul_epu32), to emulate varying shift left. + return vint(mulhi_epu32(a.m_value, r.m_value)); +} + +CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b) +{ +#if 0 + CPPSPMD_ALIGN(32) int result[4]; + result[0] = extract_x(a.m_value) >> extract_x(b.m_value); + result[1] = extract_y(a.m_value) >> extract_y(b.m_value); + result[2] = extract_z(a.m_value) >> extract_z(b.m_value); + result[3] = extract_w(a.m_value) >> extract_w(b.m_value); + + return vint{ _mm_load_si128((__m128i*)result) }; +#elif 0 + int x = extract_x(a.m_value) >> extract_x(b.m_value); + int y = extract_y(a.m_value) >> extract_y(b.m_value); + int z = extract_z(a.m_value) >> extract_z(b.m_value); + int w = extract_w(a.m_value) >> extract_w(b.m_value); + + __m128i v = insert_x(_mm_undefined_si128(), x); + v = insert_y(v, y); + v = insert_z(v, z); + return vint{ insert_w(v, w) }; +#else + vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128())); + vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask; + return a_shifted; +#endif +} + +#undef VINT_SHIFT_LEFT +#undef VINT_SHIFT_RIGHT +#undef VUINT_SHIFT_RIGHT + +// Shift left/right by a uniform immediate constant +#define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) ) +#define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) ) +#define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) ) +#define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k))) + +CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { return vint(a) == vint(b); } +CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { return vint(a) == vint(b); } +CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; } +CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; } + +CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) float values[4]; _mm_store_ps(values, v.m_value); return values[instance]; } +CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; 
_mm_store_si128((__m128i*)values, v.m_value); return values[instance]; } +CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; } +CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance] != 0; } + +#undef VINT_EXTRACT +#undef VBOOL_EXTRACT +#undef VFLOAT_EXTRACT + +#if CPPSPMD_SSE2 +// Pass in an immediate constant and the compiler will optimize these expressions. +#define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) ) +#define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) ) +#define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) ) +#else +CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return *(const float*)&v; } + +#define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance) +#define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance) +#define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance)) +#endif + +CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f) +{ + assert(instance < 4); + CPPSPMD_ALIGN(16) float values[4]; + _mm_store_ps(values, v.m_value); + values[instance] = f; + v.m_value = _mm_load_ps(values); + return v; +} + +CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i) +{ + assert(instance < 4); + CPPSPMD_ALIGN(16) int values[4]; + _mm_store_si128((__m128i *)values, v.m_value); + values[instance] = i; + v.m_value = _mm_load_si128((__m128i *)values); + return v; +} + +CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16]) +{ + __m128i l = _mm_loadu_si128((const __m128i*)pTab); + return vint{ l }; +} + +CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table) +{ + return vint{ shuffle_epi8(table.m_value, a.m_value) }; +} + +CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1) +{ + __m128i l = _mm_loadu_si128((const __m128i*)pTab); + __m128i h = _mm_loadu_si128((const __m128i*)(pTab + 16)); + table_0.m_value = l; + table_1.m_value = h; +} + +CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1) +{ + __m128i l_0 = shuffle_epi8(table_0.m_value, a.m_value); + __m128i h_0 = shuffle_epi8(table_1.m_value, a.m_value); + + __m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4); + + __m128 v_0 = blendv_ps(_mm_castsi128_ps(l_0), _mm_castsi128_ps(h_0), _mm_castsi128_ps(m_0)); + + return vint{ _mm_castps_si128(v_0) }; +} + +CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3) +{ + __m128i a = _mm_loadu_si128((const __m128i*)pTab); + __m128i b = _mm_loadu_si128((const __m128i*)(pTab + 16)); + __m128i c = _mm_loadu_si128((const __m128i*)(pTab + 32)); + __m128i d = _mm_loadu_si128((const __m128i*)(pTab + 48)); + + table_0.m_value = a; + table_1.m_value = b; + table_2.m_value = c; + table_3.m_value = d; +} + 
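For reference, the 5-bit lookup just defined works by running `pshufb` over both 16-entry halves with the low four index bits and then choosing a half per lane from index bit 4, moved into the sign-bit position for `blendv_ps`; `table_lookup6_8` below repeats that blend once more on bit 5. A standalone sketch of the same idea, assuming SSE4.1 is available (`lookup32_u8` is an illustrative name, not upstream API):

```
#include <tmmintrin.h>  // _mm_shuffle_epi8 (SSSE3)
#include <smmintrin.h>  // _mm_blendv_ps (SSE4.1)

// idx holds one 5-bit index (0..31) per 32-bit lane; the looked-up byte lands
// in the low byte of each result lane (the upper bytes are not meaningful),
// matching how table_lookup5_8 is consumed.
inline __m128i lookup32_u8(__m128i idx, __m128i table_lo, __m128i table_hi)
{
    const __m128i lo  = _mm_shuffle_epi8(table_lo, idx); // uses the low 4 bits of each index byte
    const __m128i hi  = _mm_shuffle_epi8(table_hi, idx);
    const __m128i sel = _mm_slli_epi32(idx, 31 - 4);     // index bit 4 -> per-lane sign bit

    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(lo),
                                          _mm_castsi128_ps(hi),
                                          _mm_castsi128_ps(sel)));
}
```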
+CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3) +{ + __m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4); + + __m128 av_0; + { + __m128i al_0 = shuffle_epi8(table_0.m_value, a.m_value); + __m128i ah_0 = shuffle_epi8(table_1.m_value, a.m_value); + av_0 = blendv_ps(_mm_castsi128_ps(al_0), _mm_castsi128_ps(ah_0), _mm_castsi128_ps(m_0)); + } + + __m128 bv_0; + { + __m128i bl_0 = shuffle_epi8(table_2.m_value, a.m_value); + __m128i bh_0 = shuffle_epi8(table_3.m_value, a.m_value); + bv_0 = blendv_ps(_mm_castsi128_ps(bl_0), _mm_castsi128_ps(bh_0), _mm_castsi128_ps(m_0)); + } + + __m128i m2_0 = _mm_slli_epi32(a.m_value, 31 - 5); + __m128 v2_0 = blendv_ps(av_0, bv_0, _mm_castsi128_ps(m2_0)); + + return vint{ _mm_castps_si128(v2_0) }; +} + +#if 0 +template<typename SPMDKernel, typename... Args> +CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args) +{ + SPMDKernel kernel; + kernel.init(exec_mask::all_on()); + return kernel._call(std::forward<Args>(args)...); +} +#else +template<typename SPMDKernel, typename... Args> +CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args) +{ + SPMDKernel kernel; + kernel.init(exec_mask::all_on()); + kernel._call(std::forward<Args>(args)...); +} +#endif + +CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec) +{ + m_exec = kernel_exec; + m_kernel_exec = kernel_exec; + m_continue_mask = exec_mask::all_off(); + +#ifdef _DEBUG + m_in_loop = false; +#endif +} + +CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src) +{ + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i*)vindex, dst.m_vindex); + + CPPSPMD_ALIGN(16) float stored[4]; + _mm_store_ps(stored, src.m_value); + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + dst.m_pValue[vindex[i]] = stored[i]; + } + return dst; +} + +CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src) +{ + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i*)vindex, dst.m_vindex); + + CPPSPMD_ALIGN(16) float stored[4]; + _mm_store_ps(stored, src.m_value); + + for (int i = 0; i < 4; i++) + dst.m_pValue[vindex[i]] = stored[i]; + return dst; +} + +CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src) +{ + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i*)vindex, dst.m_vindex); + + CPPSPMD_ALIGN(16) float stored[4]; + _mm_store_ps(stored, src.m_value); + + int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask)); + for (int i = 0; i < 4; i++) + { + if (mask & (1 << i)) + dst.m_pValue[vindex[i]] = stored[i]; + } + return dst; +} + +CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src) +{ + CPPSPMD_ALIGN(16) int vindex[4]; + _mm_store_si128((__m128i*)vindex, dst.m_vindex); + + CPPSPMD_ALIGN(16) float stored[4]; + _mm_store_ps(stored, src.m_value); + + for (int i = 0; i < 4; i++) + dst.m_pValue[vindex[i]] = stored[i]; + return dst; +} + +#include "cppspmd_flow.h" +#include "cppspmd_math.h" + +} // namespace cppspmd_sse41 + diff --git a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h new file mode 100644 index 0000000000..0dfb28b88f --- /dev/null +++ b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h @@ -0,0 +1,47 @@ +// cppspmd_type_aliases.h +// Do not 
include this file directly +// +// Copyright 2020-2021 Binomial LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef CPPSPMD_TYPES +#define CPPSPMD_TYPES + +using exec_mask = CPPSPMD::exec_mask; + +#if CPPSPMD_INT16 +using vint16 = CPPSPMD::vint16; +using int16_lref = CPPSPMD::int16_lref; +using cint16_vref = CPPSPMD::cint16_vref; +using int16_vref = CPPSPMD::int16_vref; +using lint16 = CPPSPMD::lint16; +using vint16_vref = CPPSPMD::vint16_vref; +#else +using vint = CPPSPMD::vint; +using int_lref = CPPSPMD::int_lref; +using cint_vref = CPPSPMD::cint_vref; +using int_vref = CPPSPMD::int_vref; +using lint = CPPSPMD::lint; +using vint_vref = CPPSPMD::vint_vref; +#endif + +using vbool = CPPSPMD::vbool; +using vfloat = CPPSPMD::vfloat; +using float_lref = CPPSPMD::float_lref; +using float_vref = CPPSPMD::float_vref; +using vfloat_vref = CPPSPMD::vfloat_vref; + +#endif // CPPSPMD_TYPES diff --git a/thirdparty/basis_universal/encoder/jpgd.cpp b/thirdparty/basis_universal/encoder/jpgd.cpp new file mode 100644 index 0000000000..460834409d --- /dev/null +++ b/thirdparty/basis_universal/encoder/jpgd.cpp @@ -0,0 +1,3241 @@ +// jpgd.cpp - C++ class for JPEG decompression. Written by Richard Geldreich <richgel99@gmail.com> between 1994-2020. +// Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2. +// Supports box and linear chroma upsampling. +// +// Released under two licenses. You are free to choose which license you want: +// License 1: +// Public Domain +// +// License 2: +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Alex Evans: Linear memory allocator (taken from jpge.h). +// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings +// v2.00, March 20, 2020: Fuzzed with zzuf and afl. Fixed several issues, converted most assert()'s to run-time checks. Added chroma upsampling. Removed freq. domain upsampling. gcc/clang warnings. 
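The scalar IDCT further down in this file works in 13-bit fixed point: each real constant `c` is stored as `FIX(c) = round(c * 2^CONST_BITS)` with `CONST_BITS = 13`, and `DESCALE(x, n)` rounds the scaled result back down. A quick sanity check of that encoding against the `FIX_*` constants defined below (illustrative only, not part of upstream):

```
#include <cassert>
#include <cmath>

// CONST_BITS = 13: a real constant c is stored as round(c * 2^13).
static long fix13(double c) { return std::lround(c * (1L << 13)); }

int main()
{
	assert(fix13(0.541196100) == 4433);  // FIX_0_541196100
	assert(fix13(1.175875602) == 9633);  // FIX_1_175875602
	assert(fix13(1.847759065) == 15137); // FIX_1_847759065
	// DESCALE(x, n) then recovers the integer result as (x + (1 << (n - 1))) >> n.
	return 0;
}
```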
+// +#ifdef _MSC_VER +#ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL +#if defined(_DEBUG) || defined(DEBUG) +#define _ITERATOR_DEBUG_LEVEL 1 +#define _SECURE_SCL 1 +#else +#define _SECURE_SCL 0 +#define _ITERATOR_DEBUG_LEVEL 0 +#endif +#endif +#endif + +#include "jpgd.h" +#include <string.h> +#include <algorithm> +#include <assert.h> + +#ifdef _MSC_VER +#pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable +#endif + +#define JPGD_TRUE (1) +#define JPGD_FALSE (0) + +#define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b)) +#define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b)) + +namespace jpgd { + + static inline void* jpgd_malloc(size_t nSize) { return malloc(nSize); } + static inline void jpgd_free(void* p) { free(p); } + + // DCT coefficients are stored in this sequence. + static int g_ZAG[64] = { 0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 }; + + enum JPEG_MARKER + { + M_SOF0 = 0xC0, M_SOF1 = 0xC1, M_SOF2 = 0xC2, M_SOF3 = 0xC3, M_SOF5 = 0xC5, M_SOF6 = 0xC6, M_SOF7 = 0xC7, M_JPG = 0xC8, + M_SOF9 = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT = 0xC4, M_DAC = 0xCC, + M_RST0 = 0xD0, M_RST1 = 0xD1, M_RST2 = 0xD2, M_RST3 = 0xD3, M_RST4 = 0xD4, M_RST5 = 0xD5, M_RST6 = 0xD6, M_RST7 = 0xD7, + M_SOI = 0xD8, M_EOI = 0xD9, M_SOS = 0xDA, M_DQT = 0xDB, M_DNL = 0xDC, M_DRI = 0xDD, M_DHP = 0xDE, M_EXP = 0xDF, + M_APP0 = 0xE0, M_APP15 = 0xEF, M_JPG0 = 0xF0, M_JPG13 = 0xFD, M_COM = 0xFE, M_TEM = 0x01, M_ERROR = 0x100, RST0 = 0xD0 + }; + + enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 }; + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define SCALEDONE ((int32)1) + +#define FIX_0_298631336 ((int32)2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 ((int32)3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 ((int32)4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 ((int32)6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 ((int32)7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 ((int32)9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 ((int32)12299) /* FIX(1.501321110) */ +#define FIX_1_847759065 ((int32)15137) /* FIX(1.847759065) */ +#define FIX_1_961570560 ((int32)16069) /* FIX(1.961570560) */ +#define FIX_2_053119869 ((int32)16819) /* FIX(2.053119869) */ +#define FIX_2_562915447 ((int32)20995) /* FIX(2.562915447) */ +#define FIX_3_072711026 ((int32)25172) /* FIX(3.072711026) */ + +#define DESCALE(x,n) (((x) + (SCALEDONE << ((n)-1))) >> (n)) +#define DESCALE_ZEROSHIFT(x,n) (((x) + (128 << (n)) + (SCALEDONE << ((n)-1))) >> (n)) + +#define MULTIPLY(var, cnst) ((var) * (cnst)) + +#define CLAMP(i) ((static_cast<uint>(i) > 255) ? (((~i) >> 31) & 0xFF) : (i)) + + static inline int left_shifti(int val, uint32_t bits) + { + return static_cast<int>(static_cast<uint32_t>(val) << bits); + } + + // Compiler creates a fast path 1D IDCT for X non-zero columns + template <int NONZERO_COLS> + struct Row + { + static void idct(int* pTemp, const jpgd_block_t* pSrc) + { + // ACCESS_COL() will be optimized at compile time to either an array access, or 0. Good compilers will then optimize out muls against 0. +#define ACCESS_COL(x) (((x) < NONZERO_COLS) ? 
(int)pSrc[x] : 0) + + const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6); + + const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + const int tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); + const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + const int tmp0 = left_shifti(ACCESS_COL(0) + ACCESS_COL(4), CONST_BITS); + const int tmp1 = left_shifti(ACCESS_COL(0) - ACCESS_COL(4), CONST_BITS); + + const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2; + + const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1); + + const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3; + const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602); + + const int az1 = MULTIPLY(bz1, -FIX_0_899976223); + const int az2 = MULTIPLY(bz2, -FIX_2_562915447); + const int az3 = MULTIPLY(bz3, -FIX_1_961570560) + bz5; + const int az4 = MULTIPLY(bz4, -FIX_0_390180644) + bz5; + + const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3; + const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4; + const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3; + const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4; + + pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS - PASS1_BITS); + pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS - PASS1_BITS); + pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS - PASS1_BITS); + pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS - PASS1_BITS); + pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS - PASS1_BITS); + pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS - PASS1_BITS); + pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS - PASS1_BITS); + pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS - PASS1_BITS); + } + }; + + template <> + struct Row<0> + { + static void idct(int* pTemp, const jpgd_block_t* pSrc) + { + (void)pTemp; + (void)pSrc; + } + }; + + template <> + struct Row<1> + { + static void idct(int* pTemp, const jpgd_block_t* pSrc) + { + const int dcval = left_shifti(pSrc[0], PASS1_BITS); + + pTemp[0] = dcval; + pTemp[1] = dcval; + pTemp[2] = dcval; + pTemp[3] = dcval; + pTemp[4] = dcval; + pTemp[5] = dcval; + pTemp[6] = dcval; + pTemp[7] = dcval; + } + }; + + // Compiler creates a fast path 1D IDCT for X non-zero rows + template <int NONZERO_ROWS> + struct Col + { + static void idct(uint8* pDst_ptr, const int* pTemp) + { + // ACCESS_ROW() will be optimized at compile time to either an array access, or 0. +#define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? 
pTemp[x * 8] : 0) + + const int z2 = ACCESS_ROW(2); + const int z3 = ACCESS_ROW(6); + + const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + const int tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); + const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + const int tmp0 = left_shifti(ACCESS_ROW(0) + ACCESS_ROW(4), CONST_BITS); + const int tmp1 = left_shifti(ACCESS_ROW(0) - ACCESS_ROW(4), CONST_BITS); + + const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2; + + const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1); + + const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3; + const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602); + + const int az1 = MULTIPLY(bz1, -FIX_0_899976223); + const int az2 = MULTIPLY(bz2, -FIX_2_562915447); + const int az3 = MULTIPLY(bz3, -FIX_1_961570560) + bz5; + const int az4 = MULTIPLY(bz4, -FIX_0_390180644) + bz5; + + const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3; + const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4; + const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3; + const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4; + + int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 0] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 7] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 1] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 6] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 2] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 5] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 3] = (uint8)CLAMP(i); + + i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS + PASS1_BITS + 3); + pDst_ptr[8 * 4] = (uint8)CLAMP(i); + } + }; + + template <> + struct Col<1> + { + static void idct(uint8* pDst_ptr, const int* pTemp) + { + int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS + 3); + const uint8 dcval_clamped = (uint8)CLAMP(dcval); + pDst_ptr[0 * 8] = dcval_clamped; + pDst_ptr[1 * 8] = dcval_clamped; + pDst_ptr[2 * 8] = dcval_clamped; + pDst_ptr[3 * 8] = dcval_clamped; + pDst_ptr[4 * 8] = dcval_clamped; + pDst_ptr[5 * 8] = dcval_clamped; + pDst_ptr[6 * 8] = dcval_clamped; + pDst_ptr[7 * 8] = dcval_clamped; + } + }; + + static const uint8 s_idct_row_table[] = + { + 1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0, + 4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0, + 6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0, + 6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0, + 8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2, + 8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2, + 8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 
8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4, + 8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8, + }; + + static const uint8 s_idct_col_table[] = + { + 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 + }; + + // Scalar "fast pathing" IDCT. + static void idct(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag) + { + assert(block_max_zag >= 1); + assert(block_max_zag <= 64); + + if (block_max_zag <= 1) + { + int k = ((pSrc_ptr[0] + 4) >> 3) + 128; + k = CLAMP(k); + k = k | (k << 8); + k = k | (k << 16); + + for (int i = 8; i > 0; i--) + { + *(int*)&pDst_ptr[0] = k; + *(int*)&pDst_ptr[4] = k; + pDst_ptr += 8; + } + return; + } + + int temp[64]; + + const jpgd_block_t* pSrc = pSrc_ptr; + int* pTemp = temp; + + const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8]; + int i; + for (i = 8; i > 0; i--, pRow_tab++) + { + switch (*pRow_tab) + { + case 0: Row<0>::idct(pTemp, pSrc); break; + case 1: Row<1>::idct(pTemp, pSrc); break; + case 2: Row<2>::idct(pTemp, pSrc); break; + case 3: Row<3>::idct(pTemp, pSrc); break; + case 4: Row<4>::idct(pTemp, pSrc); break; + case 5: Row<5>::idct(pTemp, pSrc); break; + case 6: Row<6>::idct(pTemp, pSrc); break; + case 7: Row<7>::idct(pTemp, pSrc); break; + case 8: Row<8>::idct(pTemp, pSrc); break; + } + + pSrc += 8; + pTemp += 8; + } + + pTemp = temp; + + const int nonzero_rows = s_idct_col_table[block_max_zag - 1]; + for (i = 8; i > 0; i--) + { + switch (nonzero_rows) + { + case 1: Col<1>::idct(pDst_ptr, pTemp); break; + case 2: Col<2>::idct(pDst_ptr, pTemp); break; + case 3: Col<3>::idct(pDst_ptr, pTemp); break; + case 4: Col<4>::idct(pDst_ptr, pTemp); break; + case 5: Col<5>::idct(pDst_ptr, pTemp); break; + case 6: Col<6>::idct(pDst_ptr, pTemp); break; + case 7: Col<7>::idct(pDst_ptr, pTemp); break; + case 8: Col<8>::idct(pDst_ptr, pTemp); break; + } + + pTemp++; + pDst_ptr++; + } + } + + // Retrieve one character from the input stream. + inline uint jpeg_decoder::get_char() + { + // Any bytes remaining in buffer? + if (!m_in_buf_left) + { + // Try to get more bytes. + prep_in_buffer(); + // Still nothing to get? + if (!m_in_buf_left) + { + // Pad the end of the stream with 0xFF 0xD9 (EOI marker) + int t = m_tem_flag; + m_tem_flag ^= 1; + if (t) + return 0xD9; + else + return 0xFF; + } + } + + uint c = *m_pIn_buf_ofs++; + m_in_buf_left--; + + return c; + } + + // Same as previous method, except can indicate if the character is a pad character or not. + inline uint jpeg_decoder::get_char(bool* pPadding_flag) + { + if (!m_in_buf_left) + { + prep_in_buffer(); + if (!m_in_buf_left) + { + *pPadding_flag = true; + int t = m_tem_flag; + m_tem_flag ^= 1; + if (t) + return 0xD9; + else + return 0xFF; + } + } + + *pPadding_flag = false; + + uint c = *m_pIn_buf_ofs++; + m_in_buf_left--; + + return c; + } + + // Inserts a previously retrieved character back into the input buffer. + inline void jpeg_decoder::stuff_char(uint8 q) + { + // This could write before the input buffer, but we've placed another array there. + *(--m_pIn_buf_ofs) = q; + m_in_buf_left++; + } + + // Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered. 
+ inline uint8 jpeg_decoder::get_octet() + { + bool padding_flag; + int c = get_char(&padding_flag); + + if (c == 0xFF) + { + if (padding_flag) + return 0xFF; + + c = get_char(&padding_flag); + if (padding_flag) + { + stuff_char(0xFF); + return 0xFF; + } + + if (c == 0x00) + return 0xFF; + else + { + stuff_char(static_cast<uint8>(c)); + stuff_char(0xFF); + return 0xFF; + } + } + + return static_cast<uint8>(c); + } + + // Retrieves a variable number of bits from the input stream. Does not recognize markers. + inline uint jpeg_decoder::get_bits(int num_bits) + { + if (!num_bits) + return 0; + + uint i = m_bit_buf >> (32 - num_bits); + + if ((m_bits_left -= num_bits) <= 0) + { + m_bit_buf <<= (num_bits += m_bits_left); + + uint c1 = get_char(); + uint c2 = get_char(); + m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2; + + m_bit_buf <<= -m_bits_left; + + m_bits_left += 16; + + assert(m_bits_left >= 0); + } + else + m_bit_buf <<= num_bits; + + return i; + } + + // Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered. + inline uint jpeg_decoder::get_bits_no_markers(int num_bits) + { + if (!num_bits) + return 0; + + assert(num_bits <= 16); + + uint i = m_bit_buf >> (32 - num_bits); + + if ((m_bits_left -= num_bits) <= 0) + { + m_bit_buf <<= (num_bits += m_bits_left); + + if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF)) + { + uint c1 = get_octet(); + uint c2 = get_octet(); + m_bit_buf |= (c1 << 8) | c2; + } + else + { + m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1]; + m_in_buf_left -= 2; + m_pIn_buf_ofs += 2; + } + + m_bit_buf <<= -m_bits_left; + + m_bits_left += 16; + + assert(m_bits_left >= 0); + } + else + m_bit_buf <<= num_bits; + + return i; + } + + // Decodes a Huffman encoded symbol. + inline int jpeg_decoder::huff_decode(huff_tables* pH) + { + if (!pH) + stop_decoding(JPGD_DECODE_ERROR); + + int symbol; + // Check first 8-bits: do we have a complete symbol? + if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0) + { + // Decode more bits, use a tree traversal to find symbol. + int ofs = 23; + do + { + unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1)); + + // This should never happen, but to be safe I'm turning these asserts into a run-time check. + if ((idx >= JPGD_HUFF_TREE_MAX_LENGTH) || (ofs < 0)) + stop_decoding(JPGD_DECODE_ERROR); + + symbol = pH->tree[idx]; + ofs--; + } while (symbol < 0); + + get_bits_no_markers(8 + (23 - ofs)); + } + else + { + assert(symbol < JPGD_HUFF_CODE_SIZE_MAX_LENGTH); + get_bits_no_markers(pH->code_size[symbol]); + } + + return symbol; + } + + // Decodes a Huffman encoded symbol. + inline int jpeg_decoder::huff_decode(huff_tables* pH, int& extra_bits) + { + int symbol; + + if (!pH) + stop_decoding(JPGD_DECODE_ERROR); + + // Check first 8-bits: do we have a complete symbol? + if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0) + { + // Use a tree traversal to find symbol. + int ofs = 23; + do + { + unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1)); + + // This should never happen, but to be safe I'm turning these asserts into a run-time check. 
+ if ((idx >= JPGD_HUFF_TREE_MAX_LENGTH) || (ofs < 0)) + stop_decoding(JPGD_DECODE_ERROR); + + symbol = pH->tree[idx]; + ofs--; + } while (symbol < 0); + + get_bits_no_markers(8 + (23 - ofs)); + + extra_bits = get_bits_no_markers(symbol & 0xF); + } + else + { + if (symbol & 0x8000) + { + //get_bits_no_markers((symbol >> 8) & 31); + assert(((symbol >> 8) & 31) <= 15); + get_bits_no_markers((symbol >> 8) & 15); + extra_bits = symbol >> 16; + } + else + { + int code_size = (symbol >> 8) & 31; + int num_extra_bits = symbol & 0xF; + int bits = code_size + num_extra_bits; + + if (bits <= 16) + extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1); + else + { + get_bits_no_markers(code_size); + extra_bits = get_bits_no_markers(num_extra_bits); + } + } + + symbol &= 0xFF; + } + + return symbol; + } + + // Tables and macro used to fully decode the DPCM differences. + static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 }; + static const int s_extend_offset[16] = { 0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767 }; + //static const int s_extend_mask[] = { 0, (1 << 0), (1 << 1), (1 << 2), (1 << 3), (1 << 4), (1 << 5), (1 << 6), (1 << 7), (1 << 8), (1 << 9), (1 << 10), (1 << 11), (1 << 12), (1 << 13), (1 << 14), (1 << 15), (1 << 16) }; + +#define JPGD_HUFF_EXTEND(x, s) (((x) < s_extend_test[s & 15]) ? ((x) + s_extend_offset[s & 15]) : (x)) + + // Unconditionally frees all allocated m_blocks. + void jpeg_decoder::free_all_blocks() + { + m_pStream = nullptr; + for (mem_block* b = m_pMem_blocks; b; ) + { + mem_block* n = b->m_pNext; + jpgd_free(b); + b = n; + } + m_pMem_blocks = nullptr; + } + + // This method handles all errors. It will never return. + // It could easily be changed to use C++ exceptions. + JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status) + { + m_error_code = status; + free_all_blocks(); + longjmp(m_jmp_state, status); + } + + void* jpeg_decoder::alloc(size_t nSize, bool zero) + { + nSize = (JPGD_MAX(nSize, 1) + 3) & ~3; + char* rv = nullptr; + for (mem_block* b = m_pMem_blocks; b; b = b->m_pNext) + { + if ((b->m_used_count + nSize) <= b->m_size) + { + rv = b->m_data + b->m_used_count; + b->m_used_count += nSize; + break; + } + } + if (!rv) + { + int capacity = JPGD_MAX(32768 - 256, ((int)nSize + 2047) & ~2047); + mem_block* b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity); + if (!b) + { + stop_decoding(JPGD_NOTENOUGHMEM); + } + + b->m_pNext = m_pMem_blocks; + m_pMem_blocks = b; + b->m_used_count = nSize; + b->m_size = capacity; + rv = b->m_data; + } + if (zero) memset(rv, 0, nSize); + return rv; + } + + void jpeg_decoder::word_clear(void* p, uint16 c, uint n) + { + uint8* pD = (uint8*)p; + const uint8 l = c & 0xFF, h = (c >> 8) & 0xFF; + while (n) + { + pD[0] = l; + pD[1] = h; + pD += 2; + n--; + } + } + + // Refill the input buffer. + // This method will sit in a loop until (A) the buffer is full or (B) + // the stream's read() method reports and end of file condition. 
+ void jpeg_decoder::prep_in_buffer() + { + m_in_buf_left = 0; + m_pIn_buf_ofs = m_in_buf; + + if (m_eof_flag) + return; + + do + { + int bytes_read = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag); + if (bytes_read == -1) + stop_decoding(JPGD_STREAM_READ); + + m_in_buf_left += bytes_read; + } while ((m_in_buf_left < JPGD_IN_BUF_SIZE) && (!m_eof_flag)); + + m_total_bytes_read += m_in_buf_left; + + // Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid). + // (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.) + word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64); + } + + // Read a Huffman code table. + void jpeg_decoder::read_dht_marker() + { + int i, index, count; + uint8 huff_num[17]; + uint8 huff_val[256]; + + uint num_left = get_bits(16); + + if (num_left < 2) + stop_decoding(JPGD_BAD_DHT_MARKER); + + num_left -= 2; + + while (num_left) + { + index = get_bits(8); + + huff_num[0] = 0; + + count = 0; + + for (i = 1; i <= 16; i++) + { + huff_num[i] = static_cast<uint8>(get_bits(8)); + count += huff_num[i]; + } + + if (count > 255) + stop_decoding(JPGD_BAD_DHT_COUNTS); + + bool symbol_present[256]; + memset(symbol_present, 0, sizeof(symbol_present)); + + for (i = 0; i < count; i++) + { + const int s = get_bits(8); + + // Check for obviously bogus tables. + if (symbol_present[s]) + stop_decoding(JPGD_BAD_DHT_COUNTS); + + huff_val[i] = static_cast<uint8_t>(s); + symbol_present[s] = true; + } + + i = 1 + 16 + count; + + if (num_left < (uint)i) + stop_decoding(JPGD_BAD_DHT_MARKER); + + num_left -= i; + + if ((index & 0x10) > 0x10) + stop_decoding(JPGD_BAD_DHT_INDEX); + + index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1); + + if (index >= JPGD_MAX_HUFF_TABLES) + stop_decoding(JPGD_BAD_DHT_INDEX); + + if (!m_huff_num[index]) + m_huff_num[index] = (uint8*)alloc(17); + + if (!m_huff_val[index]) + m_huff_val[index] = (uint8*)alloc(256); + + m_huff_ac[index] = (index & 0x10) != 0; + memcpy(m_huff_num[index], huff_num, 17); + memcpy(m_huff_val[index], huff_val, 256); + } + } + + // Read a quantization table. + void jpeg_decoder::read_dqt_marker() + { + int n, i, prec; + uint num_left; + uint temp; + + num_left = get_bits(16); + + if (num_left < 2) + stop_decoding(JPGD_BAD_DQT_MARKER); + + num_left -= 2; + + while (num_left) + { + n = get_bits(8); + prec = n >> 4; + n &= 0x0F; + + if (n >= JPGD_MAX_QUANT_TABLES) + stop_decoding(JPGD_BAD_DQT_TABLE); + + if (!m_quant[n]) + m_quant[n] = (jpgd_quant_t*)alloc(64 * sizeof(jpgd_quant_t)); + + // read quantization entries, in zag order + for (i = 0; i < 64; i++) + { + temp = get_bits(8); + + if (prec) + temp = (temp << 8) + get_bits(8); + + m_quant[n][i] = static_cast<jpgd_quant_t>(temp); + } + + i = 64 + 1; + + if (prec) + i += 64; + + if (num_left < (uint)i) + stop_decoding(JPGD_BAD_DQT_LENGTH); + + num_left -= i; + } + } + + // Read the start of frame (SOF) marker. 
+ void jpeg_decoder::read_sof_marker() + { + int i; + uint num_left; + + num_left = get_bits(16); + + /* precision: sorry, only 8-bit precision is supported */ + if (get_bits(8) != 8) + stop_decoding(JPGD_BAD_PRECISION); + + m_image_y_size = get_bits(16); + + if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT)) + stop_decoding(JPGD_BAD_HEIGHT); + + m_image_x_size = get_bits(16); + + if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH)) + stop_decoding(JPGD_BAD_WIDTH); + + m_comps_in_frame = get_bits(8); + + if (m_comps_in_frame > JPGD_MAX_COMPONENTS) + stop_decoding(JPGD_TOO_MANY_COMPONENTS); + + if (num_left != (uint)(m_comps_in_frame * 3 + 8)) + stop_decoding(JPGD_BAD_SOF_LENGTH); + + for (i = 0; i < m_comps_in_frame; i++) + { + m_comp_ident[i] = get_bits(8); + m_comp_h_samp[i] = get_bits(4); + m_comp_v_samp[i] = get_bits(4); + + if (!m_comp_h_samp[i] || !m_comp_v_samp[i] || (m_comp_h_samp[i] > 2) || (m_comp_v_samp[i] > 2)) + stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS); + + m_comp_quant[i] = get_bits(8); + if (m_comp_quant[i] >= JPGD_MAX_QUANT_TABLES) + stop_decoding(JPGD_DECODE_ERROR); + } + } + + // Used to skip unrecognized markers. + void jpeg_decoder::skip_variable_marker() + { + uint num_left; + + num_left = get_bits(16); + + if (num_left < 2) + stop_decoding(JPGD_BAD_VARIABLE_MARKER); + + num_left -= 2; + + while (num_left) + { + get_bits(8); + num_left--; + } + } + + // Read a define restart interval (DRI) marker. + void jpeg_decoder::read_dri_marker() + { + if (get_bits(16) != 4) + stop_decoding(JPGD_BAD_DRI_LENGTH); + + m_restart_interval = get_bits(16); + } + + // Read a start of scan (SOS) marker. + void jpeg_decoder::read_sos_marker() + { + uint num_left; + int i, ci, n, c, cc; + + num_left = get_bits(16); + + n = get_bits(8); + + m_comps_in_scan = n; + + num_left -= 3; + + if ((num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN)) + stop_decoding(JPGD_BAD_SOS_LENGTH); + + for (i = 0; i < n; i++) + { + cc = get_bits(8); + c = get_bits(8); + num_left -= 2; + + for (ci = 0; ci < m_comps_in_frame; ci++) + if (cc == m_comp_ident[ci]) + break; + + if (ci >= m_comps_in_frame) + stop_decoding(JPGD_BAD_SOS_COMP_ID); + + if (ci >= JPGD_MAX_COMPONENTS) + stop_decoding(JPGD_DECODE_ERROR); + + m_comp_list[i] = ci; + + m_comp_dc_tab[ci] = (c >> 4) & 15; + m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1); + + if (m_comp_dc_tab[ci] >= JPGD_MAX_HUFF_TABLES) + stop_decoding(JPGD_DECODE_ERROR); + + if (m_comp_ac_tab[ci] >= JPGD_MAX_HUFF_TABLES) + stop_decoding(JPGD_DECODE_ERROR); + } + + m_spectral_start = get_bits(8); + m_spectral_end = get_bits(8); + m_successive_high = get_bits(4); + m_successive_low = get_bits(4); + + if (!m_progressive_flag) + { + m_spectral_start = 0; + m_spectral_end = 63; + } + + num_left -= 3; + + /* read past whatever is num_left */ + while (num_left) + { + get_bits(8); + num_left--; + } + } + + // Finds the next marker. + int jpeg_decoder::next_marker() + { + uint c, bytes; + + bytes = 0; + + do + { + do + { + bytes++; + c = get_bits(8); + } while (c != 0xFF); + + do + { + c = get_bits(8); + } while (c == 0xFF); + + } while (c == 0); + + // If bytes > 0 here, there where extra bytes before the marker (not good). + + return c; + } + + // Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is + // encountered. 
+ int jpeg_decoder::process_markers() + { + int c; + + for (; ; ) + { + c = next_marker(); + + switch (c) + { + case M_SOF0: + case M_SOF1: + case M_SOF2: + case M_SOF3: + case M_SOF5: + case M_SOF6: + case M_SOF7: + // case M_JPG: + case M_SOF9: + case M_SOF10: + case M_SOF11: + case M_SOF13: + case M_SOF14: + case M_SOF15: + case M_SOI: + case M_EOI: + case M_SOS: + { + return c; + } + case M_DHT: + { + read_dht_marker(); + break; + } + // No arithmitic support - dumb patents! + case M_DAC: + { + stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT); + break; + } + case M_DQT: + { + read_dqt_marker(); + break; + } + case M_DRI: + { + read_dri_marker(); + break; + } + //case M_APP0: /* no need to read the JFIF marker */ + case M_JPG: + case M_RST0: /* no parameters */ + case M_RST1: + case M_RST2: + case M_RST3: + case M_RST4: + case M_RST5: + case M_RST6: + case M_RST7: + case M_TEM: + { + stop_decoding(JPGD_UNEXPECTED_MARKER); + break; + } + default: /* must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn or APP0 */ + { + skip_variable_marker(); + break; + } + } + } + } + + // Finds the start of image (SOI) marker. + void jpeg_decoder::locate_soi_marker() + { + uint lastchar, thischar; + uint bytesleft; + + lastchar = get_bits(8); + + thischar = get_bits(8); + + /* ok if it's a normal JPEG file without a special header */ + + if ((lastchar == 0xFF) && (thischar == M_SOI)) + return; + + bytesleft = 4096; + + for (; ; ) + { + if (--bytesleft == 0) + stop_decoding(JPGD_NOT_JPEG); + + lastchar = thischar; + + thischar = get_bits(8); + + if (lastchar == 0xFF) + { + if (thischar == M_SOI) + break; + else if (thischar == M_EOI) // get_bits will keep returning M_EOI if we read past the end + stop_decoding(JPGD_NOT_JPEG); + } + } + + // Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad. + thischar = (m_bit_buf >> 24) & 0xFF; + + if (thischar != 0xFF) + stop_decoding(JPGD_NOT_JPEG); + } + + // Find a start of frame (SOF) marker. + void jpeg_decoder::locate_sof_marker() + { + locate_soi_marker(); + + int c = process_markers(); + + switch (c) + { + case M_SOF2: + { + m_progressive_flag = JPGD_TRUE; + read_sof_marker(); + break; + } + case M_SOF0: /* baseline DCT */ + case M_SOF1: /* extended sequential DCT */ + { + read_sof_marker(); + break; + } + case M_SOF9: /* Arithmitic coding */ + { + stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT); + break; + } + default: + { + stop_decoding(JPGD_UNSUPPORTED_MARKER); + break; + } + } + } + + // Find a start of scan (SOS) marker. + int jpeg_decoder::locate_sos_marker() + { + int c; + + c = process_markers(); + + if (c == M_EOI) + return JPGD_FALSE; + else if (c != M_SOS) + stop_decoding(JPGD_UNEXPECTED_MARKER); + + read_sos_marker(); + + return JPGD_TRUE; + } + + // Reset everything to default/uninitialized state. 
+ void jpeg_decoder::init(jpeg_decoder_stream* pStream, uint32_t flags) + { + m_flags = flags; + m_pMem_blocks = nullptr; + m_error_code = JPGD_SUCCESS; + m_ready_flag = false; + m_image_x_size = m_image_y_size = 0; + m_pStream = pStream; + m_progressive_flag = JPGD_FALSE; + + memset(m_huff_ac, 0, sizeof(m_huff_ac)); + memset(m_huff_num, 0, sizeof(m_huff_num)); + memset(m_huff_val, 0, sizeof(m_huff_val)); + memset(m_quant, 0, sizeof(m_quant)); + + m_scan_type = 0; + m_comps_in_frame = 0; + + memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp)); + memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp)); + memset(m_comp_quant, 0, sizeof(m_comp_quant)); + memset(m_comp_ident, 0, sizeof(m_comp_ident)); + memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks)); + memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks)); + + m_comps_in_scan = 0; + memset(m_comp_list, 0, sizeof(m_comp_list)); + memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab)); + memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab)); + + m_spectral_start = 0; + m_spectral_end = 0; + m_successive_low = 0; + m_successive_high = 0; + m_max_mcu_x_size = 0; + m_max_mcu_y_size = 0; + m_blocks_per_mcu = 0; + m_max_blocks_per_row = 0; + m_mcus_per_row = 0; + m_mcus_per_col = 0; + + memset(m_mcu_org, 0, sizeof(m_mcu_org)); + + m_total_lines_left = 0; + m_mcu_lines_left = 0; + m_num_buffered_scanlines = 0; + m_real_dest_bytes_per_scan_line = 0; + m_dest_bytes_per_scan_line = 0; + m_dest_bytes_per_pixel = 0; + + memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs)); + + memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs)); + memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs)); + memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu)); + + m_eob_run = 0; + + m_pIn_buf_ofs = m_in_buf; + m_in_buf_left = 0; + m_eof_flag = false; + m_tem_flag = 0; + + memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start)); + memset(m_in_buf, 0, sizeof(m_in_buf)); + memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end)); + + m_restart_interval = 0; + m_restarts_left = 0; + m_next_restart_num = 0; + + m_max_mcus_per_row = 0; + m_max_blocks_per_mcu = 0; + m_max_mcus_per_col = 0; + + memset(m_last_dc_val, 0, sizeof(m_last_dc_val)); + m_pMCU_coefficients = nullptr; + m_pSample_buf = nullptr; + m_pSample_buf_prev = nullptr; + m_sample_buf_prev_valid = false; + + m_total_bytes_read = 0; + + m_pScan_line_0 = nullptr; + m_pScan_line_1 = nullptr; + + // Ready the input buffer. + prep_in_buffer(); + + // Prime the bit buffer. + m_bits_left = 16; + m_bit_buf = 0; + + get_bits(16); + get_bits(16); + + for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++) + m_mcu_block_max_zag[i] = 64; + } + +#define SCALEBITS 16 +#define ONE_HALF ((int) 1 << (SCALEBITS-1)) +#define FIX(x) ((int) ((x) * (1L<<SCALEBITS) + 0.5f)) + + // Create a few tables that allow us to quickly convert YCbCr to RGB. + void jpeg_decoder::create_look_ups() + { + for (int i = 0; i <= 255; i++) + { + int k = i - 128; + m_crr[i] = (FIX(1.40200f) * k + ONE_HALF) >> SCALEBITS; + m_cbb[i] = (FIX(1.77200f) * k + ONE_HALF) >> SCALEBITS; + m_crg[i] = (-FIX(0.71414f)) * k; + m_cbg[i] = (-FIX(0.34414f)) * k + ONE_HALF; + } + } + + // This method throws back into the stream any bytes that where read + // into the bit buffer during initial marker scanning. + void jpeg_decoder::fix_in_buffer() + { + // In case any 0xFF's where pulled into the buffer during marker scanning. 
+ assert((m_bits_left & 7) == 0); + + if (m_bits_left == 16) + stuff_char((uint8)(m_bit_buf & 0xFF)); + + if (m_bits_left >= 8) + stuff_char((uint8)((m_bit_buf >> 8) & 0xFF)); + + stuff_char((uint8)((m_bit_buf >> 16) & 0xFF)); + stuff_char((uint8)((m_bit_buf >> 24) & 0xFF)); + + m_bits_left = 16; + get_bits_no_markers(16); + get_bits_no_markers(16); + } + + void jpeg_decoder::transform_mcu(int mcu_row) + { + jpgd_block_t* pSrc_ptr = m_pMCU_coefficients; + if (mcu_row * m_blocks_per_mcu >= m_max_blocks_per_row) + stop_decoding(JPGD_DECODE_ERROR); + + uint8* pDst_ptr = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64; + + for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++) + { + idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]); + pSrc_ptr += 64; + pDst_ptr += 64; + } + } + + // Loads and dequantizes the next row of (already decoded) coefficients. + // Progressive images only. + void jpeg_decoder::load_next_row() + { + int i; + jpgd_block_t* p; + jpgd_quant_t* q; + int mcu_row, mcu_block, row_block = 0; + int component_num, component_id; + int block_x_mcu[JPGD_MAX_COMPONENTS]; + + memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int)); + + for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) + { + int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0; + + for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++) + { + component_id = m_mcu_org[mcu_block]; + if (m_comp_quant[component_id] >= JPGD_MAX_QUANT_TABLES) + stop_decoding(JPGD_DECODE_ERROR); + + q = m_quant[m_comp_quant[component_id]]; + + p = m_pMCU_coefficients + 64 * mcu_block; + + jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs); + jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs); + p[0] = pDC[0]; + memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_t)); + + for (i = 63; i > 0; i--) + if (p[g_ZAG[i]]) + break; + + m_mcu_block_max_zag[mcu_block] = i + 1; + + for (; i >= 0; i--) + if (p[g_ZAG[i]]) + p[g_ZAG[i]] = static_cast<jpgd_block_t>(p[g_ZAG[i]] * q[i]); + + row_block++; + + if (m_comps_in_scan == 1) + block_x_mcu[component_id]++; + else + { + if (++block_x_mcu_ofs == m_comp_h_samp[component_id]) + { + block_x_mcu_ofs = 0; + + if (++block_y_mcu_ofs == m_comp_v_samp[component_id]) + { + block_y_mcu_ofs = 0; + + block_x_mcu[component_id] += m_comp_h_samp[component_id]; + } + } + } + } + + transform_mcu(mcu_row); + } + + if (m_comps_in_scan == 1) + m_block_y_mcu[m_comp_list[0]]++; + else + { + for (component_num = 0; component_num < m_comps_in_scan; component_num++) + { + component_id = m_comp_list[component_num]; + + m_block_y_mcu[component_id] += m_comp_v_samp[component_id]; + } + } + } + + // Restart interval processing. + void jpeg_decoder::process_restart() + { + int i; + int c = 0; + + // Align to a byte boundry + // FIXME: Is this really necessary? get_bits_no_markers() never reads in markers! + //get_bits_no_markers(m_bits_left & 7); + + // Let's scan a little bit to find the marker, but not _too_ far. + // 1536 is a "fudge factor" that determines how much to scan. + for (i = 1536; i > 0; i--) + if (get_char() == 0xFF) + break; + + if (i == 0) + stop_decoding(JPGD_BAD_RESTART_MARKER); + + for (; i > 0; i--) + if ((c = get_char()) != 0xFF) + break; + + if (i == 0) + stop_decoding(JPGD_BAD_RESTART_MARKER); + + // Is it the expected marker? If not, something bad happened. 
+ if (c != (m_next_restart_num + M_RST0)) + stop_decoding(JPGD_BAD_RESTART_MARKER); + + // Reset each component's DC prediction values. + memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint)); + + m_eob_run = 0; + + m_restarts_left = m_restart_interval; + + m_next_restart_num = (m_next_restart_num + 1) & 7; + + // Get the bit buffer going again... + + m_bits_left = 16; + get_bits_no_markers(16); + get_bits_no_markers(16); + } + + static inline int dequantize_ac(int c, int q) { c *= q; return c; } + + // Decodes and dequantizes the next row of coefficients. + void jpeg_decoder::decode_next_row() + { + int row_block = 0; + + for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) + { + if ((m_restart_interval) && (m_restarts_left == 0)) + process_restart(); + + jpgd_block_t* p = m_pMCU_coefficients; + for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64) + { + int component_id = m_mcu_org[mcu_block]; + if (m_comp_quant[component_id] >= JPGD_MAX_QUANT_TABLES) + stop_decoding(JPGD_DECODE_ERROR); + + jpgd_quant_t* q = m_quant[m_comp_quant[component_id]]; + + int r, s; + s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r); + if (s >= 16) + stop_decoding(JPGD_DECODE_ERROR); + + s = JPGD_HUFF_EXTEND(r, s); + + m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]); + + p[0] = static_cast<jpgd_block_t>(s * q[0]); + + int prev_num_set = m_mcu_block_max_zag[mcu_block]; + + huff_tables* pH = m_pHuff_tabs[m_comp_ac_tab[component_id]]; + + int k; + for (k = 1; k < 64; k++) + { + int extra_bits; + s = huff_decode(pH, extra_bits); + + r = s >> 4; + s &= 15; + + if (s) + { + if (r) + { + if ((k + r) > 63) + stop_decoding(JPGD_DECODE_ERROR); + + if (k < prev_num_set) + { + int n = JPGD_MIN(r, prev_num_set - k); + int kt = k; + while (n--) + p[g_ZAG[kt++]] = 0; + } + + k += r; + } + + s = JPGD_HUFF_EXTEND(extra_bits, s); + + if (k >= 64) + stop_decoding(JPGD_DECODE_ERROR); + + p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k]; + } + else + { + if (r == 15) + { + if ((k + 16) > 64) + stop_decoding(JPGD_DECODE_ERROR); + + if (k < prev_num_set) + { + int n = JPGD_MIN(16, prev_num_set - k); + int kt = k; + while (n--) + { + if (kt > 63) + stop_decoding(JPGD_DECODE_ERROR); + p[g_ZAG[kt++]] = 0; + } + } + + k += 16 - 1; // - 1 because the loop counter is k + + if (p[g_ZAG[k & 63]] != 0) + stop_decoding(JPGD_DECODE_ERROR); + } + else + break; + } + } + + if (k < prev_num_set) + { + int kt = k; + while (kt < prev_num_set) + p[g_ZAG[kt++]] = 0; + } + + m_mcu_block_max_zag[mcu_block] = k; + + row_block++; + } + + transform_mcu(mcu_row); + + m_restarts_left--; + } + } + + // YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB + void jpeg_decoder::H1V1Convert() + { + int row = m_max_mcu_y_size - m_mcu_lines_left; + uint8* d = m_pScan_line_0; + uint8* s = m_pSample_buf + row * 8; + + for (int i = m_max_mcus_per_row; i > 0; i--) + { + for (int j = 0; j < 8; j++) + { + int y = s[j]; + int cb = s[64 + j]; + int cr = s[128 + j]; + + d[0] = clamp(y + m_crr[cr]); + d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16)); + d[2] = clamp(y + m_cbb[cb]); + d[3] = 255; + + d += 4; + } + + s += 64 * 3; + } + } + + // YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB + void jpeg_decoder::H2V1Convert() + { + int row = m_max_mcu_y_size - m_mcu_lines_left; + uint8* d0 = m_pScan_line_0; + uint8* y = m_pSample_buf + row * 8; + uint8* c = m_pSample_buf + 2 * 64 + row * 8; + + for (int i = m_max_mcus_per_row; i > 0; i--) + { + for (int l = 0; l < 2; l++) + { + for (int j = 0; 
j < 4; j++) + { + int cb = c[0]; + int cr = c[64]; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + int yy = y[j << 1]; + d0[0] = clamp(yy + rc); + d0[1] = clamp(yy + gc); + d0[2] = clamp(yy + bc); + d0[3] = 255; + + yy = y[(j << 1) + 1]; + d0[4] = clamp(yy + rc); + d0[5] = clamp(yy + gc); + d0[6] = clamp(yy + bc); + d0[7] = 255; + + d0 += 8; + + c++; + } + y += 64; + } + + y += 64 * 4 - 64 * 2; + c += 64 * 4 - 8; + } + } + + // YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB + void jpeg_decoder::H2V1ConvertFiltered() + { + const uint BLOCKS_PER_MCU = 4; + int row = m_max_mcu_y_size - m_mcu_lines_left; + uint8* d0 = m_pScan_line_0; + + const int half_image_x_size = (m_image_x_size >> 1) - 1; + const int row_x8 = row * 8; + + for (int x = 0; x < m_image_x_size; x++) + { + int y = m_pSample_buf[check_sample_buf_ofs((x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7) + row_x8)]; + + int c_x0 = (x - 1) >> 1; + int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size); + c_x0 = JPGD_MAX(c_x0, 0); + + int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7) + row_x8 + 128; + int cb0 = m_pSample_buf[check_sample_buf_ofs(a)]; + int cr0 = m_pSample_buf[check_sample_buf_ofs(a + 64)]; + + int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7) + row_x8 + 128; + int cb1 = m_pSample_buf[check_sample_buf_ofs(b)]; + int cr1 = m_pSample_buf[check_sample_buf_ofs(b + 64)]; + + int w0 = (x & 1) ? 3 : 1; + int w1 = (x & 1) ? 1 : 3; + + int cb = (cb0 * w0 + cb1 * w1 + 2) >> 2; + int cr = (cr0 * w0 + cr1 * w1 + 2) >> 2; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d0[0] = clamp(y + rc); + d0[1] = clamp(y + gc); + d0[2] = clamp(y + bc); + d0[3] = 255; + + d0 += 4; + } + } + + // YCbCr H2V1 (1x2:1:1, 4 m_blocks per MCU) to RGB + void jpeg_decoder::H1V2Convert() + { + int row = m_max_mcu_y_size - m_mcu_lines_left; + uint8* d0 = m_pScan_line_0; + uint8* d1 = m_pScan_line_1; + uint8* y; + uint8* c; + + if (row < 8) + y = m_pSample_buf + row * 8; + else + y = m_pSample_buf + 64 * 1 + (row & 7) * 8; + + c = m_pSample_buf + 64 * 2 + (row >> 1) * 8; + + for (int i = m_max_mcus_per_row; i > 0; i--) + { + for (int j = 0; j < 8; j++) + { + int cb = c[0 + j]; + int cr = c[64 + j]; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + int yy = y[j]; + d0[0] = clamp(yy + rc); + d0[1] = clamp(yy + gc); + d0[2] = clamp(yy + bc); + d0[3] = 255; + + yy = y[8 + j]; + d1[0] = clamp(yy + rc); + d1[1] = clamp(yy + gc); + d1[2] = clamp(yy + bc); + d1[3] = 255; + + d0 += 4; + d1 += 4; + } + + y += 64 * 4; + c += 64 * 4; + } + } + + // YCbCr H2V1 (1x2:1:1, 4 m_blocks per MCU) to RGB + void jpeg_decoder::H1V2ConvertFiltered() + { + const uint BLOCKS_PER_MCU = 4; + int y = m_image_y_size - m_total_lines_left; + int row = y & 15; + + const int half_image_y_size = (m_image_y_size >> 1) - 1; + + uint8* d0 = m_pScan_line_0; + + const int w0 = (row & 1) ? 3 : 1; + const int w1 = (row & 1) ? 1 : 3; + + int c_y0 = (y - 1) >> 1; + int c_y1 = JPGD_MIN(c_y0 + 1, half_image_y_size); + + const uint8_t* p_YSamples = m_pSample_buf; + const uint8_t* p_C0Samples = m_pSample_buf; + if ((c_y0 >= 0) && (((row & 15) == 0) || ((row & 15) == 15)) && (m_total_lines_left > 1)) + { + assert(y > 0); + assert(m_sample_buf_prev_valid); + + if ((row & 15) == 15) + p_YSamples = m_pSample_buf_prev; + + p_C0Samples = m_pSample_buf_prev; + } + + const int y_sample_base_ofs = ((row & 8) ? 
64 : 0) + (row & 7) * 8; + const int y0_base = (c_y0 & 7) * 8 + 128; + const int y1_base = (c_y1 & 7) * 8 + 128; + + for (int x = 0; x < m_image_x_size; x++) + { + const int base_ofs = (x >> 3) * BLOCKS_PER_MCU * 64 + (x & 7); + + int y_sample = p_YSamples[check_sample_buf_ofs(base_ofs + y_sample_base_ofs)]; + + int a = base_ofs + y0_base; + int cb0_sample = p_C0Samples[check_sample_buf_ofs(a)]; + int cr0_sample = p_C0Samples[check_sample_buf_ofs(a + 64)]; + + int b = base_ofs + y1_base; + int cb1_sample = m_pSample_buf[check_sample_buf_ofs(b)]; + int cr1_sample = m_pSample_buf[check_sample_buf_ofs(b + 64)]; + + int cb = (cb0_sample * w0 + cb1_sample * w1 + 2) >> 2; + int cr = (cr0_sample * w0 + cr1_sample * w1 + 2) >> 2; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d0[0] = clamp(y_sample + rc); + d0[1] = clamp(y_sample + gc); + d0[2] = clamp(y_sample + bc); + d0[3] = 255; + + d0 += 4; + } + } + + // YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB + void jpeg_decoder::H2V2Convert() + { + int row = m_max_mcu_y_size - m_mcu_lines_left; + uint8* d0 = m_pScan_line_0; + uint8* d1 = m_pScan_line_1; + uint8* y; + uint8* c; + + if (row < 8) + y = m_pSample_buf + row * 8; + else + y = m_pSample_buf + 64 * 2 + (row & 7) * 8; + + c = m_pSample_buf + 64 * 4 + (row >> 1) * 8; + + for (int i = m_max_mcus_per_row; i > 0; i--) + { + for (int l = 0; l < 2; l++) + { + for (int j = 0; j < 8; j += 2) + { + int cb = c[0]; + int cr = c[64]; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + int yy = y[j]; + d0[0] = clamp(yy + rc); + d0[1] = clamp(yy + gc); + d0[2] = clamp(yy + bc); + d0[3] = 255; + + yy = y[j + 1]; + d0[4] = clamp(yy + rc); + d0[5] = clamp(yy + gc); + d0[6] = clamp(yy + bc); + d0[7] = 255; + + yy = y[j + 8]; + d1[0] = clamp(yy + rc); + d1[1] = clamp(yy + gc); + d1[2] = clamp(yy + bc); + d1[3] = 255; + + yy = y[j + 8 + 1]; + d1[4] = clamp(yy + rc); + d1[5] = clamp(yy + gc); + d1[6] = clamp(yy + bc); + d1[7] = 255; + + d0 += 8; + d1 += 8; + + c++; + } + y += 64; + } + + y += 64 * 6 - 64 * 2; + c += 64 * 6 - 8; + } + } + + uint32_t jpeg_decoder::H2V2ConvertFiltered() + { + const uint BLOCKS_PER_MCU = 6; + int y = m_image_y_size - m_total_lines_left; + int row = y & 15; + + const int half_image_y_size = (m_image_y_size >> 1) - 1; + + uint8* d0 = m_pScan_line_0; + + int c_y0 = (y - 1) >> 1; + int c_y1 = JPGD_MIN(c_y0 + 1, half_image_y_size); + + const uint8_t* p_YSamples = m_pSample_buf; + const uint8_t* p_C0Samples = m_pSample_buf; + if ((c_y0 >= 0) && (((row & 15) == 0) || ((row & 15) == 15)) && (m_total_lines_left > 1)) + { + assert(y > 0); + assert(m_sample_buf_prev_valid); + + if ((row & 15) == 15) + p_YSamples = m_pSample_buf_prev; + + p_C0Samples = m_pSample_buf_prev; + } + + const int y_sample_base_ofs = ((row & 8) ? 128 : 0) + (row & 7) * 8; + const int y0_base = (c_y0 & 7) * 8 + 256; + const int y1_base = (c_y1 & 7) * 8 + 256; + + const int half_image_x_size = (m_image_x_size >> 1) - 1; + + static const uint8_t s_muls[2][2][4] = + { + { { 1, 3, 3, 9 }, { 3, 9, 1, 3 }, }, + { { 3, 1, 9, 3 }, { 9, 3, 3, 1 } } + }; + + if (((row & 15) >= 1) && ((row & 15) <= 14)) + { + assert((row & 1) == 1); + assert(((y + 1 - 1) >> 1) == c_y0); + + assert(p_YSamples == m_pSample_buf); + assert(p_C0Samples == m_pSample_buf); + + uint8* d1 = m_pScan_line_1; + const int y_sample_base_ofs1 = (((row + 1) & 8) ? 
128 : 0) + ((row + 1) & 7) * 8; + + for (int x = 0; x < m_image_x_size; x++) + { + int k = (x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7); + int y_sample0 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs)]; + int y_sample1 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs1)]; + + int c_x0 = (x - 1) >> 1; + int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size); + c_x0 = JPGD_MAX(c_x0, 0); + + int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7); + int cb00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base)]; + int cr00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base + 64)]; + + int cb01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base)]; + int cr01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base + 64)]; + + int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7); + int cb10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base)]; + int cr10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base + 64)]; + + int cb11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base)]; + int cr11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base + 64)]; + + { + const uint8_t* pMuls = &s_muls[row & 1][x & 1][0]; + int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4; + int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d0[0] = clamp(y_sample0 + rc); + d0[1] = clamp(y_sample0 + gc); + d0[2] = clamp(y_sample0 + bc); + d0[3] = 255; + + d0 += 4; + } + + { + const uint8_t* pMuls = &s_muls[(row + 1) & 1][x & 1][0]; + int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4; + int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d1[0] = clamp(y_sample1 + rc); + d1[1] = clamp(y_sample1 + gc); + d1[2] = clamp(y_sample1 + bc); + d1[3] = 255; + + d1 += 4; + } + + if (((x & 1) == 1) && (x < m_image_x_size - 1)) + { + const int nx = x + 1; + assert(c_x0 == (nx - 1) >> 1); + + k = (nx >> 4) * BLOCKS_PER_MCU * 64 + ((nx & 8) ? 
64 : 0) + (nx & 7); + y_sample0 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs)]; + y_sample1 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs1)]; + + { + const uint8_t* pMuls = &s_muls[row & 1][nx & 1][0]; + int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4; + int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d0[0] = clamp(y_sample0 + rc); + d0[1] = clamp(y_sample0 + gc); + d0[2] = clamp(y_sample0 + bc); + d0[3] = 255; + + d0 += 4; + } + + { + const uint8_t* pMuls = &s_muls[(row + 1) & 1][nx & 1][0]; + int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4; + int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d1[0] = clamp(y_sample1 + rc); + d1[1] = clamp(y_sample1 + gc); + d1[2] = clamp(y_sample1 + bc); + d1[3] = 255; + + d1 += 4; + } + + ++x; + } + } + + return 2; + } + else + { + for (int x = 0; x < m_image_x_size; x++) + { + int y_sample = p_YSamples[check_sample_buf_ofs((x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7) + y_sample_base_ofs)]; + + int c_x0 = (x - 1) >> 1; + int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size); + c_x0 = JPGD_MAX(c_x0, 0); + + int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7); + int cb00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base)]; + int cr00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base + 64)]; + + int cb01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base)]; + int cr01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base + 64)]; + + int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7); + int cb10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base)]; + int cr10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base + 64)]; + + int cb11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base)]; + int cr11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base + 64)]; + + const uint8_t* pMuls = &s_muls[row & 1][x & 1][0]; + int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4; + int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4; + + int rc = m_crr[cr]; + int gc = ((m_crg[cr] + m_cbg[cb]) >> 16); + int bc = m_cbb[cb]; + + d0[0] = clamp(y_sample + rc); + d0[1] = clamp(y_sample + gc); + d0[2] = clamp(y_sample + bc); + d0[3] = 255; + + d0 += 4; + } + + return 1; + } + } + + // Y (1 block per MCU) to 8-bit grayscale + void jpeg_decoder::gray_convert() + { + int row = m_max_mcu_y_size - m_mcu_lines_left; + uint8* d = m_pScan_line_0; + uint8* s = m_pSample_buf + row * 8; + + for (int i = m_max_mcus_per_row; i > 0; i--) + { + *(uint*)d = *(uint*)s; + *(uint*)(&d[4]) = *(uint*)(&s[4]); + + s += 64; + d += 8; + } + } + + // Find end of image (EOI) marker, so we can return to the user the exact size of the input stream. + void jpeg_decoder::find_eoi() + { + if (!m_progressive_flag) + { + // Attempt to read the EOI marker. 
+ //get_bits_no_markers(m_bits_left & 7); + + // Prime the bit buffer + m_bits_left = 16; + get_bits(16); + get_bits(16); + + // The next marker _should_ be EOI + process_markers(); + } + + m_total_bytes_read -= m_in_buf_left; + } + + int jpeg_decoder::decode_next_mcu_row() + { + if (setjmp(m_jmp_state)) + return JPGD_FAILED; + + const bool chroma_y_filtering = (m_flags & cFlagLinearChromaFiltering) && ((m_scan_type == JPGD_YH2V2) || (m_scan_type == JPGD_YH1V2)); + if (chroma_y_filtering) + { + std::swap(m_pSample_buf, m_pSample_buf_prev); + + m_sample_buf_prev_valid = true; + } + + if (m_progressive_flag) + load_next_row(); + else + decode_next_row(); + + // Find the EOI marker if that was the last row. + if (m_total_lines_left <= m_max_mcu_y_size) + find_eoi(); + + m_mcu_lines_left = m_max_mcu_y_size; + return 0; + } + + int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len) + { + if ((m_error_code) || (!m_ready_flag)) + return JPGD_FAILED; + + if (m_total_lines_left == 0) + return JPGD_DONE; + + const bool chroma_y_filtering = (m_flags & cFlagLinearChromaFiltering) && ((m_scan_type == JPGD_YH2V2) || (m_scan_type == JPGD_YH1V2)); + + bool get_another_mcu_row = false; + bool got_mcu_early = false; + if (chroma_y_filtering) + { + if (m_total_lines_left == m_image_y_size) + get_another_mcu_row = true; + else if ((m_mcu_lines_left == 1) && (m_total_lines_left > 1)) + { + get_another_mcu_row = true; + got_mcu_early = true; + } + } + else + { + get_another_mcu_row = (m_mcu_lines_left == 0); + } + + if (get_another_mcu_row) + { + int status = decode_next_mcu_row(); + if (status != 0) + return status; + } + + switch (m_scan_type) + { + case JPGD_YH2V2: + { + if (m_flags & cFlagLinearChromaFiltering) + { + if (m_num_buffered_scanlines == 1) + { + *pScan_line = m_pScan_line_1; + } + else if (m_num_buffered_scanlines == 0) + { + m_num_buffered_scanlines = H2V2ConvertFiltered(); + *pScan_line = m_pScan_line_0; + } + + m_num_buffered_scanlines--; + } + else + { + if ((m_mcu_lines_left & 1) == 0) + { + H2V2Convert(); + *pScan_line = m_pScan_line_0; + } + else + *pScan_line = m_pScan_line_1; + } + + break; + } + case JPGD_YH2V1: + { + if (m_flags & cFlagLinearChromaFiltering) + H2V1ConvertFiltered(); + else + H2V1Convert(); + *pScan_line = m_pScan_line_0; + break; + } + case JPGD_YH1V2: + { + if (chroma_y_filtering) + { + H1V2ConvertFiltered(); + *pScan_line = m_pScan_line_0; + } + else + { + if ((m_mcu_lines_left & 1) == 0) + { + H1V2Convert(); + *pScan_line = m_pScan_line_0; + } + else + *pScan_line = m_pScan_line_1; + } + + break; + } + case JPGD_YH1V1: + { + H1V1Convert(); + *pScan_line = m_pScan_line_0; + break; + } + case JPGD_GRAYSCALE: + { + gray_convert(); + *pScan_line = m_pScan_line_0; + + break; + } + } + + *pScan_line_len = m_real_dest_bytes_per_scan_line; + + if (!got_mcu_early) + { + m_mcu_lines_left--; + } + + m_total_lines_left--; + + return JPGD_SUCCESS; + } + + // Creates the tables needed for efficient Huffman decoding. 
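+	// In outline, derived from the code below: codes of 8 bits or fewer are expanded
+	// directly into the 256-entry look_up[] table, indexed by the next 8 bits of the
+	// bit buffer, so a single probe yields the symbol. look_up2[] packs more into the
+	// same index: the symbol in bits 0-7, the total number of bits to consume in the
+	// next field (<< 8), and, when the symbol's extra magnitude bits also fit inside
+	// the 8-bit window, bit 15 is set and the pre-extracted extra bits are stored in
+	// the upper 16 bits. Codes longer than 8 bits fall back to a small binary tree:
+	// look_up[top 8 bits] holds a negative index into tree[], which the decode path
+	// then walks one bit at a time until it reaches the symbol.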
+ void jpeg_decoder::make_huff_table(int index, huff_tables* pH) + { + int p, i, l, si; + uint8 huffsize[258]; + uint huffcode[258]; + uint code; + uint subtree; + int code_size; + int lastp; + int nextfreeentry; + int currententry; + + pH->ac_table = m_huff_ac[index] != 0; + + p = 0; + + for (l = 1; l <= 16; l++) + { + for (i = 1; i <= m_huff_num[index][l]; i++) + { + if (p >= 257) + stop_decoding(JPGD_DECODE_ERROR); + huffsize[p++] = static_cast<uint8>(l); + } + } + + assert(p < 258); + huffsize[p] = 0; + + lastp = p; + + code = 0; + si = huffsize[0]; + p = 0; + + while (huffsize[p]) + { + while (huffsize[p] == si) + { + if (p >= 257) + stop_decoding(JPGD_DECODE_ERROR); + huffcode[p++] = code; + code++; + } + + code <<= 1; + si++; + } + + memset(pH->look_up, 0, sizeof(pH->look_up)); + memset(pH->look_up2, 0, sizeof(pH->look_up2)); + memset(pH->tree, 0, sizeof(pH->tree)); + memset(pH->code_size, 0, sizeof(pH->code_size)); + + nextfreeentry = -1; + + p = 0; + + while (p < lastp) + { + i = m_huff_val[index][p]; + + code = huffcode[p]; + code_size = huffsize[p]; + + assert(i < JPGD_HUFF_CODE_SIZE_MAX_LENGTH); + pH->code_size[i] = static_cast<uint8>(code_size); + + if (code_size <= 8) + { + code <<= (8 - code_size); + + for (l = 1 << (8 - code_size); l > 0; l--) + { + if (code >= 256) + stop_decoding(JPGD_DECODE_ERROR); + + pH->look_up[code] = i; + + bool has_extrabits = false; + int extra_bits = 0; + int num_extra_bits = i & 15; + + int bits_to_fetch = code_size; + if (num_extra_bits) + { + int total_codesize = code_size + num_extra_bits; + if (total_codesize <= 8) + { + has_extrabits = true; + extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize)); + + if (extra_bits > 0x7FFF) + stop_decoding(JPGD_DECODE_ERROR); + + bits_to_fetch += num_extra_bits; + } + } + + if (!has_extrabits) + pH->look_up2[code] = i | (bits_to_fetch << 8); + else + pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8); + + code++; + } + } + else + { + subtree = (code >> (code_size - 8)) & 0xFF; + + currententry = pH->look_up[subtree]; + + if (currententry == 0) + { + pH->look_up[subtree] = currententry = nextfreeentry; + pH->look_up2[subtree] = currententry = nextfreeentry; + + nextfreeentry -= 2; + } + + code <<= (16 - (code_size - 8)); + + for (l = code_size; l > 9; l--) + { + if ((code & 0x8000) == 0) + currententry--; + + unsigned int idx = -currententry - 1; + + if (idx >= JPGD_HUFF_TREE_MAX_LENGTH) + stop_decoding(JPGD_DECODE_ERROR); + + if (pH->tree[idx] == 0) + { + pH->tree[idx] = nextfreeentry; + + currententry = nextfreeentry; + + nextfreeentry -= 2; + } + else + { + currententry = pH->tree[idx]; + } + + code <<= 1; + } + + if ((code & 0x8000) == 0) + currententry--; + + if ((-currententry - 1) >= JPGD_HUFF_TREE_MAX_LENGTH) + stop_decoding(JPGD_DECODE_ERROR); + + pH->tree[-currententry - 1] = i; + } + + p++; + } + } + + // Verifies the quantization tables needed for this scan are available. + void jpeg_decoder::check_quant_tables() + { + for (int i = 0; i < m_comps_in_scan; i++) + if (m_quant[m_comp_quant[m_comp_list[i]]] == nullptr) + stop_decoding(JPGD_UNDEFINED_QUANT_TABLE); + } + + // Verifies that all the Huffman tables needed for this scan are available. 
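+	// A scan needs a DC table whenever it includes the DC coefficient
+	// (m_spectral_start == 0) and an AC table whenever it covers any AC coefficients
+	// (m_spectral_end > 0). The decode tables are also rebuilt here on every scan,
+	// since DHT markers may redefine a table between scans.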
+ void jpeg_decoder::check_huff_tables() + { + for (int i = 0; i < m_comps_in_scan; i++) + { + if ((m_spectral_start == 0) && (m_huff_num[m_comp_dc_tab[m_comp_list[i]]] == nullptr)) + stop_decoding(JPGD_UNDEFINED_HUFF_TABLE); + + if ((m_spectral_end > 0) && (m_huff_num[m_comp_ac_tab[m_comp_list[i]]] == nullptr)) + stop_decoding(JPGD_UNDEFINED_HUFF_TABLE); + } + + for (int i = 0; i < JPGD_MAX_HUFF_TABLES; i++) + if (m_huff_num[i]) + { + if (!m_pHuff_tabs[i]) + m_pHuff_tabs[i] = (huff_tables*)alloc(sizeof(huff_tables)); + + make_huff_table(i, m_pHuff_tabs[i]); + } + } + + // Determines the component order inside each MCU. + // Also calcs how many MCU's are on each row, etc. + bool jpeg_decoder::calc_mcu_block_order() + { + int component_num, component_id; + int max_h_samp = 0, max_v_samp = 0; + + for (component_id = 0; component_id < m_comps_in_frame; component_id++) + { + if (m_comp_h_samp[component_id] > max_h_samp) + max_h_samp = m_comp_h_samp[component_id]; + + if (m_comp_v_samp[component_id] > max_v_samp) + max_v_samp = m_comp_v_samp[component_id]; + } + + for (component_id = 0; component_id < m_comps_in_frame; component_id++) + { + m_comp_h_blocks[component_id] = ((((m_image_x_size * m_comp_h_samp[component_id]) + (max_h_samp - 1)) / max_h_samp) + 7) / 8; + m_comp_v_blocks[component_id] = ((((m_image_y_size * m_comp_v_samp[component_id]) + (max_v_samp - 1)) / max_v_samp) + 7) / 8; + } + + if (m_comps_in_scan == 1) + { + m_mcus_per_row = m_comp_h_blocks[m_comp_list[0]]; + m_mcus_per_col = m_comp_v_blocks[m_comp_list[0]]; + } + else + { + m_mcus_per_row = (((m_image_x_size + 7) / 8) + (max_h_samp - 1)) / max_h_samp; + m_mcus_per_col = (((m_image_y_size + 7) / 8) + (max_v_samp - 1)) / max_v_samp; + } + + if (m_comps_in_scan == 1) + { + m_mcu_org[0] = m_comp_list[0]; + + m_blocks_per_mcu = 1; + } + else + { + m_blocks_per_mcu = 0; + + for (component_num = 0; component_num < m_comps_in_scan; component_num++) + { + int num_blocks; + + component_id = m_comp_list[component_num]; + + num_blocks = m_comp_h_samp[component_id] * m_comp_v_samp[component_id]; + + while (num_blocks--) + m_mcu_org[m_blocks_per_mcu++] = component_id; + } + } + + if (m_blocks_per_mcu > m_max_blocks_per_mcu) + return false; + + for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++) + { + int comp_id = m_mcu_org[mcu_block]; + if (comp_id >= JPGD_MAX_QUANT_TABLES) + return false; + } + + return true; + } + + // Starts a new scan. + int jpeg_decoder::init_scan() + { + if (!locate_sos_marker()) + return JPGD_FALSE; + + if (!calc_mcu_block_order()) + return JPGD_FALSE; + + check_huff_tables(); + + check_quant_tables(); + + memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint)); + + m_eob_run = 0; + + if (m_restart_interval) + { + m_restarts_left = m_restart_interval; + m_next_restart_num = 0; + } + + fix_in_buffer(); + + return JPGD_TRUE; + } + + // Starts a frame. Determines if the number of components or sampling factors + // are supported. 
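+	// Summary of the combinations accepted below (any chroma components must be
+	// sampled 1x1):
+	//
+	//   components   Y sampling   scan type        MCU size (pixels)
+	//   1            1x1          JPGD_GRAYSCALE   8x8
+	//   3            1x1          JPGD_YH1V1       8x8
+	//   3            2x1          JPGD_YH2V1       16x8
+	//   3            1x2          JPGD_YH1V2       8x16
+	//   3            2x2          JPGD_YH2V2       16x16
+	//
+	// Anything else stops decoding with JPGD_UNSUPPORTED_SAMP_FACTORS or
+	// JPGD_UNSUPPORTED_COLORSPACE.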
+ void jpeg_decoder::init_frame() + { + int i; + + if (m_comps_in_frame == 1) + { + if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1)) + stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS); + + m_scan_type = JPGD_GRAYSCALE; + m_max_blocks_per_mcu = 1; + m_max_mcu_x_size = 8; + m_max_mcu_y_size = 8; + } + else if (m_comps_in_frame == 3) + { + if (((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) || + ((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1))) + stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS); + + if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1)) + { + m_scan_type = JPGD_YH1V1; + + m_max_blocks_per_mcu = 3; + m_max_mcu_x_size = 8; + m_max_mcu_y_size = 8; + } + else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1)) + { + m_scan_type = JPGD_YH2V1; + m_max_blocks_per_mcu = 4; + m_max_mcu_x_size = 16; + m_max_mcu_y_size = 8; + } + else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2)) + { + m_scan_type = JPGD_YH1V2; + m_max_blocks_per_mcu = 4; + m_max_mcu_x_size = 8; + m_max_mcu_y_size = 16; + } + else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2)) + { + m_scan_type = JPGD_YH2V2; + m_max_blocks_per_mcu = 6; + m_max_mcu_x_size = 16; + m_max_mcu_y_size = 16; + } + else + stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS); + } + else + stop_decoding(JPGD_UNSUPPORTED_COLORSPACE); + + m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size; + m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size; + + // These values are for the *destination* pixels: after conversion. + if (m_scan_type == JPGD_GRAYSCALE) + m_dest_bytes_per_pixel = 1; + else + m_dest_bytes_per_pixel = 4; + + m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel; + + m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel); + + // Initialize two scan line buffers. + m_pScan_line_0 = (uint8*)alloc(m_dest_bytes_per_scan_line, true); + if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2)) + m_pScan_line_1 = (uint8*)alloc(m_dest_bytes_per_scan_line, true); + + m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu; + + // Should never happen + if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW) + stop_decoding(JPGD_DECODE_ERROR); + + // Allocate the coefficient buffer, enough for one MCU + m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t)); + + for (i = 0; i < m_max_blocks_per_mcu; i++) + m_mcu_block_max_zag[i] = 64; + + m_pSample_buf = (uint8*)alloc(m_max_blocks_per_row * 64); + m_pSample_buf_prev = (uint8*)alloc(m_max_blocks_per_row * 64); + + m_total_lines_left = m_image_y_size; + + m_mcu_lines_left = 0; + + create_look_ups(); + } + + // The coeff_buf series of methods originally stored the coefficients + // into a "virtual" file which was located in EMS, XMS, or a disk file. A cache + // was used to make this process more efficient. Now, we can store the entire + // thing in RAM. 
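+	// Layout: pData holds block_num_x * block_num_y fixed-size blocks of
+	// block_len_x * block_len_y coefficients each, stored row-major by block, so the
+	// block at (block_x, block_y) starts at
+	//
+	//   pData + block_x * block_size + block_y * (block_size * block_num_x)
+	//
+	// which is exactly what coeff_buf_getp() computes below.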
+ jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y) + { + coeff_buf* cb = (coeff_buf*)alloc(sizeof(coeff_buf)); + + cb->block_num_x = block_num_x; + cb->block_num_y = block_num_y; + cb->block_len_x = block_len_x; + cb->block_len_y = block_len_y; + cb->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t); + cb->pData = (uint8*)alloc(cb->block_size * block_num_x * block_num_y, true); + return cb; + } + + inline jpgd_block_t* jpeg_decoder::coeff_buf_getp(coeff_buf* cb, int block_x, int block_y) + { + if ((block_x >= cb->block_num_x) || (block_y >= cb->block_num_y)) + stop_decoding(JPGD_DECODE_ERROR); + + return (jpgd_block_t*)(cb->pData + block_x * cb->block_size + block_y * (cb->block_size * cb->block_num_x)); + } + + // The following methods decode the various types of m_blocks encountered + // in progressively encoded images. + void jpeg_decoder::decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y) + { + int s, r; + jpgd_block_t* p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y); + + if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0) + { + if (s >= 16) + pD->stop_decoding(JPGD_DECODE_ERROR); + + r = pD->get_bits_no_markers(s); + s = JPGD_HUFF_EXTEND(r, s); + } + + pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]); + + p[0] = static_cast<jpgd_block_t>(s << pD->m_successive_low); + } + + void jpeg_decoder::decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y) + { + if (pD->get_bits_no_markers(1)) + { + jpgd_block_t* p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y); + + p[0] |= (1 << pD->m_successive_low); + } + } + + void jpeg_decoder::decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y) + { + int k, s, r; + + if (pD->m_eob_run) + { + pD->m_eob_run--; + return; + } + + jpgd_block_t* p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y); + + for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++) + { + unsigned int idx = pD->m_comp_ac_tab[component_id]; + if (idx >= JPGD_MAX_HUFF_TABLES) + pD->stop_decoding(JPGD_DECODE_ERROR); + + s = pD->huff_decode(pD->m_pHuff_tabs[idx]); + + r = s >> 4; + s &= 15; + + if (s) + { + if ((k += r) > 63) + pD->stop_decoding(JPGD_DECODE_ERROR); + + r = pD->get_bits_no_markers(s); + s = JPGD_HUFF_EXTEND(r, s); + + p[g_ZAG[k]] = static_cast<jpgd_block_t>(s << pD->m_successive_low); + } + else + { + if (r == 15) + { + if ((k += 15) > 63) + pD->stop_decoding(JPGD_DECODE_ERROR); + } + else + { + pD->m_eob_run = 1 << r; + + if (r) + pD->m_eob_run += pD->get_bits_no_markers(r); + + pD->m_eob_run--; + + break; + } + } + } + } + + void jpeg_decoder::decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y) + { + int s, k, r; + + int p1 = 1 << pD->m_successive_low; + + //int m1 = (-1) << pD->m_successive_low; + int m1 = static_cast<int>((UINT32_MAX << pD->m_successive_low)); + + jpgd_block_t* p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y); + if (pD->m_spectral_end > 63) + pD->stop_decoding(JPGD_DECODE_ERROR); + + k = pD->m_spectral_start; + + if (pD->m_eob_run == 0) + { + for (; k <= pD->m_spectral_end; k++) + { + unsigned int idx = pD->m_comp_ac_tab[component_id]; + if (idx >= JPGD_MAX_HUFF_TABLES) + pD->stop_decoding(JPGD_DECODE_ERROR); + + s = pD->huff_decode(pD->m_pHuff_tabs[idx]); + + r = s >> 4; + s &= 15; + + if (s) + { 
+ if (s != 1) + pD->stop_decoding(JPGD_DECODE_ERROR); + + if (pD->get_bits_no_markers(1)) + s = p1; + else + s = m1; + } + else + { + if (r != 15) + { + pD->m_eob_run = 1 << r; + + if (r) + pD->m_eob_run += pD->get_bits_no_markers(r); + + break; + } + } + + do + { + jpgd_block_t* this_coef = p + g_ZAG[k & 63]; + + if (*this_coef != 0) + { + if (pD->get_bits_no_markers(1)) + { + if ((*this_coef & p1) == 0) + { + if (*this_coef >= 0) + *this_coef = static_cast<jpgd_block_t>(*this_coef + p1); + else + *this_coef = static_cast<jpgd_block_t>(*this_coef + m1); + } + } + } + else + { + if (--r < 0) + break; + } + + k++; + + } while (k <= pD->m_spectral_end); + + if ((s) && (k < 64)) + { + p[g_ZAG[k]] = static_cast<jpgd_block_t>(s); + } + } + } + + if (pD->m_eob_run > 0) + { + for (; k <= pD->m_spectral_end; k++) + { + jpgd_block_t* this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis + + if (*this_coef != 0) + { + if (pD->get_bits_no_markers(1)) + { + if ((*this_coef & p1) == 0) + { + if (*this_coef >= 0) + *this_coef = static_cast<jpgd_block_t>(*this_coef + p1); + else + *this_coef = static_cast<jpgd_block_t>(*this_coef + m1); + } + } + } + } + + pD->m_eob_run--; + } + } + + // Decode a scan in a progressively encoded image. + void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func) + { + int mcu_row, mcu_col, mcu_block; + int block_x_mcu[JPGD_MAX_COMPONENTS], block_y_mcu[JPGD_MAX_COMPONENTS]; + + memset(block_y_mcu, 0, sizeof(block_y_mcu)); + + for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++) + { + int component_num, component_id; + + memset(block_x_mcu, 0, sizeof(block_x_mcu)); + + for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) + { + int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0; + + if ((m_restart_interval) && (m_restarts_left == 0)) + process_restart(); + + for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++) + { + component_id = m_mcu_org[mcu_block]; + + decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, block_y_mcu[component_id] + block_y_mcu_ofs); + + if (m_comps_in_scan == 1) + block_x_mcu[component_id]++; + else + { + if (++block_x_mcu_ofs == m_comp_h_samp[component_id]) + { + block_x_mcu_ofs = 0; + + if (++block_y_mcu_ofs == m_comp_v_samp[component_id]) + { + block_y_mcu_ofs = 0; + block_x_mcu[component_id] += m_comp_h_samp[component_id]; + } + } + } + } + + m_restarts_left--; + } + + if (m_comps_in_scan == 1) + block_y_mcu[m_comp_list[0]]++; + else + { + for (component_num = 0; component_num < m_comps_in_scan; component_num++) + { + component_id = m_comp_list[component_num]; + block_y_mcu[component_id] += m_comp_v_samp[component_id]; + } + } + } + } + + // Decode a progressively encoded image. + void jpeg_decoder::init_progressive() + { + int i; + + if (m_comps_in_frame == 4) + stop_decoding(JPGD_UNSUPPORTED_COLORSPACE); + + // Allocate the coefficient buffers. 
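+		// Per component: the DC buffer stores a single coefficient per 8x8 block
+		// (block_len 1x1), the AC buffer stores full 8x8 blocks, and both are sized to
+		// cover every block of that component in the image. Successive scans then
+		// refine these coefficients in place.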
+ for (i = 0; i < m_comps_in_frame; i++) + { + m_dc_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 1, 1); + m_ac_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 8, 8); + } + + // See https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf + uint32_t total_scans = 0; + const uint32_t MAX_SCANS_TO_PROCESS = 1000; + + for (; ; ) + { + int dc_only_scan, refinement_scan; + pDecode_block_func decode_block_func; + + if (!init_scan()) + break; + + dc_only_scan = (m_spectral_start == 0); + refinement_scan = (m_successive_high != 0); + + if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63)) + stop_decoding(JPGD_BAD_SOS_SPECTRAL); + + if (dc_only_scan) + { + if (m_spectral_end) + stop_decoding(JPGD_BAD_SOS_SPECTRAL); + } + else if (m_comps_in_scan != 1) /* AC scans can only contain one component */ + stop_decoding(JPGD_BAD_SOS_SPECTRAL); + + if ((refinement_scan) && (m_successive_low != m_successive_high - 1)) + stop_decoding(JPGD_BAD_SOS_SUCCESSIVE); + + if (dc_only_scan) + { + if (refinement_scan) + decode_block_func = decode_block_dc_refine; + else + decode_block_func = decode_block_dc_first; + } + else + { + if (refinement_scan) + decode_block_func = decode_block_ac_refine; + else + decode_block_func = decode_block_ac_first; + } + + decode_scan(decode_block_func); + + m_bits_left = 16; + get_bits(16); + get_bits(16); + + total_scans++; + if (total_scans > MAX_SCANS_TO_PROCESS) + stop_decoding(JPGD_TOO_MANY_SCANS); + } + + m_comps_in_scan = m_comps_in_frame; + + for (i = 0; i < m_comps_in_frame; i++) + m_comp_list[i] = i; + + if (!calc_mcu_block_order()) + stop_decoding(JPGD_DECODE_ERROR); + } + + void jpeg_decoder::init_sequential() + { + if (!init_scan()) + stop_decoding(JPGD_UNEXPECTED_MARKER); + } + + void jpeg_decoder::decode_start() + { + init_frame(); + + if (m_progressive_flag) + init_progressive(); + else + init_sequential(); + } + + void jpeg_decoder::decode_init(jpeg_decoder_stream* pStream, uint32_t flags) + { + init(pStream, flags); + locate_sof_marker(); + } + + jpeg_decoder::jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags) + { + if (setjmp(m_jmp_state)) + return; + decode_init(pStream, flags); + } + + int jpeg_decoder::begin_decoding() + { + if (m_ready_flag) + return JPGD_SUCCESS; + + if (m_error_code) + return JPGD_FAILED; + + if (setjmp(m_jmp_state)) + return JPGD_FAILED; + + decode_start(); + + m_ready_flag = true; + + return JPGD_SUCCESS; + } + + jpeg_decoder::~jpeg_decoder() + { + free_all_blocks(); + } + + jpeg_decoder_file_stream::jpeg_decoder_file_stream() + { + m_pFile = nullptr; + m_eof_flag = false; + m_error_flag = false; + } + + void jpeg_decoder_file_stream::close() + { + if (m_pFile) + { + fclose(m_pFile); + m_pFile = nullptr; + } + + m_eof_flag = false; + m_error_flag = false; + } + + jpeg_decoder_file_stream::~jpeg_decoder_file_stream() + { + close(); + } + + bool jpeg_decoder_file_stream::open(const char* Pfilename) + { + close(); + + m_eof_flag = false; + m_error_flag = false; + +#if defined(_MSC_VER) + m_pFile = nullptr; + fopen_s(&m_pFile, Pfilename, "rb"); +#else + m_pFile = fopen(Pfilename, "rb"); +#endif + return m_pFile != nullptr; + } + + int jpeg_decoder_file_stream::read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag) + { + if (!m_pFile) + return -1; + + if (m_eof_flag) + { + *pEOF_flag = true; + return 0; + } + + if (m_error_flag) + return -1; + + int bytes_read = 
static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile)); + if (bytes_read < max_bytes_to_read) + { + if (ferror(m_pFile)) + { + m_error_flag = true; + return -1; + } + + m_eof_flag = true; + *pEOF_flag = true; + } + + return bytes_read; + } + + bool jpeg_decoder_mem_stream::open(const uint8* pSrc_data, uint size) + { + close(); + m_pSrc_data = pSrc_data; + m_ofs = 0; + m_size = size; + return true; + } + + int jpeg_decoder_mem_stream::read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag) + { + *pEOF_flag = false; + + if (!m_pSrc_data) + return -1; + + uint bytes_remaining = m_size - m_ofs; + if ((uint)max_bytes_to_read > bytes_remaining) + { + max_bytes_to_read = bytes_remaining; + *pEOF_flag = true; + } + + memcpy(pBuf, m_pSrc_data + m_ofs, max_bytes_to_read); + m_ofs += max_bytes_to_read; + + return max_bytes_to_read; + } + + unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags) + { + if (!actual_comps) + return nullptr; + *actual_comps = 0; + + if ((!pStream) || (!width) || (!height) || (!req_comps)) + return nullptr; + + if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4)) + return nullptr; + + jpeg_decoder decoder(pStream, flags); + if (decoder.get_error_code() != JPGD_SUCCESS) + return nullptr; + + const int image_width = decoder.get_width(), image_height = decoder.get_height(); + *width = image_width; + *height = image_height; + *actual_comps = decoder.get_num_components(); + + if (decoder.begin_decoding() != JPGD_SUCCESS) + return nullptr; + + const int dst_bpl = image_width * req_comps; + + uint8* pImage_data = (uint8*)jpgd_malloc(dst_bpl * image_height); + if (!pImage_data) + return nullptr; + + for (int y = 0; y < image_height; y++) + { + const uint8* pScan_line; + uint scan_line_len; + if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS) + { + jpgd_free(pImage_data); + return nullptr; + } + + uint8* pDst = pImage_data + y * dst_bpl; + + if (((req_comps == 1) && (decoder.get_num_components() == 1)) || ((req_comps == 4) && (decoder.get_num_components() == 3))) + memcpy(pDst, pScan_line, dst_bpl); + else if (decoder.get_num_components() == 1) + { + if (req_comps == 3) + { + for (int x = 0; x < image_width; x++) + { + uint8 luma = pScan_line[x]; + pDst[0] = luma; + pDst[1] = luma; + pDst[2] = luma; + pDst += 3; + } + } + else + { + for (int x = 0; x < image_width; x++) + { + uint8 luma = pScan_line[x]; + pDst[0] = luma; + pDst[1] = luma; + pDst[2] = luma; + pDst[3] = 255; + pDst += 4; + } + } + } + else if (decoder.get_num_components() == 3) + { + if (req_comps == 1) + { + const int YR = 19595, YG = 38470, YB = 7471; + for (int x = 0; x < image_width; x++) + { + int r = pScan_line[x * 4 + 0]; + int g = pScan_line[x * 4 + 1]; + int b = pScan_line[x * 4 + 2]; + *pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16); + } + } + else + { + for (int x = 0; x < image_width; x++) + { + pDst[0] = pScan_line[x * 4 + 0]; + pDst[1] = pScan_line[x * 4 + 1]; + pDst[2] = pScan_line[x * 4 + 2]; + pDst += 3; + } + } + } + } + + return pImage_data; + } + + unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags) + { + jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size); + return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps, flags); + } + + unsigned char* 
decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags) + { + jpgd::jpeg_decoder_file_stream file_stream; + if (!file_stream.open(pSrc_filename)) + return nullptr; + return decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps, flags); + } + +} // namespace jpgd diff --git a/thirdparty/basis_universal/encoder/jpgd.h b/thirdparty/basis_universal/encoder/jpgd.h new file mode 100644 index 0000000000..86a7814cae --- /dev/null +++ b/thirdparty/basis_universal/encoder/jpgd.h @@ -0,0 +1,347 @@ +// jpgd.h - C++ class for JPEG decompression. +// Public domain, Rich Geldreich <richgel99@gmail.com> +#ifndef JPEG_DECODER_H +#define JPEG_DECODER_H + +#include <stdlib.h> +#include <stdio.h> +#include <setjmp.h> +#include <assert.h> +#include <stdint.h> + +#ifdef _MSC_VER +#define JPGD_NORETURN __declspec(noreturn) +#elif defined(__GNUC__) +#define JPGD_NORETURN __attribute__ ((noreturn)) +#else +#define JPGD_NORETURN +#endif + +#define JPGD_HUFF_TREE_MAX_LENGTH 512 +#define JPGD_HUFF_CODE_SIZE_MAX_LENGTH 256 + +namespace jpgd +{ + typedef unsigned char uint8; + typedef signed short int16; + typedef unsigned short uint16; + typedef unsigned int uint; + typedef signed int int32; + + // Loads a JPEG image from a memory buffer or a file. + // req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA). + // On return, width/height will be set to the image's dimensions, and actual_comps will be set to the either 1 (grayscale) or 3 (RGB). + // Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly. + // Requesting a 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp. + unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0); + unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0); + + // Success/failure error codes. + enum jpgd_status + { + JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1, + JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE, + JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS, + JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH, + JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER, + JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS, + JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE, + JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, + JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS + }; + + // Input stream interface. + // Derive from this class to read input data from sources other than files or memory. Set m_eof_flag to true when no more data is available. + // The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set. 
+ // It the input stream contains data after the JPEG stream's EOI (end of image) marker it will probably be pulled into the internal buffer. + // Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding. + class jpeg_decoder_stream + { + public: + jpeg_decoder_stream() { } + virtual ~jpeg_decoder_stream() { } + + // The read() method is called when the internal input buffer is empty. + // Parameters: + // pBuf - input buffer + // max_bytes_to_read - maximum bytes that can be written to pBuf + // pEOF_flag - set this to true if at end of stream (no more bytes remaining) + // Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0). + // Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full. + virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag) = 0; + }; + + // stdio FILE stream class. + class jpeg_decoder_file_stream : public jpeg_decoder_stream + { + jpeg_decoder_file_stream(const jpeg_decoder_file_stream&); + jpeg_decoder_file_stream& operator =(const jpeg_decoder_file_stream&); + + FILE* m_pFile; + bool m_eof_flag, m_error_flag; + + public: + jpeg_decoder_file_stream(); + virtual ~jpeg_decoder_file_stream(); + + bool open(const char* Pfilename); + void close(); + + virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag); + }; + + // Memory stream class. + class jpeg_decoder_mem_stream : public jpeg_decoder_stream + { + const uint8* m_pSrc_data; + uint m_ofs, m_size; + + public: + jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { } + jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { } + + virtual ~jpeg_decoder_mem_stream() { } + + bool open(const uint8* pSrc_data, uint size); + void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; } + + virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag); + }; + + // Loads JPEG file from a jpeg_decoder_stream. + unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0); + + enum + { + JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4, + JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768 + }; + + typedef int16 jpgd_quant_t; + typedef int16 jpgd_block_t; + + class jpeg_decoder + { + public: + enum + { + cFlagLinearChromaFiltering = 1 + }; + + // Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc. + // methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline. + jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags = cFlagLinearChromaFiltering); + + ~jpeg_decoder(); + + // Call this method after constructing the object to begin decompression. + // If JPGD_SUCCESS is returned you may then call decode() on each scanline. + + int begin_decoding(); + + // Returns the next scan line. + // For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1). + // Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4). 
+ // Returns JPGD_SUCCESS if a scan line has been returned. + // Returns JPGD_DONE if all scan lines have been returned. + // Returns JPGD_FAILED if an error occurred. Call get_error_code() for a more info. + int decode(const void** pScan_line, uint* pScan_line_len); + + inline jpgd_status get_error_code() const { return m_error_code; } + + inline int get_width() const { return m_image_x_size; } + inline int get_height() const { return m_image_y_size; } + + inline int get_num_components() const { return m_comps_in_frame; } + + inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; } + inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); } + + // Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file). + inline int get_total_bytes_read() const { return m_total_bytes_read; } + + private: + jpeg_decoder(const jpeg_decoder&); + jpeg_decoder& operator =(const jpeg_decoder&); + + typedef void (*pDecode_block_func)(jpeg_decoder*, int, int, int); + + struct huff_tables + { + bool ac_table; + uint look_up[256]; + uint look_up2[256]; + uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH]; + uint tree[JPGD_HUFF_TREE_MAX_LENGTH]; + }; + + struct coeff_buf + { + uint8* pData; + int block_num_x, block_num_y; + int block_len_x, block_len_y; + int block_size; + }; + + struct mem_block + { + mem_block* m_pNext; + size_t m_used_count; + size_t m_size; + char m_data[1]; + }; + + jmp_buf m_jmp_state; + uint32_t m_flags; + mem_block* m_pMem_blocks; + int m_image_x_size; + int m_image_y_size; + jpeg_decoder_stream* m_pStream; + + int m_progressive_flag; + + uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES]; + uint8* m_huff_num[JPGD_MAX_HUFF_TABLES]; // pointer to number of Huffman codes per bit size + uint8* m_huff_val[JPGD_MAX_HUFF_TABLES]; // pointer to Huffman codes per bit size + jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables + int m_scan_type; // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported) + int m_comps_in_frame; // # of components in frame + int m_comp_h_samp[JPGD_MAX_COMPONENTS]; // component's horizontal sampling factor + int m_comp_v_samp[JPGD_MAX_COMPONENTS]; // component's vertical sampling factor + int m_comp_quant[JPGD_MAX_COMPONENTS]; // component's quantization table selector + int m_comp_ident[JPGD_MAX_COMPONENTS]; // component's ID + int m_comp_h_blocks[JPGD_MAX_COMPONENTS]; + int m_comp_v_blocks[JPGD_MAX_COMPONENTS]; + int m_comps_in_scan; // # of components in scan + int m_comp_list[JPGD_MAX_COMPS_IN_SCAN]; // components in this scan + int m_comp_dc_tab[JPGD_MAX_COMPONENTS]; // component's DC Huffman coding table selector + int m_comp_ac_tab[JPGD_MAX_COMPONENTS]; // component's AC Huffman coding table selector + int m_spectral_start; // spectral selection start + int m_spectral_end; // spectral selection end + int m_successive_low; // successive approximation low + int m_successive_high; // successive approximation high + int m_max_mcu_x_size; // MCU's max. X size in pixels + int m_max_mcu_y_size; // MCU's max. 
Y size in pixels + int m_blocks_per_mcu; + int m_max_blocks_per_row; + int m_mcus_per_row, m_mcus_per_col; + int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU]; + int m_total_lines_left; // total # lines left in image + int m_mcu_lines_left; // total # lines left in this MCU + int m_num_buffered_scanlines; + int m_real_dest_bytes_per_scan_line; + int m_dest_bytes_per_scan_line; // rounded up + int m_dest_bytes_per_pixel; // 4 (RGB) or 1 (Y) + huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES]; + coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS]; + coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS]; + int m_eob_run; + int m_block_y_mcu[JPGD_MAX_COMPONENTS]; + uint8* m_pIn_buf_ofs; + int m_in_buf_left; + int m_tem_flag; + + uint8 m_in_buf_pad_start[64]; + uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128]; + uint8 m_in_buf_pad_end[64]; + + int m_bits_left; + uint m_bit_buf; + int m_restart_interval; + int m_restarts_left; + int m_next_restart_num; + int m_max_mcus_per_row; + int m_max_blocks_per_mcu; + + int m_max_mcus_per_col; + uint m_last_dc_val[JPGD_MAX_COMPONENTS]; + jpgd_block_t* m_pMCU_coefficients; + int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU]; + uint8* m_pSample_buf; + uint8* m_pSample_buf_prev; + int m_crr[256]; + int m_cbb[256]; + int m_crg[256]; + int m_cbg[256]; + uint8* m_pScan_line_0; + uint8* m_pScan_line_1; + jpgd_status m_error_code; + int m_total_bytes_read; + + bool m_ready_flag; + bool m_eof_flag; + bool m_sample_buf_prev_valid; + + inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; } + void free_all_blocks(); + JPGD_NORETURN void stop_decoding(jpgd_status status); + void* alloc(size_t n, bool zero = false); + void word_clear(void* p, uint16 c, uint n); + void prep_in_buffer(); + void read_dht_marker(); + void read_dqt_marker(); + void read_sof_marker(); + void skip_variable_marker(); + void read_dri_marker(); + void read_sos_marker(); + int next_marker(); + int process_markers(); + void locate_soi_marker(); + void locate_sof_marker(); + int locate_sos_marker(); + void init(jpeg_decoder_stream* pStream, uint32_t flags); + void create_look_ups(); + void fix_in_buffer(); + void transform_mcu(int mcu_row); + coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y); + inline jpgd_block_t* coeff_buf_getp(coeff_buf* cb, int block_x, int block_y); + void load_next_row(); + void decode_next_row(); + void make_huff_table(int index, huff_tables* pH); + void check_quant_tables(); + void check_huff_tables(); + bool calc_mcu_block_order(); + int init_scan(); + void init_frame(); + void process_restart(); + void decode_scan(pDecode_block_func decode_block_func); + void init_progressive(); + void init_sequential(); + void decode_start(); + void decode_init(jpeg_decoder_stream* pStream, uint32_t flags); + void H2V2Convert(); + uint32_t H2V2ConvertFiltered(); + void H2V1Convert(); + void H2V1ConvertFiltered(); + void H1V2Convert(); + void H1V2ConvertFiltered(); + void H1V1Convert(); + void gray_convert(); + void find_eoi(); + inline uint get_char(); + inline uint get_char(bool* pPadding_flag); + inline void stuff_char(uint8 q); + inline uint8 get_octet(); + inline uint get_bits(int num_bits); + inline uint get_bits_no_markers(int numbits); + inline int huff_decode(huff_tables* pH); + inline int huff_decode(huff_tables* pH, int& extrabits); + + // Clamps a value between 0-255. 
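+		// The cast to uint folds both negative values and values > 255 into one
+		// comparison. For out-of-range i, (~i) >> 31 relies on arithmetic right shift:
+		// a negative i gives a non-negative ~i, so the shift yields 0 (clamp to 0),
+		// while i > 255 gives a negative ~i, so the shift yields all ones and the
+		// & 0xFF leaves 255 (clamp to 255). E.g. clamp(-5) == 0, clamp(300) == 255.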
+ static inline uint8 clamp(int i) + { + if (static_cast<uint>(i) > 255) + i = (((~i) >> 31) & 0xFF); + return static_cast<uint8>(i); + } + int decode_next_mcu_row(); + + static void decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y); + static void decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y); + static void decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y); + static void decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y); + }; + +} // namespace jpgd + +#endif // JPEG_DECODER_H diff --git a/thirdparty/basis_universal/lodepng.cpp b/thirdparty/basis_universal/encoder/lodepng.cpp index cf964d0555..63adcf49b6 100644 --- a/thirdparty/basis_universal/lodepng.cpp +++ b/thirdparty/basis_universal/encoder/lodepng.cpp @@ -29,6 +29,7 @@ Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for */ #ifdef _MSC_VER +#define _CRT_SECURE_NO_DEPRECATE #pragma warning (disable : 4201) #ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL @@ -200,6 +201,7 @@ static void uivector_init(uivector* p) { /*returns 1 if success, 0 if failure ==> nothing done*/ static unsigned uivector_push_back(uivector* p, unsigned c) { if(!uivector_resize(p, p->size + 1)) return 0; + if (!p->data) return 0; p->data[p->size - 1] = c; return 1; } diff --git a/thirdparty/basis_universal/lodepng.h b/thirdparty/basis_universal/encoder/lodepng.h index 476a2061e2..476a2061e2 100644 --- a/thirdparty/basis_universal/lodepng.h +++ b/thirdparty/basis_universal/encoder/lodepng.h diff --git a/thirdparty/basis_universal/transcoder/basisu.h b/thirdparty/basis_universal/transcoder/basisu.h index 25600a69bf..f33baf67c8 100644 --- a/thirdparty/basis_universal/transcoder/basisu.h +++ b/thirdparty/basis_universal/transcoder/basisu.h @@ -1,5 +1,5 @@ // basisu.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -41,10 +41,6 @@ #endif #endif // defined(_DEBUG) || defined(DEBUG) - #ifndef NOMINMAX - #define NOMINMAX - #endif - #endif // BASISU_NO_ITERATOR_DEBUG_LEVEL #endif // _MSC_VER @@ -63,10 +59,11 @@ #include <functional> #include <iterator> #include <type_traits> -#include <vector> #include <assert.h> #include <random> +#include "basisu_containers.h" + #ifdef max #undef max #endif @@ -79,20 +76,20 @@ #define strcasecmp _stricmp #endif -// Set to one to enable debug printf()'s when any errors occur, for development/debugging. -#ifndef BASISU_DEVEL_MESSAGES -#define BASISU_DEVEL_MESSAGES 0 +// Set to one to enable debug printf()'s when any errors occur, for development/debugging. Especially useful for WebGL development. +#ifndef BASISU_FORCE_DEVEL_MESSAGES +#define BASISU_FORCE_DEVEL_MESSAGES 0 #endif #define BASISU_NOTE_UNUSED(x) (void)(x) #define BASISU_ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) #define BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(x) x(const x &) = delete; x& operator= (const x &) = delete; #define BASISU_ASSUME(x) static_assert(x, #x); -#define BASISU_OFFSETOF(s, m) (uint32_t)(intptr_t)(&((s *)(0))->m) +#define BASISU_OFFSETOF(s, m) offsetof(s, m) #define BASISU_STRINGIZE(x) #x #define BASISU_STRINGIZE2(x) BASISU_STRINGIZE(x) -#if BASISU_DEVEL_MESSAGES +#if BASISU_FORCE_DEVEL_MESSAGES #define BASISU_DEVEL_ERROR(...) 
do { basisu::debug_printf(__VA_ARGS__); } while(0) #else #define BASISU_DEVEL_ERROR(...) @@ -108,26 +105,43 @@ namespace basisu const char BASISU_PATH_SEPERATOR_CHAR = '/'; #endif - typedef std::vector<uint8_t> uint8_vec; - typedef std::vector<int16_t> int16_vec; - typedef std::vector<uint16_t> uint16_vec; - typedef std::vector<uint32_t> uint_vec; - typedef std::vector<uint64_t> uint64_vec; - typedef std::vector<int> int_vec; - typedef std::vector<bool> bool_vec; + typedef basisu::vector<uint8_t> uint8_vec; + typedef basisu::vector<int16_t> int16_vec; + typedef basisu::vector<uint16_t> uint16_vec; + typedef basisu::vector<uint32_t> uint_vec; + typedef basisu::vector<uint64_t> uint64_vec; + typedef basisu::vector<int> int_vec; + typedef basisu::vector<bool> bool_vec; void enable_debug_printf(bool enabled); void debug_printf(const char *pFmt, ...); + template <typename T> inline void clear_obj(T& obj) { memset(&obj, 0, sizeof(obj)); } template <typename T0, typename T1> inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; } template <typename S> inline S maximum(S a, S b) { return (a > b) ? a : b; } template <typename S> inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); } + template <typename S> inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); } template <typename S> inline S minimum(S a, S b) { return (a < b) ? a : b; } template <typename S> inline S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); } + template <typename S> inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); } + + inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } + inline float saturate(float value) { return clampf(value, 0, 1.0f); } + inline uint8_t minimumub(uint8_t a, uint8_t b) { return (a < b) ? a : b; } + inline uint32_t minimumu(uint32_t a, uint32_t b) { return (a < b) ? a : b; } + inline int32_t minimumi(int32_t a, int32_t b) { return (a < b) ? a : b; } + inline float minimumf(float a, float b) { return (a < b) ? a : b; } + inline uint8_t maximumub(uint8_t a, uint8_t b) { return (a > b) ? a : b; } + inline uint32_t maximumu(uint32_t a, uint32_t b) { return (a > b) ? a : b; } + inline int32_t maximumi(int32_t a, int32_t b) { return (a > b) ? a : b; } + inline float maximumf(float a, float b) { return (a > b) ? a : b; } + inline int squarei(int i) { return i * i; } + inline float squaref(float i) { return i * i; } + template<typename T> inline T square(T a) { return a * a; } template <typename S> inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? 
high : value); } @@ -137,12 +151,10 @@ namespace basisu template<typename T> inline void clear_vector(T &vec) { vec.erase(vec.begin(), vec.end()); } template<typename T> inline typename T::value_type *enlarge_vector(T &vec, size_t n) { size_t cs = vec.size(); vec.resize(cs + n); return &vec[cs]; } - template<typename S> inline S square(S val) { return val * val; } - inline bool is_pow2(uint32_t x) { return x && ((x & (x - 1U)) == 0U); } inline bool is_pow2(uint64_t x) { return x && ((x & (x - 1U)) == 0U); } - template<typename T> inline T open_range_check(T v, T minv, T maxv) { assert(v >= minv && v < maxv); return v; } + template<typename T> inline T open_range_check(T v, T minv, T maxv) { assert(v >= minv && v < maxv); BASISU_NOTE_UNUSED(minv); BASISU_NOTE_UNUSED(maxv); return v; } template<typename T> inline T open_range_check(T v, T maxv) { assert(v < maxv); BASISU_NOTE_UNUSED(maxv); return v; } inline uint32_t total_bits(uint32_t v) { uint32_t l = 0; for ( ; v > 0U; ++l) v >>= 1; return l; } @@ -244,27 +256,92 @@ namespace basisu if ((ha <= lb) || (la >= hb)) return false; return true; } + + static inline uint32_t read_le_dword(const uint8_t *pBytes) + { + return (pBytes[3] << 24U) | (pBytes[2] << 16U) | (pBytes[1] << 8U) | (pBytes[0]); + } + + static inline void write_le_dword(uint8_t* pBytes, uint32_t val) + { + pBytes[0] = (uint8_t)val; + pBytes[1] = (uint8_t)(val >> 8U); + pBytes[2] = (uint8_t)(val >> 16U); + pBytes[3] = (uint8_t)(val >> 24U); + } - // Always little endian 2-4 byte unsigned int + // Always little endian 1-8 byte unsigned int template<uint32_t NumBytes> struct packed_uint { uint8_t m_bytes[NumBytes]; - inline packed_uint() { static_assert(NumBytes <= 4, "NumBytes <= 4"); } - inline packed_uint(uint32_t v) { *this = v; } + inline packed_uint() { static_assert(NumBytes <= sizeof(uint64_t), "Invalid NumBytes"); } + inline packed_uint(uint64_t v) { *this = v; } inline packed_uint(const packed_uint& other) { *this = other; } + + inline packed_uint& operator= (uint64_t v) + { + for (uint32_t i = 0; i < NumBytes; i++) + m_bytes[i] = static_cast<uint8_t>(v >> (i * 8)); + return *this; + } - inline packed_uint& operator= (uint32_t v) { for (uint32_t i = 0; i < NumBytes; i++) m_bytes[i] = static_cast<uint8_t>(v >> (i * 8)); return *this; } + inline packed_uint& operator= (const packed_uint& rhs) + { + memcpy(m_bytes, rhs.m_bytes, sizeof(m_bytes)); + return *this; + } inline operator uint32_t() const { switch (NumBytes) { - case 1: return m_bytes[0]; - case 2: return (m_bytes[1] << 8U) | m_bytes[0]; - case 3: return (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | (m_bytes[0]); - default: return (m_bytes[3] << 24U) | (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | (m_bytes[0]); + case 1: + { + return m_bytes[0]; + } + case 2: + { + return (m_bytes[1] << 8U) | m_bytes[0]; + } + case 3: + { + return (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | m_bytes[0]; + } + case 4: + { + return read_le_dword(m_bytes); + } + case 5: + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = m_bytes[4]; + return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); + } + case 6: + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = (m_bytes[5] << 8U) | m_bytes[4]; + return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); + } + case 7: + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = (m_bytes[6] << 16U) | (m_bytes[5] << 8U) | m_bytes[4]; + return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); + } + case 8: + { + uint32_t l = read_le_dword(m_bytes); + uint32_t 
h = read_le_dword(m_bytes + 4); + return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); + } + default: + { + assert(0); + return 0; + } } } }; @@ -278,7 +355,7 @@ namespace basisu enum { cHuffmanMaxSupportedCodeSize = 16, cHuffmanMaxSupportedInternalCodeSize = 31, - cHuffmanFastLookupBits = 10, cHuffmanFastLookupSize = 1 << cHuffmanFastLookupBits, + cHuffmanFastLookupBits = 10, cHuffmanMaxSymsLog2 = 14, cHuffmanMaxSyms = 1 << cHuffmanMaxSymsLog2, // Small zero runs @@ -308,15 +385,15 @@ namespace basisu // Block-based formats cETC1, // ETC1 cETC1S, // ETC1 (subset: diff colors only, no subblocks) - cETC2_RGB, // ETC2 color block - cETC2_RGBA, // ETC2 alpha block followed by ETC2 color block + cETC2_RGB, // ETC2 color block (basisu doesn't support ETC2 planar/T/H modes - just basic ETC1) + cETC2_RGBA, // ETC2 EAC alpha block followed by ETC2 color block cETC2_ALPHA, // ETC2 EAC alpha block cBC1, // DXT1 - cBC3, // DXT5 (DXT5A block followed by a DXT1 block) + cBC3, // DXT5 (BC4/DXT5A block followed by a BC1/DXT1 block) cBC4, // DXT5A - cBC5, // 3DC/DXN (two DXT5A blocks) + cBC5, // 3DC/DXN (two BC4/DXT5A blocks) cBC7, - cASTC4x4, + cASTC4x4, // LDR only cPVRTC1_4_RGB, cPVRTC1_4_RGBA, cATC_RGB, @@ -325,6 +402,9 @@ namespace basisu cPVRTC2_4_RGBA, cETC2_R11_EAC, cETC2_RG11_EAC, + cUASTC4x4, + cBC1_NV, + cBC1_AMD, // Uncompressed/raw pixels cRGBA32, @@ -343,6 +423,8 @@ namespace basisu case texture_format::cETC2_RGB: case texture_format::cETC2_ALPHA: case texture_format::cBC1: + case texture_format::cBC1_NV: + case texture_format::cBC1_AMD: case texture_format::cBC4: case texture_format::cPVRTC1_4_RGB: case texture_format::cPVRTC1_4_RGBA: diff --git a/thirdparty/basis_universal/transcoder/basisu_containers.h b/thirdparty/basis_universal/transcoder/basisu_containers.h new file mode 100644 index 0000000000..1ca4bab307 --- /dev/null +++ b/thirdparty/basis_universal/transcoder/basisu_containers.h @@ -0,0 +1,1908 @@ +// basisu_containers.h +#pragma once +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <assert.h> +#include <algorithm> + +#if defined(__linux__) && !defined(ANDROID) +// Only for malloc_usable_size() in basisu_containers_impl.h +#include <malloc.h> +#define HAS_MALLOC_USABLE_SIZE 1 +#endif + +#ifdef _MSC_VER +#define BASISU_FORCE_INLINE __forceinline +#else +#define BASISU_FORCE_INLINE inline +#endif + +namespace basisu +{ + enum { cInvalidIndex = -1 }; + + namespace helpers + { + inline bool is_power_of_2(uint32_t x) { return x && ((x & (x - 1U)) == 0U); } + inline bool is_power_of_2(uint64_t x) { return x && ((x & (x - 1U)) == 0U); } + template<class T> const T& minimum(const T& a, const T& b) { return (b < a) ? b : a; } + template<class T> const T& maximum(const T& a, const T& b) { return (a < b) ? 
b : a; } + + inline uint32_t floor_log2i(uint32_t v) + { + uint32_t l = 0; + while (v > 1U) + { + v >>= 1; + l++; + } + return l; + } + + inline uint32_t next_pow2(uint32_t val) + { + val--; + val |= val >> 16; + val |= val >> 8; + val |= val >> 4; + val |= val >> 2; + val |= val >> 1; + return val + 1; + } + + inline uint64_t next_pow2(uint64_t val) + { + val--; + val |= val >> 32; + val |= val >> 16; + val |= val >> 8; + val |= val >> 4; + val |= val >> 2; + val |= val >> 1; + return val + 1; + } + } // namespace helpers + + template <typename T> + inline T* construct(T* p) + { + return new (static_cast<void*>(p)) T; + } + + template <typename T, typename U> + inline T* construct(T* p, const U& init) + { + return new (static_cast<void*>(p)) T(init); + } + + template <typename T> + inline void construct_array(T* p, size_t n) + { + T* q = p + n; + for (; p != q; ++p) + new (static_cast<void*>(p)) T; + } + + template <typename T, typename U> + inline void construct_array(T* p, size_t n, const U& init) + { + T* q = p + n; + for (; p != q; ++p) + new (static_cast<void*>(p)) T(init); + } + + template <typename T> + inline void destruct(T* p) + { + (void)p; + p->~T(); + } + + template <typename T> inline void destruct_array(T* p, size_t n) + { + T* q = p + n; + for (; p != q; ++p) + p->~T(); + } + + template<typename T> struct int_traits { enum { cMin = INT32_MIN, cMax = INT32_MAX, cSigned = true }; }; + + template<> struct int_traits<int8_t> { enum { cMin = INT8_MIN, cMax = INT8_MAX, cSigned = true }; }; + template<> struct int_traits<int16_t> { enum { cMin = INT16_MIN, cMax = INT16_MAX, cSigned = true }; }; + template<> struct int_traits<int32_t> { enum { cMin = INT32_MIN, cMax = INT32_MAX, cSigned = true }; }; + + template<> struct int_traits<uint8_t> { enum { cMin = 0, cMax = UINT8_MAX, cSigned = false }; }; + template<> struct int_traits<uint16_t> { enum { cMin = 0, cMax = UINT16_MAX, cSigned = false }; }; + template<> struct int_traits<uint32_t> { enum { cMin = 0, cMax = UINT32_MAX, cSigned = false }; }; + + template<typename T> + struct scalar_type + { + enum { cFlag = false }; + static inline void construct(T* p) { basisu::construct(p); } + static inline void construct(T* p, const T& init) { basisu::construct(p, init); } + static inline void construct_array(T* p, size_t n) { basisu::construct_array(p, n); } + static inline void destruct(T* p) { basisu::destruct(p); } + static inline void destruct_array(T* p, size_t n) { basisu::destruct_array(p, n); } + }; + + template<typename T> struct scalar_type<T*> + { + enum { cFlag = true }; + static inline void construct(T** p) { memset(p, 0, sizeof(T*)); } + static inline void construct(T** p, T* init) { *p = init; } + static inline void construct_array(T** p, size_t n) { memset(p, 0, sizeof(T*) * n); } + static inline void destruct(T** p) { p; } + static inline void destruct_array(T** p, size_t n) { p, n; } + }; + +#define BASISU_DEFINE_BUILT_IN_TYPE(X) \ + template<> struct scalar_type<X> { \ + enum { cFlag = true }; \ + static inline void construct(X* p) { memset(p, 0, sizeof(X)); } \ + static inline void construct(X* p, const X& init) { memcpy(p, &init, sizeof(X)); } \ + static inline void construct_array(X* p, size_t n) { memset(p, 0, sizeof(X) * n); } \ + static inline void destruct(X* p) { p; } \ + static inline void destruct_array(X* p, size_t n) { p, n; } }; + + BASISU_DEFINE_BUILT_IN_TYPE(bool) + BASISU_DEFINE_BUILT_IN_TYPE(char) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned char) + BASISU_DEFINE_BUILT_IN_TYPE(short) + 
BASISU_DEFINE_BUILT_IN_TYPE(unsigned short) + BASISU_DEFINE_BUILT_IN_TYPE(int) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned int) + BASISU_DEFINE_BUILT_IN_TYPE(long) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned long) +#ifdef __GNUC__ + BASISU_DEFINE_BUILT_IN_TYPE(long long) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned long long) +#else + BASISU_DEFINE_BUILT_IN_TYPE(__int64) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned __int64) +#endif + BASISU_DEFINE_BUILT_IN_TYPE(float) + BASISU_DEFINE_BUILT_IN_TYPE(double) + BASISU_DEFINE_BUILT_IN_TYPE(long double) + +#undef BASISU_DEFINE_BUILT_IN_TYPE + + template<typename T> + struct bitwise_movable { enum { cFlag = false }; }; + +#define BASISU_DEFINE_BITWISE_MOVABLE(Q) template<> struct bitwise_movable<Q> { enum { cFlag = true }; }; + + template<typename T> + struct bitwise_copyable { enum { cFlag = false }; }; + +#define BASISU_DEFINE_BITWISE_COPYABLE(Q) template<> struct bitwise_copyable<Q> { enum { cFlag = true }; }; + +#define BASISU_IS_POD(T) __is_pod(T) + +#define BASISU_IS_SCALAR_TYPE(T) (scalar_type<T>::cFlag) + +#if defined(__GNUC__) && __GNUC__<5 + #define BASISU_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else + #define BASISU_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// TODO: clean this up +#define BASISU_IS_BITWISE_COPYABLE(T) (BASISU_IS_SCALAR_TYPE(T) || BASISU_IS_POD(T) || BASISU_IS_TRIVIALLY_COPYABLE(T) || (bitwise_copyable<T>::cFlag)) + +#define BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T) (BASISU_IS_BITWISE_COPYABLE(T) || (bitwise_movable<T>::cFlag)) + +#define BASISU_HAS_DESTRUCTOR(T) ((!scalar_type<T>::cFlag) && (!__is_pod(T))) + + typedef char(&yes_t)[1]; + typedef char(&no_t)[2]; + + template <class U> yes_t class_test(int U::*); + template <class U> no_t class_test(...); + + template <class T> struct is_class + { + enum { value = (sizeof(class_test<T>(0)) == sizeof(yes_t)) }; + }; + + template <typename T> struct is_pointer + { + enum { value = false }; + }; + + template <typename T> struct is_pointer<T*> + { + enum { value = true }; + }; + + struct empty_type { }; + + BASISU_DEFINE_BITWISE_COPYABLE(empty_type); + BASISU_DEFINE_BITWISE_MOVABLE(empty_type); + + template<typename T> struct rel_ops + { + friend bool operator!=(const T& x, const T& y) { return (!(x == y)); } + friend bool operator> (const T& x, const T& y) { return (y < x); } + friend bool operator<=(const T& x, const T& y) { return (!(y < x)); } + friend bool operator>=(const T& x, const T& y) { return (!(x < y)); } + }; + + struct elemental_vector + { + void* m_p; + uint32_t m_size; + uint32_t m_capacity; + + typedef void (*object_mover)(void* pDst, void* pSrc, uint32_t num); + + bool increase_capacity(uint32_t min_new_capacity, bool grow_hint, uint32_t element_size, object_mover pRelocate, bool nofail); + }; + + template<typename T> + class vector : public rel_ops< vector<T> > + { + public: + typedef T* iterator; + typedef const T* const_iterator; + typedef T value_type; + typedef T& reference; + typedef const T& const_reference; + typedef T* pointer; + typedef const T* const_pointer; + + inline vector() : + m_p(NULL), + m_size(0), + m_capacity(0) + { + } + + inline vector(uint32_t n, const T& init) : + m_p(NULL), + m_size(0), + m_capacity(0) + { + increase_capacity(n, false); + construct_array(m_p, n, init); + m_size = n; + } + + inline vector(const vector& other) : + m_p(NULL), + m_size(0), + m_capacity(0) + { + increase_capacity(other.m_size, false); + + m_size = other.m_size; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + memcpy(m_p, 
other.m_p, m_size * sizeof(T)); + else + { + T* pDst = m_p; + const T* pSrc = other.m_p; + for (uint32_t i = m_size; i > 0; i--) + construct(pDst++, *pSrc++); + } + } + + inline explicit vector(size_t size) : + m_p(NULL), + m_size(0), + m_capacity(0) + { + resize(size); + } + + inline ~vector() + { + if (m_p) + { + scalar_type<T>::destruct_array(m_p, m_size); + free(m_p); + } + } + + inline vector& operator= (const vector& other) + { + if (this == &other) + return *this; + + if (m_capacity >= other.m_size) + resize(0); + else + { + clear(); + increase_capacity(other.m_size, false); + } + + if (BASISU_IS_BITWISE_COPYABLE(T)) + memcpy(m_p, other.m_p, other.m_size * sizeof(T)); + else + { + T* pDst = m_p; + const T* pSrc = other.m_p; + for (uint32_t i = other.m_size; i > 0; i--) + construct(pDst++, *pSrc++); + } + + m_size = other.m_size; + + return *this; + } + + BASISU_FORCE_INLINE const T* begin() const { return m_p; } + BASISU_FORCE_INLINE T* begin() { return m_p; } + + BASISU_FORCE_INLINE const T* end() const { return m_p + m_size; } + BASISU_FORCE_INLINE T* end() { return m_p + m_size; } + + BASISU_FORCE_INLINE bool empty() const { return !m_size; } + BASISU_FORCE_INLINE uint32_t size() const { return m_size; } + BASISU_FORCE_INLINE uint32_t size_in_bytes() const { return m_size * sizeof(T); } + BASISU_FORCE_INLINE uint32_t capacity() const { return m_capacity; } + + // operator[] will assert on out of range indices, but in final builds there is (and will never be) any range checking on this method. + //BASISU_FORCE_INLINE const T& operator[] (uint32_t i) const { assert(i < m_size); return m_p[i]; } + //BASISU_FORCE_INLINE T& operator[] (uint32_t i) { assert(i < m_size); return m_p[i]; } + + BASISU_FORCE_INLINE const T& operator[] (size_t i) const { assert(i < m_size); return m_p[i]; } + BASISU_FORCE_INLINE T& operator[] (size_t i) { assert(i < m_size); return m_p[i]; } + + // at() always includes range checking, even in final builds, unlike operator []. + // The first element is returned if the index is out of range. + BASISU_FORCE_INLINE const T& at(size_t i) const { assert(i < m_size); return (i >= m_size) ? m_p[0] : m_p[i]; } + BASISU_FORCE_INLINE T& at(size_t i) { assert(i < m_size); return (i >= m_size) ? m_p[0] : m_p[i]; } + + BASISU_FORCE_INLINE const T& front() const { assert(m_size); return m_p[0]; } + BASISU_FORCE_INLINE T& front() { assert(m_size); return m_p[0]; } + + BASISU_FORCE_INLINE const T& back() const { assert(m_size); return m_p[m_size - 1]; } + BASISU_FORCE_INLINE T& back() { assert(m_size); return m_p[m_size - 1]; } + + BASISU_FORCE_INLINE const T* get_ptr() const { return m_p; } + BASISU_FORCE_INLINE T* get_ptr() { return m_p; } + + BASISU_FORCE_INLINE const T* data() const { return m_p; } + BASISU_FORCE_INLINE T* data() { return m_p; } + + // clear() sets the container to empty, then frees the allocated block. + inline void clear() + { + if (m_p) + { + scalar_type<T>::destruct_array(m_p, m_size); + free(m_p); + m_p = NULL; + m_size = 0; + m_capacity = 0; + } + } + + inline void clear_no_destruction() + { + if (m_p) + { + free(m_p); + m_p = NULL; + m_size = 0; + m_capacity = 0; + } + } + + inline void reserve(size_t new_capacity_size_t) + { + if (new_capacity_size_t > UINT32_MAX) + { + assert(0); + return; + } + + uint32_t new_capacity = (uint32_t)new_capacity_size_t; + + if (new_capacity > m_capacity) + increase_capacity(new_capacity, false); + else if (new_capacity < m_capacity) + { + // Must work around the lack of a "decrease_capacity()" method. 
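The copy constructor and `operator=` above branch on `BASISU_IS_BITWISE_COPYABLE(T)`: types that pass the check are copied with a single `memcpy()`, everything else goes through per-element construction. A minimal sketch of how client code could opt its own type into that fast path, assuming the type really is safe to copy bytewise; the `my_pixel` struct, the include path and the `static_assert` are illustrative and not part of the upstream sources:

```
#include <cstdint>
#include <type_traits>
#include "basisu_containers.h" // illustrative include; use whatever path the project exposes

// Hypothetical user type: a plain aggregate of built-in fields, safe to memcpy().
struct my_pixel { uint8_t r, g, b, a; };

namespace basisu
{
    // Register the type with the trait machinery. For a trivially copyable
    // struct this is usually redundant (std::is_trivially_copyable already
    // detects it); the explicit markers matter on the old-GCC fallback path
    // above, or for types the compiler traits cannot see through.
    BASISU_DEFINE_BITWISE_COPYABLE(my_pixel);
    BASISU_DEFINE_BITWISE_MOVABLE(my_pixel);

    static_assert(BASISU_IS_BITWISE_COPYABLE(my_pixel), "memcpy fast path enabled");
}
```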
+ // This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize. + vector tmp; + tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false); + tmp = *this; + swap(tmp); + } + } + + inline bool try_reserve(size_t new_capacity_size_t) + { + if (new_capacity_size_t > UINT32_MAX) + { + assert(0); + return false; + } + + uint32_t new_capacity = (uint32_t)new_capacity_size_t; + + if (new_capacity > m_capacity) + { + if (!increase_capacity(new_capacity, false)) + return false; + } + else if (new_capacity < m_capacity) + { + // Must work around the lack of a "decrease_capacity()" method. + // This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize. + vector tmp; + tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false); + tmp = *this; + swap(tmp); + } + + return true; + } + + // resize(0) sets the container to empty, but does not free the allocated block. + inline void resize(size_t new_size_size_t, bool grow_hint = false) + { + if (new_size_size_t > UINT32_MAX) + { + assert(0); + return; + } + + uint32_t new_size = (uint32_t)new_size_size_t; + + if (m_size != new_size) + { + if (new_size < m_size) + scalar_type<T>::destruct_array(m_p + new_size, m_size - new_size); + else + { + if (new_size > m_capacity) + increase_capacity(new_size, (new_size == (m_size + 1)) || grow_hint); + + scalar_type<T>::construct_array(m_p + m_size, new_size - m_size); + } + + m_size = new_size; + } + } + + inline bool try_resize(size_t new_size_size_t, bool grow_hint = false) + { + if (new_size_size_t > UINT32_MAX) + { + assert(0); + return false; + } + + uint32_t new_size = (uint32_t)new_size_size_t; + + if (m_size != new_size) + { + if (new_size < m_size) + scalar_type<T>::destruct_array(m_p + new_size, m_size - new_size); + else + { + if (new_size > m_capacity) + { + if (!increase_capacity(new_size, (new_size == (m_size + 1)) || grow_hint, true)) + return false; + } + + scalar_type<T>::construct_array(m_p + m_size, new_size - m_size); + } + + m_size = new_size; + } + + return true; + } + + // If size >= capacity/2, reset() sets the container's size to 0 but doesn't free the allocated block (because the container may be similarly loaded in the future). + // Otherwise it blows away the allocated block. 
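Besides the `reserve()`/`resize()` entry points above, which abort the process on allocation failure, the container also provides `try_reserve()` and `try_resize()` (and, just below, `try_push_back()` and `try_enlarge()`), which pass `nofail` down to `increase_capacity()` and simply return `false` instead. A small fail-soft usage sketch; the helper function and its recovery policy are hypothetical:

```
#include <cstddef>
#include <cstdint>
#include "basisu_containers.h" // illustrative include path

// Hypothetical fail-soft buffer setup: reports failure to the caller instead
// of letting the vector abort() the process when the allocation cannot be met.
static bool alloc_pixel_buffer(basisu::vector<uint32_t>& out, size_t total_pixels)
{
    if (!out.try_resize(total_pixels))
        return false; // allocation failed; caller decides how to recover

    for (size_t i = 0; i < total_pixels; i++)
        out[i] = 0xFF000000u; // opaque black placeholder

    return true;
}
```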
See http://www.codercorner.com/blog/?p=494 + inline void reset() + { + if (m_size >= (m_capacity >> 1)) + resize(0); + else + clear(); + } + + inline T* enlarge(uint32_t i) + { + uint32_t cur_size = m_size; + resize(cur_size + i, true); + return get_ptr() + cur_size; + } + + inline T* try_enlarge(uint32_t i) + { + uint32_t cur_size = m_size; + if (!try_resize(cur_size + i, true)) + return NULL; + return get_ptr() + cur_size; + } + + BASISU_FORCE_INLINE void push_back(const T& obj) + { + assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); + + if (m_size >= m_capacity) + increase_capacity(m_size + 1, true); + + scalar_type<T>::construct(m_p + m_size, obj); + m_size++; + } + + inline bool try_push_back(const T& obj) + { + assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); + + if (m_size >= m_capacity) + { + if (!increase_capacity(m_size + 1, true, true)) + return false; + } + + scalar_type<T>::construct(m_p + m_size, obj); + m_size++; + + return true; + } + + inline void push_back_value(T obj) + { + if (m_size >= m_capacity) + increase_capacity(m_size + 1, true); + + scalar_type<T>::construct(m_p + m_size, obj); + m_size++; + } + + inline void pop_back() + { + assert(m_size); + + if (m_size) + { + m_size--; + scalar_type<T>::destruct(&m_p[m_size]); + } + } + + inline void insert(uint32_t index, const T* p, uint32_t n) + { + assert(index <= m_size); + if (!n) + return; + + const uint32_t orig_size = m_size; + resize(m_size + n, true); + + const uint32_t num_to_move = orig_size - index; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + // This overwrites the destination object bits, but bitwise copyable means we don't need to worry about destruction. + memmove(m_p + index + n, m_p + index, sizeof(T) * num_to_move); + } + else + { + const T* pSrc = m_p + orig_size - 1; + T* pDst = const_cast<T*>(pSrc) + n; + + for (uint32_t i = 0; i < num_to_move; i++) + { + assert((pDst - m_p) < (int)m_size); + *pDst-- = *pSrc--; + } + } + + T* pDst = m_p + index; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + // This copies in the new bits, overwriting the existing objects, which is OK for copyable types that don't need destruction. + memcpy(pDst, p, sizeof(T) * n); + } + else + { + for (uint32_t i = 0; i < n; i++) + { + assert((pDst - m_p) < (int)m_size); + *pDst++ = *p++; + } + } + } + + inline void insert(T* p, const T& obj) + { + int64_t ofs = p - begin(); + if ((ofs < 0) || (ofs > UINT32_MAX)) + { + assert(0); + return; + } + + insert((uint32_t)ofs, &obj, 1); + } + + // push_front() isn't going to be very fast - it's only here for usability. + inline void push_front(const T& obj) + { + insert(0, &obj, 1); + } + + vector& append(const vector& other) + { + if (other.m_size) + insert(m_size, &other[0], other.m_size); + return *this; + } + + vector& append(const T* p, uint32_t n) + { + if (n) + insert(m_size, p, n); + return *this; + } + + inline void erase(uint32_t start, uint32_t n) + { + assert((start + n) <= m_size); + if ((start + n) > m_size) + return; + + if (!n) + return; + + const uint32_t num_to_move = m_size - (start + n); + + T* pDst = m_p + start; + + const T* pSrc = m_p + start + n; + + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T)) + { + // This test is overly cautious. + if ((!BASISU_IS_BITWISE_COPYABLE(T)) || (BASISU_HAS_DESTRUCTOR(T))) + { + // Type has been marked explictly as bitwise movable, which means we can move them around but they may need to be destructed. + // First destroy the erased objects. 
+ scalar_type<T>::destruct_array(pDst, n); + } + + // Copy "down" the objects to preserve, filling in the empty slots. + memmove(pDst, pSrc, num_to_move * sizeof(T)); + } + else + { + // Type is not bitwise copyable or movable. + // Move them down one at a time by using the equals operator, and destroying anything that's left over at the end. + T* pDst_end = pDst + num_to_move; + while (pDst != pDst_end) + *pDst++ = *pSrc++; + + scalar_type<T>::destruct_array(pDst_end, n); + } + + m_size -= n; + } + + inline void erase(uint32_t index) + { + erase(index, 1); + } + + inline void erase(T* p) + { + assert((p >= m_p) && (p < (m_p + m_size))); + erase(static_cast<uint32_t>(p - m_p)); + } + + inline void erase(T *pFirst, T *pEnd) + { + assert(pFirst <= pEnd); + assert(pFirst >= begin() && pFirst <= end()); + assert(pEnd >= begin() && pEnd <= end()); + + int64_t ofs = pFirst - begin(); + if ((ofs < 0) || (ofs > UINT32_MAX)) + { + assert(0); + return; + } + + int64_t n = pEnd - pFirst; + if ((n < 0) || (n > UINT32_MAX)) + { + assert(0); + return; + } + + erase((uint32_t)ofs, (uint32_t)n); + } + + void erase_unordered(uint32_t index) + { + assert(index < m_size); + + if ((index + 1) < m_size) + (*this)[index] = back(); + + pop_back(); + } + + inline bool operator== (const vector& rhs) const + { + if (m_size != rhs.m_size) + return false; + else if (m_size) + { + if (scalar_type<T>::cFlag) + return memcmp(m_p, rhs.m_p, sizeof(T) * m_size) == 0; + else + { + const T* pSrc = m_p; + const T* pDst = rhs.m_p; + for (uint32_t i = m_size; i; i--) + if (!(*pSrc++ == *pDst++)) + return false; + } + } + + return true; + } + + inline bool operator< (const vector& rhs) const + { + const uint32_t min_size = helpers::minimum(m_size, rhs.m_size); + + const T* pSrc = m_p; + const T* pSrc_end = m_p + min_size; + const T* pDst = rhs.m_p; + + while ((pSrc < pSrc_end) && (*pSrc == *pDst)) + { + pSrc++; + pDst++; + } + + if (pSrc < pSrc_end) + return *pSrc < *pDst; + + return m_size < rhs.m_size; + } + + inline void swap(vector& other) + { + std::swap(m_p, other.m_p); + std::swap(m_size, other.m_size); + std::swap(m_capacity, other.m_capacity); + } + + inline void sort() + { + std::sort(begin(), end()); + } + + inline void unique() + { + if (!empty()) + { + sort(); + + resize(std::unique(begin(), end()) - begin()); + } + } + + inline void reverse() + { + uint32_t j = m_size >> 1; + for (uint32_t i = 0; i < j; i++) + std::swap(m_p[i], m_p[m_size - 1 - i]); + } + + inline int find(const T& key) const + { + const T* p = m_p; + const T* p_end = m_p + m_size; + + uint32_t index = 0; + + while (p != p_end) + { + if (key == *p) + return index; + + p++; + index++; + } + + return cInvalidIndex; + } + + inline int find_sorted(const T& key) const + { + if (m_size) + { + // Uniform binary search - Knuth Algorithm 6.2.1 U, unrolled twice. 
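`find()` above is a plain linear scan, while the `find_sorted()` overloads that follow implement Knuth's uniform binary search and therefore require the elements to already be ordered by `operator<` (or by the caller-supplied `less_than` functor). A usage sketch under that assumption; the helper and its data are illustrative:

```
#include "basisu_containers.h" // illustrative include path

// Illustrative lookup helper: establish the ordering once, then binary search.
static int index_of(basisu::vector<int>& v, int key)
{
    v.sort();                  // ascending order is a precondition of find_sorted()
    return v.find_sorted(key); // index of key, or the not-found sentinel (cInvalidIndex)
}
```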
+ int i = ((m_size + 1) >> 1) - 1; + int m = m_size; + + for (; ; ) + { + assert(i >= 0 && i < (int)m_size); + const T* pKey_i = m_p + i; + int cmp = key < *pKey_i; +#if defined(_DEBUG) || defined(DEBUG) + int cmp2 = *pKey_i < key; + assert((cmp != cmp2) || (key == *pKey_i)); +#endif + if ((!cmp) && (key == *pKey_i)) return i; + m >>= 1; + if (!m) break; + cmp = -cmp; + i += (((m + 1) >> 1) ^ cmp) - cmp; + if (i < 0) + break; + + assert(i >= 0 && i < (int)m_size); + pKey_i = m_p + i; + cmp = key < *pKey_i; +#if defined(_DEBUG) || defined(DEBUG) + cmp2 = *pKey_i < key; + assert((cmp != cmp2) || (key == *pKey_i)); +#endif + if ((!cmp) && (key == *pKey_i)) return i; + m >>= 1; + if (!m) break; + cmp = -cmp; + i += (((m + 1) >> 1) ^ cmp) - cmp; + if (i < 0) + break; + } + } + + return cInvalidIndex; + } + + template<typename Q> + inline int find_sorted(const T& key, Q less_than) const + { + if (m_size) + { + // Uniform binary search - Knuth Algorithm 6.2.1 U, unrolled twice. + int i = ((m_size + 1) >> 1) - 1; + int m = m_size; + + for (; ; ) + { + assert(i >= 0 && i < (int)m_size); + const T* pKey_i = m_p + i; + int cmp = less_than(key, *pKey_i); + if ((!cmp) && (!less_than(*pKey_i, key))) return i; + m >>= 1; + if (!m) break; + cmp = -cmp; + i += (((m + 1) >> 1) ^ cmp) - cmp; + if (i < 0) + break; + + assert(i >= 0 && i < (int)m_size); + pKey_i = m_p + i; + cmp = less_than(key, *pKey_i); + if ((!cmp) && (!less_than(*pKey_i, key))) return i; + m >>= 1; + if (!m) break; + cmp = -cmp; + i += (((m + 1) >> 1) ^ cmp) - cmp; + if (i < 0) + break; + } + } + + return cInvalidIndex; + } + + inline uint32_t count_occurences(const T& key) const + { + uint32_t c = 0; + + const T* p = m_p; + const T* p_end = m_p + m_size; + + while (p != p_end) + { + if (key == *p) + c++; + + p++; + } + + return c; + } + + inline void set_all(const T& o) + { + if ((sizeof(T) == 1) && (scalar_type<T>::cFlag)) + memset(m_p, *reinterpret_cast<const uint8_t*>(&o), m_size); + else + { + T* pDst = m_p; + T* pDst_end = pDst + m_size; + while (pDst != pDst_end) + *pDst++ = o; + } + } + + // Caller assumes ownership of the heap block associated with the container. Container is cleared. + inline void* assume_ownership() + { + T* p = m_p; + m_p = NULL; + m_size = 0; + m_capacity = 0; + return p; + } + + // Caller is granting ownership of the indicated heap block. + // Block must have size constructed elements, and have enough room for capacity elements. + inline bool grant_ownership(T* p, uint32_t size, uint32_t capacity) + { + // To to prevent the caller from obviously shooting themselves in the foot. + if (((p + capacity) > m_p) && (p < (m_p + m_capacity))) + { + // Can grant ownership of a block inside the container itself! 
+ assert(0); + return false; + } + + if (size > capacity) + { + assert(0); + return false; + } + + if (!p) + { + if (capacity) + { + assert(0); + return false; + } + } + else if (!capacity) + { + assert(0); + return false; + } + + clear(); + m_p = p; + m_size = size; + m_capacity = capacity; + return true; + } + + private: + T* m_p; + uint32_t m_size; + uint32_t m_capacity; + + template<typename Q> struct is_vector { enum { cFlag = false }; }; + template<typename Q> struct is_vector< vector<Q> > { enum { cFlag = true }; }; + + static void object_mover(void* pDst_void, void* pSrc_void, uint32_t num) + { + T* pSrc = static_cast<T*>(pSrc_void); + T* const pSrc_end = pSrc + num; + T* pDst = static_cast<T*>(pDst_void); + + while (pSrc != pSrc_end) + { + // placement new + new (static_cast<void*>(pDst)) T(*pSrc); + pSrc->~T(); + ++pSrc; + ++pDst; + } + } + + inline bool increase_capacity(uint32_t min_new_capacity, bool grow_hint, bool nofail = false) + { + return reinterpret_cast<elemental_vector*>(this)->increase_capacity( + min_new_capacity, grow_hint, sizeof(T), + (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T) || (is_vector<T>::cFlag)) ? NULL : object_mover, nofail); + } + }; + + template<typename T> struct bitwise_movable< vector<T> > { enum { cFlag = true }; }; + + // Hash map + + template <typename T> + struct hasher + { + inline size_t operator() (const T& key) const { return static_cast<size_t>(key); } + }; + + template <typename T> + struct equal_to + { + inline bool operator()(const T& a, const T& b) const { return a == b; } + }; + + // Important: The Hasher and Equals objects must be bitwise movable! + template<typename Key, typename Value = empty_type, typename Hasher = hasher<Key>, typename Equals = equal_to<Key> > + class hash_map + { + public: + class iterator; + class const_iterator; + + private: + friend class iterator; + friend class const_iterator; + + enum state + { + cStateInvalid = 0, + cStateValid = 1 + }; + + enum + { + cMinHashSize = 4U + }; + + public: + typedef hash_map<Key, Value, Hasher, Equals> hash_map_type; + typedef std::pair<Key, Value> value_type; + typedef Key key_type; + typedef Value referent_type; + typedef Hasher hasher_type; + typedef Equals equals_type; + + hash_map() : + m_hash_shift(32), m_num_valid(0), m_grow_threshold(0) + { + } + + hash_map(const hash_map& other) : + m_values(other.m_values), + m_hash_shift(other.m_hash_shift), + m_hasher(other.m_hasher), + m_equals(other.m_equals), + m_num_valid(other.m_num_valid), + m_grow_threshold(other.m_grow_threshold) + { + } + + hash_map& operator= (const hash_map& other) + { + if (this == &other) + return *this; + + clear(); + + m_values = other.m_values; + m_hash_shift = other.m_hash_shift; + m_num_valid = other.m_num_valid; + m_grow_threshold = other.m_grow_threshold; + m_hasher = other.m_hasher; + m_equals = other.m_equals; + + return *this; + } + + inline ~hash_map() + { + clear(); + } + + const Equals& get_equals() const { return m_equals; } + Equals& get_equals() { return m_equals; } + + void set_equals(const Equals& equals) { m_equals = equals; } + + const Hasher& get_hasher() const { return m_hasher; } + Hasher& get_hasher() { return m_hasher; } + + void set_hasher(const Hasher& hasher) { m_hasher = hasher; } + + inline void clear() + { + if (!m_values.empty()) + { + if (BASISU_HAS_DESTRUCTOR(Key) || BASISU_HAS_DESTRUCTOR(Value)) + { + node* p = &get_node(0); + node* p_end = p + m_values.size(); + + uint32_t num_remaining = m_num_valid; + while (p != p_end) + { + if (p->state) + { + 
destruct_value_type(p); + num_remaining--; + if (!num_remaining) + break; + } + + p++; + } + } + + m_values.clear_no_destruction(); + + m_hash_shift = 32; + m_num_valid = 0; + m_grow_threshold = 0; + } + } + + inline void reset() + { + if (!m_num_valid) + return; + + if (BASISU_HAS_DESTRUCTOR(Key) || BASISU_HAS_DESTRUCTOR(Value)) + { + node* p = &get_node(0); + node* p_end = p + m_values.size(); + + uint32_t num_remaining = m_num_valid; + while (p != p_end) + { + if (p->state) + { + destruct_value_type(p); + p->state = cStateInvalid; + + num_remaining--; + if (!num_remaining) + break; + } + + p++; + } + } + else if (sizeof(node) <= 32) + { + memset(&m_values[0], 0, m_values.size_in_bytes()); + } + else + { + node* p = &get_node(0); + node* p_end = p + m_values.size(); + + uint32_t num_remaining = m_num_valid; + while (p != p_end) + { + if (p->state) + { + p->state = cStateInvalid; + + num_remaining--; + if (!num_remaining) + break; + } + + p++; + } + } + + m_num_valid = 0; + } + + inline uint32_t size() + { + return m_num_valid; + } + + inline uint32_t get_table_size() + { + return m_values.size(); + } + + inline bool empty() + { + return !m_num_valid; + } + + inline void reserve(uint32_t new_capacity) + { + uint64_t new_hash_size = helpers::maximum(1U, new_capacity); + + new_hash_size = new_hash_size * 2ULL; + + if (!helpers::is_power_of_2(new_hash_size)) + new_hash_size = helpers::next_pow2(new_hash_size); + + new_hash_size = helpers::maximum<uint64_t>(cMinHashSize, new_hash_size); + + new_hash_size = helpers::minimum<uint64_t>(0x80000000UL, new_hash_size); + + if (new_hash_size > m_values.size()) + rehash((uint32_t)new_hash_size); + } + + class iterator + { + friend class hash_map<Key, Value, Hasher, Equals>; + friend class hash_map<Key, Value, Hasher, Equals>::const_iterator; + + public: + inline iterator() : m_pTable(NULL), m_index(0) { } + inline iterator(hash_map_type& table, uint32_t index) : m_pTable(&table), m_index(index) { } + inline iterator(const iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } + + inline iterator& operator= (const iterator& other) + { + m_pTable = other.m_pTable; + m_index = other.m_index; + return *this; + } + + // post-increment + inline iterator operator++(int) + { + iterator result(*this); + ++*this; + return result; + } + + // pre-increment + inline iterator& operator++() + { + probe(); + return *this; + } + + inline value_type& operator*() const { return *get_cur(); } + inline value_type* operator->() const { return get_cur(); } + + inline bool operator == (const iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const iterator& b) const { return !(*this == b); } + inline bool operator == (const const_iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const const_iterator& b) const { return !(*this == b); } + + private: + hash_map_type* m_pTable; + uint32_t m_index; + + inline value_type* get_cur() const + { + assert(m_pTable && (m_index < m_pTable->m_values.size())); + assert(m_pTable->get_node_state(m_index) == cStateValid); + + return &m_pTable->get_node(m_index); + } + + inline void probe() + { + assert(m_pTable); + m_index = m_pTable->find_next(m_index); + } + }; + + class const_iterator + { + friend class hash_map<Key, Value, Hasher, Equals>; + friend class hash_map<Key, Value, Hasher, Equals>::iterator; + + public: + inline const_iterator() : m_pTable(NULL), m_index(0) { } + inline const_iterator(const 
hash_map_type& table, uint32_t index) : m_pTable(&table), m_index(index) { } + inline const_iterator(const iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } + inline const_iterator(const const_iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } + + inline const_iterator& operator= (const const_iterator& other) + { + m_pTable = other.m_pTable; + m_index = other.m_index; + return *this; + } + + inline const_iterator& operator= (const iterator& other) + { + m_pTable = other.m_pTable; + m_index = other.m_index; + return *this; + } + + // post-increment + inline const_iterator operator++(int) + { + const_iterator result(*this); + ++*this; + return result; + } + + // pre-increment + inline const_iterator& operator++() + { + probe(); + return *this; + } + + inline const value_type& operator*() const { return *get_cur(); } + inline const value_type* operator->() const { return get_cur(); } + + inline bool operator == (const const_iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const const_iterator& b) const { return !(*this == b); } + inline bool operator == (const iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const iterator& b) const { return !(*this == b); } + + private: + const hash_map_type* m_pTable; + uint32_t m_index; + + inline const value_type* get_cur() const + { + assert(m_pTable && (m_index < m_pTable->m_values.size())); + assert(m_pTable->get_node_state(m_index) == cStateValid); + + return &m_pTable->get_node(m_index); + } + + inline void probe() + { + assert(m_pTable); + m_index = m_pTable->find_next(m_index); + } + }; + + inline const_iterator begin() const + { + if (!m_num_valid) + return end(); + + return const_iterator(*this, find_next(UINT32_MAX)); + } + + inline const_iterator end() const + { + return const_iterator(*this, m_values.size()); + } + + inline iterator begin() + { + if (!m_num_valid) + return end(); + + return iterator(*this, find_next(UINT32_MAX)); + } + + inline iterator end() + { + return iterator(*this, m_values.size()); + } + + // insert_result.first will always point to inserted key/value (or the already existing key/value). + // insert_resutt.second will be true if a new key/value was inserted, or false if the key already existed (in which case first will point to the already existing value). + typedef std::pair<iterator, bool> insert_result; + + inline insert_result insert(const Key& k, const Value& v = Value()) + { + insert_result result; + if (!insert_no_grow(result, k, v)) + { + grow(); + + // This must succeed. 
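The `insert_result` convention documented above matches `std::unordered_map::insert()`: `first` points at the new or pre-existing entry, and `second` reports whether an insertion actually happened. A short usage sketch; the frequency-counting helper is hypothetical, not upstream API:

```
#include <cstdint>
#include "basisu_containers.h" // illustrative include path

// Hypothetical frequency counter built on basisu::hash_map.
static uint32_t count_distinct(const basisu::vector<uint32_t>& keys)
{
    basisu::hash_map<uint32_t, uint32_t> counts;

    for (uint32_t i = 0; i < keys.size(); i++)
    {
        // insert() returns { iterator, was_inserted }; bump the count when the
        // key was already present.
        auto res = counts.insert(keys[i], 1u);
        if (!res.second)
            res.first->second++;
    }

    return counts.size(); // number of unique keys
}
```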
+ if (!insert_no_grow(result, k, v)) + { + fprintf(stderr, "insert() failed"); + abort(); + } + } + + return result; + } + + inline insert_result insert(const value_type& v) + { + return insert(v.first, v.second); + } + + inline const_iterator find(const Key& k) const + { + return const_iterator(*this, find_index(k)); + } + + inline iterator find(const Key& k) + { + return iterator(*this, find_index(k)); + } + + inline bool erase(const Key& k) + { + uint32_t i = find_index(k); + + if (i >= m_values.size()) + return false; + + node* pDst = &get_node(i); + destruct_value_type(pDst); + pDst->state = cStateInvalid; + + m_num_valid--; + + for (; ; ) + { + uint32_t r, j = i; + + node* pSrc = pDst; + + do + { + if (!i) + { + i = m_values.size() - 1; + pSrc = &get_node(i); + } + else + { + i--; + pSrc--; + } + + if (!pSrc->state) + return true; + + r = hash_key(pSrc->first); + + } while ((i <= r && r < j) || (r < j && j < i) || (j < i && i <= r)); + + move_node(pDst, pSrc); + + pDst = pSrc; + } + } + + inline void swap(hash_map_type& other) + { + m_values.swap(other.m_values); + std::swap(m_hash_shift, other.m_hash_shift); + std::swap(m_num_valid, other.m_num_valid); + std::swap(m_grow_threshold, other.m_grow_threshold); + std::swap(m_hasher, other.m_hasher); + std::swap(m_equals, other.m_equals); + } + + private: + struct node : public value_type + { + uint8_t state; + }; + + static inline void construct_value_type(value_type* pDst, const Key& k, const Value& v) + { + if (BASISU_IS_BITWISE_COPYABLE(Key)) + memcpy(&pDst->first, &k, sizeof(Key)); + else + scalar_type<Key>::construct(&pDst->first, k); + + if (BASISU_IS_BITWISE_COPYABLE(Value)) + memcpy(&pDst->second, &v, sizeof(Value)); + else + scalar_type<Value>::construct(&pDst->second, v); + } + + static inline void construct_value_type(value_type* pDst, const value_type* pSrc) + { + if ((BASISU_IS_BITWISE_COPYABLE(Key)) && (BASISU_IS_BITWISE_COPYABLE(Value))) + { + memcpy(pDst, pSrc, sizeof(value_type)); + } + else + { + if (BASISU_IS_BITWISE_COPYABLE(Key)) + memcpy(&pDst->first, &pSrc->first, sizeof(Key)); + else + scalar_type<Key>::construct(&pDst->first, pSrc->first); + + if (BASISU_IS_BITWISE_COPYABLE(Value)) + memcpy(&pDst->second, &pSrc->second, sizeof(Value)); + else + scalar_type<Value>::construct(&pDst->second, pSrc->second); + } + } + + static inline void destruct_value_type(value_type* p) + { + scalar_type<Key>::destruct(&p->first); + scalar_type<Value>::destruct(&p->second); + } + + // Moves *pSrc to *pDst efficiently. + // pDst should NOT be constructed on entry. 
+ static inline void move_node(node* pDst, node* pSrc, bool update_src_state = true) + { + assert(!pDst->state); + + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Key) && BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Value)) + { + memcpy(pDst, pSrc, sizeof(node)); + } + else + { + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Key)) + memcpy(&pDst->first, &pSrc->first, sizeof(Key)); + else + { + scalar_type<Key>::construct(&pDst->first, pSrc->first); + scalar_type<Key>::destruct(&pSrc->first); + } + + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Value)) + memcpy(&pDst->second, &pSrc->second, sizeof(Value)); + else + { + scalar_type<Value>::construct(&pDst->second, pSrc->second); + scalar_type<Value>::destruct(&pSrc->second); + } + + pDst->state = cStateValid; + } + + if (update_src_state) + pSrc->state = cStateInvalid; + } + + struct raw_node + { + inline raw_node() + { + node* p = reinterpret_cast<node*>(this); + p->state = cStateInvalid; + } + + inline ~raw_node() + { + node* p = reinterpret_cast<node*>(this); + if (p->state) + hash_map_type::destruct_value_type(p); + } + + inline raw_node(const raw_node& other) + { + node* pDst = reinterpret_cast<node*>(this); + const node* pSrc = reinterpret_cast<const node*>(&other); + + if (pSrc->state) + { + hash_map_type::construct_value_type(pDst, pSrc); + pDst->state = cStateValid; + } + else + pDst->state = cStateInvalid; + } + + inline raw_node& operator= (const raw_node& rhs) + { + if (this == &rhs) + return *this; + + node* pDst = reinterpret_cast<node*>(this); + const node* pSrc = reinterpret_cast<const node*>(&rhs); + + if (pSrc->state) + { + if (pDst->state) + { + pDst->first = pSrc->first; + pDst->second = pSrc->second; + } + else + { + hash_map_type::construct_value_type(pDst, pSrc); + pDst->state = cStateValid; + } + } + else if (pDst->state) + { + hash_map_type::destruct_value_type(pDst); + pDst->state = cStateInvalid; + } + + return *this; + } + + uint8_t m_bits[sizeof(node)]; + }; + + typedef basisu::vector<raw_node> node_vector; + + node_vector m_values; + uint32_t m_hash_shift; + + Hasher m_hasher; + Equals m_equals; + + uint32_t m_num_valid; + + uint32_t m_grow_threshold; + + inline uint32_t hash_key(const Key& k) const + { + assert((1U << (32U - m_hash_shift)) == m_values.size()); + + uint32_t hash = static_cast<uint32_t>(m_hasher(k)); + + // Fibonacci hashing + hash = (2654435769U * hash) >> m_hash_shift; + + assert(hash < m_values.size()); + return hash; + } + + inline const node& get_node(uint32_t index) const + { + return *reinterpret_cast<const node*>(&m_values[index]); + } + + inline node& get_node(uint32_t index) + { + return *reinterpret_cast<node*>(&m_values[index]); + } + + inline state get_node_state(uint32_t index) const + { + return static_cast<state>(get_node(index).state); + } + + inline void set_node_state(uint32_t index, bool valid) + { + get_node(index).state = valid; + } + + inline void grow() + { + uint64_t n = m_values.size() * 3ULL; // was * 2 + + if (!helpers::is_power_of_2(n)) + n = helpers::next_pow2(n); + + if (n > 0x80000000UL) + n = 0x80000000UL; + + rehash(helpers::maximum<uint32_t>(cMinHashSize, (uint32_t)n)); + } + + inline void rehash(uint32_t new_hash_size) + { + assert(new_hash_size >= m_num_valid); + assert(helpers::is_power_of_2(new_hash_size)); + + if ((new_hash_size < m_num_valid) || (new_hash_size == m_values.size())) + return; + + hash_map new_map; + new_map.m_values.resize(new_hash_size); + new_map.m_hash_shift = 32U - helpers::floor_log2i(new_hash_size); + assert(new_hash_size == (1U << (32U - 
new_map.m_hash_shift))); + new_map.m_grow_threshold = UINT_MAX; + + node* pNode = reinterpret_cast<node*>(m_values.begin()); + node* pNode_end = pNode + m_values.size(); + + while (pNode != pNode_end) + { + if (pNode->state) + { + new_map.move_into(pNode); + + if (new_map.m_num_valid == m_num_valid) + break; + } + + pNode++; + } + + new_map.m_grow_threshold = (new_hash_size + 1U) >> 1U; + + m_values.clear_no_destruction(); + m_hash_shift = 32; + + swap(new_map); + } + + inline uint32_t find_next(uint32_t index) const + { + index++; + + if (index >= m_values.size()) + return index; + + const node* pNode = &get_node(index); + + for (; ; ) + { + if (pNode->state) + break; + + if (++index >= m_values.size()) + break; + + pNode++; + } + + return index; + } + + inline uint32_t find_index(const Key& k) const + { + if (m_num_valid) + { + uint32_t index = hash_key(k); + const node* pNode = &get_node(index); + + if (pNode->state) + { + if (m_equals(pNode->first, k)) + return index; + + const uint32_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pNode = &get_node(index); + } + else + { + index--; + pNode--; + } + + if (index == orig_index) + break; + + if (!pNode->state) + break; + + if (m_equals(pNode->first, k)) + return index; + } + } + } + + return m_values.size(); + } + + inline bool insert_no_grow(insert_result& result, const Key& k, const Value& v = Value()) + { + if (!m_values.size()) + return false; + + uint32_t index = hash_key(k); + node* pNode = &get_node(index); + + if (pNode->state) + { + if (m_equals(pNode->first, k)) + { + result.first = iterator(*this, index); + result.second = false; + return true; + } + + const uint32_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pNode = &get_node(index); + } + else + { + index--; + pNode--; + } + + if (orig_index == index) + return false; + + if (!pNode->state) + break; + + if (m_equals(pNode->first, k)) + { + result.first = iterator(*this, index); + result.second = false; + return true; + } + } + } + + if (m_num_valid >= m_grow_threshold) + return false; + + construct_value_type(pNode, k, v); + + pNode->state = cStateValid; + + m_num_valid++; + assert(m_num_valid <= m_values.size()); + + result.first = iterator(*this, index); + result.second = true; + + return true; + } + + inline void move_into(node* pNode) + { + uint32_t index = hash_key(pNode->first); + node* pDst_node = &get_node(index); + + if (pDst_node->state) + { + const uint32_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pDst_node = &get_node(index); + } + else + { + index--; + pDst_node--; + } + + if (index == orig_index) + { + assert(false); + return; + } + + if (!pDst_node->state) + break; + } + } + + move_node(pDst_node, pNode, false); + + m_num_valid++; + } + }; + + template<typename Key, typename Value, typename Hasher, typename Equals> + struct bitwise_movable< hash_map<Key, Value, Hasher, Equals> > { enum { cFlag = true }; }; + +#if BASISU_HASHMAP_TEST + extern void hash_map_test(); +#endif + +} // namespace basisu + +namespace std +{ + template<typename T> + inline void swap(basisu::vector<T>& a, basisu::vector<T>& b) + { + a.swap(b); + } + + template<typename Key, typename Value, typename Hasher, typename Equals> + inline void swap(basisu::hash_map<Key, Value, Hasher, Equals>& a, basisu::hash_map<Key, Value, Hasher, Equals>& b) + { + a.swap(b); + } + +} // namespace std diff --git 
a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h new file mode 100644 index 0000000000..6555171419 --- /dev/null +++ b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h @@ -0,0 +1,311 @@ +// basisu_containers_impl.h +// Do not include directly + +#ifdef _MSC_VER +#pragma warning (disable:4127) // warning C4127: conditional expression is constant +#endif + +namespace basisu +{ + bool elemental_vector::increase_capacity(uint32_t min_new_capacity, bool grow_hint, uint32_t element_size, object_mover pMover, bool nofail) + { + assert(m_size <= m_capacity); + + if (sizeof(void *) == sizeof(uint64_t)) + assert(min_new_capacity < (0x400000000ULL / element_size)); + else + assert(min_new_capacity < (0x7FFF0000U / element_size)); + + if (m_capacity >= min_new_capacity) + return true; + + size_t new_capacity = min_new_capacity; + if ((grow_hint) && (!helpers::is_power_of_2((uint64_t)new_capacity))) + { + new_capacity = (size_t)helpers::next_pow2((uint64_t)new_capacity); + + assert(new_capacity && (new_capacity > m_capacity)); + + if (new_capacity < min_new_capacity) + { + if (nofail) + return false; + fprintf(stderr, "vector too large\n"); + abort(); + } + } + + const size_t desired_size = element_size * new_capacity; + size_t actual_size = 0; + if (!pMover) + { + void* new_p = realloc(m_p, desired_size); + if (!new_p) + { + if (nofail) + return false; + + char buf[256]; +#ifdef _MSC_VER + sprintf_s(buf, sizeof(buf), "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); +#else + sprintf(buf, "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); +#endif + fprintf(stderr, "%s", buf); + abort(); + } + +#ifdef _MSC_VER + actual_size = _msize(new_p); +#elif HAS_MALLOC_USABLE_SIZE + actual_size = malloc_usable_size(new_p); +#else + actual_size = desired_size; +#endif + m_p = new_p; + } + else + { + void* new_p = malloc(desired_size); + if (!new_p) + { + if (nofail) + return false; + + char buf[256]; +#ifdef _MSC_VER + sprintf_s(buf, sizeof(buf), "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); +#else + sprintf(buf, "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); +#endif + fprintf(stderr, "%s", buf); + abort(); + } + +#ifdef _MSC_VER + actual_size = _msize(new_p); +#elif HAS_MALLOC_USABLE_SIZE + actual_size = malloc_usable_size(new_p); +#else + actual_size = desired_size; +#endif + + (*pMover)(new_p, m_p, m_size); + + if (m_p) + free(m_p); + + m_p = new_p; + } + + if (actual_size > desired_size) + m_capacity = static_cast<uint32_t>(actual_size / element_size); + else + m_capacity = static_cast<uint32_t>(new_capacity); + + return true; + } + +#if BASISU_HASHMAP_TEST + +#define HASHMAP_TEST_VERIFY(c) do { if (!(c)) handle_hashmap_test_verify_failure(__LINE__); } while(0) + + static void handle_hashmap_test_verify_failure(int line) + { + fprintf(stderr, "HASHMAP_TEST_VERIFY() faild on line %i\n", line); + abort(); + } + + class counted_obj + { + public: + counted_obj(uint32_t v = 0) : + m_val(v) + { + m_count++; + } + + counted_obj(const counted_obj& obj) : + m_val(obj.m_val) + { + m_count++; + } + + ~counted_obj() + { + assert(m_count > 0); + m_count--; + } + + static uint32_t m_count; + + uint32_t m_val; + + operator size_t() const { return m_val; } + + bool operator== (const counted_obj& rhs) const { return m_val == rhs.m_val; } + bool operator== (const uint32_t rhs) const { return m_val == rhs; } + + }; + + uint32_t 
counted_obj::m_count; + + static uint32_t urand32() + { + uint32_t a = rand(); + uint32_t b = rand() << 15; + uint32_t c = rand() << (32 - 15); + return a ^ b ^ c; + } + + static int irand32(int l, int h) + { + assert(l < h); + if (l >= h) + return l; + + uint32_t range = static_cast<uint32_t>(h - l); + + uint32_t rnd = urand32(); + + uint32_t rnd_range = static_cast<uint32_t>((((uint64_t)range) * ((uint64_t)rnd)) >> 32U); + + int result = l + rnd_range; + assert((result >= l) && (result < h)); + return result; + } + + void hash_map_test() + { + { + basisu::hash_map<uint64_t, uint64_t> k; + basisu::hash_map<uint64_t, uint64_t> l; + std::swap(k, l); + + k.begin(); + k.end(); + k.clear(); + k.empty(); + k.erase(0); + k.insert(0, 1); + k.find(0); + k.get_equals(); + k.get_hasher(); + k.get_table_size(); + k.reset(); + k.reserve(1); + k = l; + k.set_equals(l.get_equals()); + k.set_hasher(l.get_hasher()); + k.get_table_size(); + } + + uint32_t seed = 0; + for (; ; ) + { + seed++; + + typedef basisu::hash_map<counted_obj, counted_obj> my_hash_map; + my_hash_map m; + + const uint32_t n = irand32(0, 100000); + + printf("%u\n", n); + + srand(seed); // r1.seed(seed); + + basisu::vector<int> q; + + uint32_t count = 0; + for (uint32_t i = 0; i < n; i++) + { + uint32_t v = urand32() & 0x7FFFFFFF; + my_hash_map::insert_result res = m.insert(counted_obj(v), counted_obj(v ^ 0xdeadbeef)); + if (res.second) + { + count++; + q.push_back(v); + } + } + + HASHMAP_TEST_VERIFY(m.size() == count); + + srand(seed); + + my_hash_map cm(m); + m.clear(); + m = cm; + cm.reset(); + + for (uint32_t i = 0; i < n; i++) + { + uint32_t v = urand32() & 0x7FFFFFFF; + my_hash_map::const_iterator it = m.find(counted_obj(v)); + HASHMAP_TEST_VERIFY(it != m.end()); + HASHMAP_TEST_VERIFY(it->first == v); + HASHMAP_TEST_VERIFY(it->second == (v ^ 0xdeadbeef)); + } + + for (uint32_t t = 0; t < 2; t++) + { + const uint32_t nd = irand32(1, q.size() + 1); + for (uint32_t i = 0; i < nd; i++) + { + uint32_t p = irand32(0, q.size()); + + int k = q[p]; + if (k >= 0) + { + q[p] = -k - 1; + + bool s = m.erase(counted_obj(k)); + HASHMAP_TEST_VERIFY(s); + } + } + + typedef basisu::hash_map<uint32_t, empty_type> uint_hash_set; + uint_hash_set s; + + for (uint32_t i = 0; i < q.size(); i++) + { + int v = q[i]; + + if (v >= 0) + { + my_hash_map::const_iterator it = m.find(counted_obj(v)); + HASHMAP_TEST_VERIFY(it != m.end()); + HASHMAP_TEST_VERIFY(it->first == (uint32_t)v); + HASHMAP_TEST_VERIFY(it->second == ((uint32_t)v ^ 0xdeadbeef)); + + s.insert(v); + } + else + { + my_hash_map::const_iterator it = m.find(counted_obj(-v - 1)); + HASHMAP_TEST_VERIFY(it == m.end()); + } + } + + uint32_t found_count = 0; + for (my_hash_map::const_iterator it = m.begin(); it != m.end(); ++it) + { + HASHMAP_TEST_VERIFY(it->second == ((uint32_t)it->first ^ 0xdeadbeef)); + + uint_hash_set::const_iterator fit(s.find((uint32_t)it->first)); + HASHMAP_TEST_VERIFY(fit != s.end()); + + HASHMAP_TEST_VERIFY(fit->first == it->first); + + found_count++; + } + + HASHMAP_TEST_VERIFY(found_count == s.size()); + } + + HASHMAP_TEST_VERIFY(counted_obj::m_count == m.size() * 2); + } + } + +#endif // BASISU_HASHMAP_TEST + +} // namespace basisu diff --git a/thirdparty/basis_universal/transcoder/basisu_file_headers.h b/thirdparty/basis_universal/transcoder/basisu_file_headers.h index c90b3f3af0..4316d738e6 100644 --- a/thirdparty/basis_universal/transcoder/basisu_file_headers.h +++ b/thirdparty/basis_universal/transcoder/basisu_file_headers.h @@ -1,5 +1,5 @@ // basis_file_headers.h -// 
Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,8 +20,11 @@ namespace basist // Slice desc header flags enum basis_slice_desc_flags { - cSliceDescFlagsIsAlphaData = 1, - cSliceDescFlagsFrameIsIFrame = 2 // Video only: Frame doesn't refer to previous frame (no usage of conditional replenishment pred symbols) + cSliceDescFlagsHasAlpha = 1, + + // Video only: Frame doesn't refer to previous frame (no usage of conditional replenishment pred symbols) + // Currently the first frame is always an I-Frame, all subsequent frames are P-Frames. This will eventually be changed to periodic I-Frames. + cSliceDescFlagsFrameIsIFrame = 2 }; #pragma pack(push) @@ -38,7 +41,7 @@ namespace basist basisu::packed_uint<2> m_num_blocks_x; // The slice's block X dimensions. Each block is 4x4 pixels. The slice's pixel resolution may or may not be a power of 2. basisu::packed_uint<2> m_num_blocks_y; // The slice's block Y dimensions. - basisu::packed_uint<4> m_file_ofs; // Offset from the header to the start of the slice's data + basisu::packed_uint<4> m_file_ofs; // Offset from the start of the file to the start of the slice's data basisu::packed_uint<4> m_file_size; // The size of the compressed slice data in bytes basisu::packed_uint<2> m_slice_data_crc16; // The CRC16 of the compressed slice data, for extra-paranoid use cases @@ -47,9 +50,21 @@ namespace basist // File header files enum basis_header_flags { - cBASISHeaderFlagETC1S = 1, // Always set for basis universal files - cBASISHeaderFlagYFlipped = 2, // Set if the texture had to be Y flipped before encoding - cBASISHeaderFlagHasAlphaSlices = 4 // True if the odd slices contain alpha data + // Always set for ETC1S files. Not set for UASTC files. + cBASISHeaderFlagETC1S = 1, + + // Set if the texture had to be Y flipped before encoding. The actual interpretation of this (is Y up or down?) is up to the user. + cBASISHeaderFlagYFlipped = 2, + + // Set if any slices contain alpha (for ETC1S, if the odd slices contain alpha data) + cBASISHeaderFlagHasAlphaSlices = 4, + + // For ETC1S files, this will be true if the file utilizes a codebook from another .basis file. + cBASISHeaderFlagUsesGlobalCodebook = 8, + + // Set if the texture data is sRGB, otherwise it's linear. + // In reality, we have no idea if the texture data is actually linear or sRGB. This is the m_perceptual parameter passed to the compressor. + cBASISHeaderFlagSRGB = 16, }; // The image type field attempts to describe how to interpret the image data in a Basis file. 
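The expanded comments above spell out what each flag means, and a loader can act on them with plain bit tests. The sketch below uses only the enums introduced in this hunk; the helper functions and the idea of pre-extracted flag words are assumptions, not upstream API:

```
#include <cstdint>
#include "basisu_file_headers.h" // illustrative include path

// Hypothetical helpers over flag words already read out of a .basis file.
static bool slice_has_alpha(uint32_t slice_flags)
{
    return (slice_flags & basist::cSliceDescFlagsHasAlpha) != 0;
}

static bool file_has_alpha_slices(uint32_t header_flags)
{
    return (header_flags & basist::cBASISHeaderFlagHasAlphaSlices) != 0;
}

static bool file_is_srgb(uint32_t header_flags)
{
    // Records the encoder's m_perceptual setting; a hint, not a guarantee
    // about the source data (see the comment on cBASISHeaderFlagSRGB above).
    return (header_flags & basist::cBASISHeaderFlagSRGB) != 0;
}
```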
@@ -71,6 +86,12 @@ namespace basist cBASISMaxUSPerFrame = 0xFFFFFF }; + enum class basis_tex_format + { + cETC1S = 0, + cUASTC4x4 = 1 + }; + struct basis_file_header { enum @@ -82,16 +103,16 @@ namespace basist basisu::packed_uint<2> m_sig; // 2 byte file signature basisu::packed_uint<2> m_ver; // Baseline file version basisu::packed_uint<2> m_header_size; // Header size in bytes, sizeof(basis_file_header) - basisu::packed_uint<2> m_header_crc16; // crc16 of the remaining header data + basisu::packed_uint<2> m_header_crc16; // CRC16 of the remaining header data basisu::packed_uint<4> m_data_size; // The total size of all data after the header basisu::packed_uint<2> m_data_crc16; // The CRC16 of all data after the header - basisu::packed_uint<3> m_total_slices; // The total # of compressed slices (1 slice per image, or 2 for alpha basis files) + basisu::packed_uint<3> m_total_slices; // The total # of compressed slices (1 slice per image, or 2 for alpha .basis files) basisu::packed_uint<3> m_total_images; // The total # of images - basisu::packed_uint<1> m_format; // enum basist::block_format + basisu::packed_uint<1> m_tex_format; // enum basis_tex_format basisu::packed_uint<2> m_flags; // enum basist::header_flags basisu::packed_uint<1> m_tex_type; // enum basist::basis_texture_type basisu::packed_uint<3> m_us_per_frame; // Framerate of video, in microseconds per frame @@ -101,11 +122,11 @@ namespace basist basisu::packed_uint<4> m_userdata1; // For client use basisu::packed_uint<2> m_total_endpoints; // The number of endpoints in the endpoint codebook - basisu::packed_uint<4> m_endpoint_cb_file_ofs; // The compressed endpoint codebook's file offset relative to the header + basisu::packed_uint<4> m_endpoint_cb_file_ofs; // The compressed endpoint codebook's file offset relative to the start of the file basisu::packed_uint<3> m_endpoint_cb_file_size; // The compressed endpoint codebook's size in bytes basisu::packed_uint<2> m_total_selectors; // The number of selectors in the endpoint codebook - basisu::packed_uint<4> m_selector_cb_file_ofs; // The compressed selectors codebook's file offset relative to the header + basisu::packed_uint<4> m_selector_cb_file_ofs; // The compressed selectors codebook's file offset relative to the start of the file basisu::packed_uint<3> m_selector_cb_file_size; // The compressed selector codebook's size in bytes basisu::packed_uint<4> m_tables_file_ofs; // The file offset of the compressed Huffman codelength tables, for decompressing slices diff --git a/thirdparty/basis_universal/transcoder/basisu_global_selector_cb.h b/thirdparty/basis_universal/transcoder/basisu_global_selector_cb.h index 695b0b3b97..8ab5098898 100644 --- a/thirdparty/basis_universal/transcoder/basisu_global_selector_cb.h +++ b/thirdparty/basis_universal/transcoder/basisu_global_selector_cb.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/transcoder/basisu_global_selector_palette.h b/thirdparty/basis_universal/transcoder/basisu_global_selector_palette.h index b0260541c3..8bedf94710 100644 --- a/thirdparty/basis_universal/transcoder/basisu_global_selector_palette.h +++ b/thirdparty/basis_universal/transcoder/basisu_global_selector_palette.h @@ -1,5 +1,7 @@ // basisu_global_selector_palette.h -// Copyright (C) 2019 Binomial LLC. 
All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// +// TODO: NONE of this is used in .basis/.ktx2 files. It will be deleted soon. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -609,7 +611,7 @@ namespace basist uint8_t m_selectors[16]; }; - typedef std::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec; + typedef basisu::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec; extern const uint32_t g_global_selector_cb[]; extern const uint32_t g_global_selector_cb_size; @@ -628,7 +630,7 @@ namespace basist void set(uint32_t palette_index, const etc1_global_palette_entry_modifier &modifier) { m_palette_index = palette_index; m_modifier = modifier; } }; - typedef std::vector<etc1_global_selector_codebook_entry_id> etc1_global_selector_codebook_entry_id_vec; + typedef basisu::vector<etc1_global_selector_codebook_entry_id> etc1_global_selector_codebook_entry_id_vec; class etc1_global_selector_codebook { diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp index d15b6013d9..29eb3c0d55 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp @@ -1,5 +1,5 @@ // basisu_transcoder.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,65 +15,88 @@ #include "basisu_transcoder.h" #include <limits.h> -#include <vector> +#include "basisu_containers_impl.h" + +#ifndef BASISD_IS_BIG_ENDIAN +// TODO: This doesn't work on OSX. How can this be so difficult? +//#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN) +// #define BASISD_IS_BIG_ENDIAN (1) +//#else + #define BASISD_IS_BIG_ENDIAN (0) +//#endif +#endif + +#ifndef BASISD_USE_UNALIGNED_WORD_READS + #ifdef __EMSCRIPTEN__ + // Can't use unaligned loads/stores with WebAssembly. + #define BASISD_USE_UNALIGNED_WORD_READS (0) + #elif defined(_M_AMD64) || defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) + #define BASISD_USE_UNALIGNED_WORD_READS (1) + #else + #define BASISD_USE_UNALIGNED_WORD_READS (0) + #endif +#endif -// The supported .basis file header version. Keep in sync with BASIS_FILE_VERSION. #define BASISD_SUPPORTED_BASIS_VERSION (0x13) +#ifndef BASISD_SUPPORT_KTX2 + #error Must have defined BASISD_SUPPORT_KTX2 +#endif + +#ifndef BASISD_SUPPORT_KTX2_ZSTD +#error Must have defined BASISD_SUPPORT_KTX2_ZSTD +#endif + // Set to 1 for fuzz testing. This will disable all CRC16 checks on headers and compressed data. 
#ifndef BASISU_NO_HEADER_OR_DATA_CRC16_CHECKS -#define BASISU_NO_HEADER_OR_DATA_CRC16_CHECKS 0 + #define BASISU_NO_HEADER_OR_DATA_CRC16_CHECKS 0 #endif #ifndef BASISD_SUPPORT_DXT1 -#define BASISD_SUPPORT_DXT1 1 + #define BASISD_SUPPORT_DXT1 1 #endif #ifndef BASISD_SUPPORT_DXT5A -#define BASISD_SUPPORT_DXT5A 1 + #define BASISD_SUPPORT_DXT5A 1 #endif // Disable all BC7 transcoders if necessary (useful when cross compiling to Javascript) #if defined(BASISD_SUPPORT_BC7) && !BASISD_SUPPORT_BC7 - #ifndef BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - #define BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY 0 - #endif #ifndef BASISD_SUPPORT_BC7_MODE5 - #define BASISD_SUPPORT_BC7_MODE5 0 + #define BASISD_SUPPORT_BC7_MODE5 0 #endif #endif // !BASISD_SUPPORT_BC7 -// BC7 mode 6 opaque only is the highest quality (compared to ETC1), but the tables are massive. -// For web/mobile use you probably should disable this. -#ifndef BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY -#define BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY 1 -#endif - -// BC7 mode 5 supports both opaque and opaque+alpha textures, and uses substantially less memory than BC7 mode 6 and even BC1. +// BC7 mode 5 supports both opaque and opaque+alpha textures, and uses less memory BC1. #ifndef BASISD_SUPPORT_BC7_MODE5 -#define BASISD_SUPPORT_BC7_MODE5 1 + #define BASISD_SUPPORT_BC7_MODE5 1 #endif #ifndef BASISD_SUPPORT_PVRTC1 -#define BASISD_SUPPORT_PVRTC1 1 + #define BASISD_SUPPORT_PVRTC1 1 #endif #ifndef BASISD_SUPPORT_ETC2_EAC_A8 -#define BASISD_SUPPORT_ETC2_EAC_A8 1 + #define BASISD_SUPPORT_ETC2_EAC_A8 1 +#endif + +// Set BASISD_SUPPORT_UASTC to 0 to completely disable support for transcoding UASTC files. +#ifndef BASISD_SUPPORT_UASTC + #define BASISD_SUPPORT_UASTC 1 #endif #ifndef BASISD_SUPPORT_ASTC -#define BASISD_SUPPORT_ASTC 1 + #define BASISD_SUPPORT_ASTC 1 #endif // Note that if BASISD_SUPPORT_ATC is enabled, BASISD_SUPPORT_DXT5A should also be enabled for alpha support. #ifndef BASISD_SUPPORT_ATC -#define BASISD_SUPPORT_ATC 1 + #define BASISD_SUPPORT_ATC 1 #endif // Support for ETC2 EAC R11 and ETC2 EAC RG11 #ifndef BASISD_SUPPORT_ETC2_EAC_RG11 -#define BASISD_SUPPORT_ETC2_EAC_RG11 1 + #define BASISD_SUPPORT_ETC2_EAC_RG11 1 #endif // If BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY is 1, opaque blocks will be transcoded to ASTC at slightly higher quality (higher than BC1), but the transcoder tables will be 2x as large. @@ -89,26 +112,25 @@ #endif #ifndef BASISD_SUPPORT_FXT1 -#define BASISD_SUPPORT_FXT1 1 + #define BASISD_SUPPORT_FXT1 1 #endif #ifndef BASISD_SUPPORT_PVRTC2 -#define BASISD_SUPPORT_PVRTC2 1 + #define BASISD_SUPPORT_PVRTC2 1 #endif #if BASISD_SUPPORT_PVRTC2 -#if !BASISD_SUPPORT_ATC -#error BASISD_SUPPORT_ATC must be 1 if BASISD_SUPPORT_PVRTC2 is 1 -#endif + #if !BASISD_SUPPORT_ATC + #error BASISD_SUPPORT_ATC must be 1 if BASISD_SUPPORT_PVRTC2 is 1 + #endif #endif #if BASISD_SUPPORT_ATC -#if !BASISD_SUPPORT_DXT5A -#error BASISD_SUPPORT_DXT5A must be 1 if BASISD_SUPPORT_ATC is 1 -#endif + #if !BASISD_SUPPORT_DXT5A + #error BASISD_SUPPORT_DXT5A must be 1 if BASISD_SUPPORT_ATC is 1 + #endif #endif -#define BASISD_WRITE_NEW_BC7_TABLES 0 #define BASISD_WRITE_NEW_BC7_MODE5_TABLES 0 #define BASISD_WRITE_NEW_DXT1_TABLES 0 #define BASISD_WRITE_NEW_ETC2_EAC_A8_TABLES 0 @@ -117,7 +139,16 @@ #define BASISD_WRITE_NEW_ETC2_EAC_R11_TABLES 0 #ifndef BASISD_ENABLE_DEBUG_FLAGS -#define BASISD_ENABLE_DEBUG_FLAGS 0 + #define BASISD_ENABLE_DEBUG_FLAGS 0 +#endif + +// If KTX2 support is enabled, we may need Zstd for decompression of supercompressed UASTC files. 
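When KTX2 support is compiled in, the transcoder's only Zstd dependency is one-shot decompression of supercompressed payloads plus the corresponding error check. A minimal sketch of that call pattern, assuming caller-provided buffers; the wrapper function and the exact-size sanity test are hypothetical, and the include path simply mirrors the one used in this file:

```
#include <cstddef>
#include "../zstd/zstd.h" // same relative include the transcoder itself uses

// Hypothetical wrapper: one-shot decompression into a caller-provided buffer,
// followed by the Zstd error check and a size sanity test.
static bool zstd_inflate(void* dst, size_t dst_size, const void* src, size_t src_size)
{
    const size_t result = ZSTD_decompress(dst, dst_size, src, src_size);
    return !ZSTD_isError(result) && (result == dst_size);
}
```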
Include this header. +#if BASISD_SUPPORT_KTX2 + // If BASISD_SUPPORT_KTX2_ZSTD is 0, UASTC files compressed with Zstd cannot be loaded. + #if BASISD_SUPPORT_KTX2_ZSTD + // We only use two Zstd API's: ZSTD_decompress() and ZSTD_isError() + #include "../zstd/zstd.h" + #endif #endif namespace basisu @@ -131,7 +162,7 @@ namespace basisu void debug_printf(const char* pFmt, ...) { -#if BASISU_DEVEL_MESSAGES +#if BASISU_FORCE_DEVEL_MESSAGES g_debug_printf = true; #endif if (g_debug_printf) @@ -146,9 +177,6 @@ namespace basisu namespace basist { -#if BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY -#include "basisu_transcoder_tables_bc7_m6.inc" -#endif #if BASISD_ENABLE_DEBUG_FLAGS static uint32_t g_debug_flags = 0; @@ -165,16 +193,28 @@ namespace basist void set_debug_flags(uint32_t f) { - (void)f; + BASISU_NOTE_UNUSED(f); #if BASISD_ENABLE_DEBUG_FLAGS g_debug_flags = f; #endif } + + inline uint16_t byteswap_uint16(uint16_t v) + { + return static_cast<uint16_t>((v >> 8) | (v << 8)); + } + + static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } + static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } + static inline float saturate(float value) { return clampf(value, 0, 1.0f); } + + static inline uint8_t mul_8(uint32_t v, uint32_t q) { v = v * q + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + uint16_t crc16(const void* r, size_t size, uint16_t crc) { crc = ~crc; - const uint8_t* p = reinterpret_cast<const uint8_t*>(r); + const uint8_t* p = static_cast<const uint8_t*>(r); for (; size; --size) { const uint16_t q = *p++ ^ (crc >> 8); @@ -279,6 +319,9 @@ namespace basist DECLARE_ETC1_INTEN_TABLE(g_etc1_inten_tables, 1); DECLARE_ETC1_INTEN_TABLE(g_etc1_inten_tables16, 16); DECLARE_ETC1_INTEN_TABLE(g_etc1_inten_tables48, 3 * 16); + + //const uint8_t g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 }; + const uint8_t g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 }; static const uint8_t g_etc_5_to_8[32] = { 0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255 }; @@ -522,11 +565,39 @@ namespace basist return static_cast<uint16_t>(b | (g << 5U) | (r << 10U)); } + inline uint16_t get_base4_color(uint32_t idx) const + { + uint32_t r, g, b; + if (idx) + { + r = get_byte_bits(cETC1AbsColor4R2BitOffset, 4); + g = get_byte_bits(cETC1AbsColor4G2BitOffset, 4); + b = get_byte_bits(cETC1AbsColor4B2BitOffset, 4); + } + else + { + r = get_byte_bits(cETC1AbsColor4R1BitOffset, 4); + g = get_byte_bits(cETC1AbsColor4G1BitOffset, 4); + b = get_byte_bits(cETC1AbsColor4B1BitOffset, 4); + } + return static_cast<uint16_t>(b | (g << 4U) | (r << 8U)); + } + inline color32 get_base5_color_unscaled() const { return color32(m_differential.m_red1, m_differential.m_green1, m_differential.m_blue1, 255); } + inline bool get_flip_bit() const + { + return (m_bytes[3] & 1) != 0; + } + + inline bool get_diff_bit() const + { + return (m_bytes[3] & 2) != 0; + } + inline uint32_t get_inten_table(uint32_t subblock_id) const { assert(subblock_id < 2); @@ -534,6 +605,38 @@ namespace basist return (m_bytes[3] >> ofs) & 7; } + inline uint16_t get_delta3_color() const + { + const uint32_t r = get_byte_bits(cETC1DeltaColor3RBitOffset, 3); + const uint32_t g = get_byte_bits(cETC1DeltaColor3GBitOffset, 3); + const uint32_t b = 
get_byte_bits(cETC1DeltaColor3BBitOffset, 3); + return static_cast<uint16_t>(b | (g << 3U) | (r << 6U)); + } + + void get_block_colors(color32* pBlock_colors, uint32_t subblock_index) const + { + color32 b; + + if (get_diff_bit()) + { + if (subblock_index) + unpack_color5(b, get_base5_color(), get_delta3_color(), true, 255); + else + unpack_color5(b, get_base5_color(), true); + } + else + { + b = unpack_color4(get_base4_color(subblock_index), true, 255); + } + + const int* pInten_table = g_etc1_inten_tables[get_inten_table(subblock_index)]; + + pBlock_colors[0].set_noclamp_rgba(clamp255(b.r + pInten_table[0]), clamp255(b.g + pInten_table[0]), clamp255(b.b + pInten_table[0]), 255); + pBlock_colors[1].set_noclamp_rgba(clamp255(b.r + pInten_table[1]), clamp255(b.g + pInten_table[1]), clamp255(b.b + pInten_table[1]), 255); + pBlock_colors[2].set_noclamp_rgba(clamp255(b.r + pInten_table[2]), clamp255(b.g + pInten_table[2]), clamp255(b.b + pInten_table[2]), 255); + pBlock_colors[3].set_noclamp_rgba(clamp255(b.r + pInten_table[3]), clamp255(b.g + pInten_table[3]), clamp255(b.b + pInten_table[3]), 255); + } + static uint16_t pack_color4(const color32& color, bool scaled, uint32_t bias = 127U) { return pack_color4(color.r, color.g, color.b, scaled, bias); @@ -592,7 +695,17 @@ namespace basist return static_cast<uint16_t>(b | (g << 3) | (r << 6)); } - static color32 unpack_color5(uint16_t packed_color5, bool scaled, uint32_t alpha = 255) + static void unpack_delta3(int& r, int& g, int& b, uint16_t packed_delta3) + { + r = (packed_delta3 >> 6) & 7; + g = (packed_delta3 >> 3) & 7; + b = packed_delta3 & 7; + if (r >= 4) r -= 8; + if (g >= 4) g -= 8; + if (b >= 4) b -= 8; + } + + static color32 unpack_color5(uint16_t packed_color5, bool scaled, uint32_t alpha) { uint32_t b = packed_color5 & 31U; uint32_t g = (packed_color5 >> 5U) & 31U; @@ -605,7 +718,9 @@ namespace basist r = (r << 3U) | (r >> 2U); } - return color32(r, g, b, alpha); + assert(alpha <= 255); + + return color32(cNoClamp, r, g, b, alpha); } static void unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, bool scaled) @@ -615,6 +730,64 @@ namespace basist g = c.g; b = c.b; } + + static void unpack_color5(color32& result, uint16_t packed_color5, bool scaled) + { + result = unpack_color5(packed_color5, scaled, 255); + } + + static bool unpack_color5(color32& result, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha) + { + int dr, dg, db; + unpack_delta3(dr, dg, db, packed_delta3); + + int r = ((packed_color5 >> 10U) & 31U) + dr; + int g = ((packed_color5 >> 5U) & 31U) + dg; + int b = (packed_color5 & 31U) + db; + + bool success = true; + if (static_cast<uint32_t>(r | g | b) > 31U) + { + success = false; + r = basisu::clamp<int>(r, 0, 31); + g = basisu::clamp<int>(g, 0, 31); + b = basisu::clamp<int>(b, 0, 31); + } + + if (scaled) + { + b = (b << 3U) | (b >> 2U); + g = (g << 3U) | (g >> 2U); + r = (r << 3U) | (r >> 2U); + } + + result.set_noclamp_rgba(r, g, b, basisu::minimum(alpha, 255U)); + return success; + } + + static color32 unpack_color4(uint16_t packed_color4, bool scaled, uint32_t alpha) + { + uint32_t b = packed_color4 & 15U; + uint32_t g = (packed_color4 >> 4U) & 15U; + uint32_t r = (packed_color4 >> 8U) & 15U; + + if (scaled) + { + b = (b << 4U) | b; + g = (g << 4U) | g; + r = (r << 4U) | r; + } + + return color32(cNoClamp, r, g, b, basisu::minimum(alpha, 255U)); + } + + static void unpack_color4(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color4, bool scaled) + { + 
color32 c(unpack_color4(packed_color4, scaled, 0)); + r = c.r; + g = c.g; + b = c.b; + } static void get_diff_subblock_colors(color32* pDst, uint16_t packed_color5, uint32_t table_idx) { @@ -823,197 +996,6 @@ namespace basist uint32_t m_high; }; -#if BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - static dxt_selector_range g_etc1_to_bc7_selector_ranges[] = - { - { 0, 0 }, - { 1, 1 }, - { 2, 2 }, - { 3, 3 }, - - { 0, 3 }, - - { 1, 3 }, - { 0, 2 }, - - { 1, 2 }, - - { 2, 3 }, - { 0, 1 }, - }; - const uint32_t NUM_ETC1_TO_BC7_M6_SELECTOR_RANGES = sizeof(g_etc1_to_bc7_selector_ranges) / sizeof(g_etc1_to_bc7_selector_ranges[0]); - - static uint32_t g_etc1_to_bc7_m6_selector_range_index[4][4]; - - static const uint8_t g_etc1_to_bc7_selector_mappings[][4] = - { -#if 1 - { 5 * 0, 5 * 0, 5 * 0, 5 * 0 }, - { 5 * 0, 5 * 0, 5 * 0, 5 * 1 }, - { 5 * 0, 5 * 0, 5 * 0, 5 * 2 }, - { 5 * 0, 5 * 0, 5 * 0, 5 * 3 }, - { 5 * 0, 5 * 0, 5 * 1, 5 * 1 }, - { 5 * 0, 5 * 0, 5 * 1, 5 * 2 }, - { 5 * 0, 5 * 0, 5 * 1, 5 * 3 }, - { 5 * 0, 5 * 0, 5 * 2, 5 * 2 }, - { 5 * 0, 5 * 0, 5 * 2, 5 * 3 }, - { 5 * 0, 5 * 0, 5 * 3, 5 * 3 }, - { 5 * 0, 5 * 1, 5 * 1, 5 * 1 }, - { 5 * 0, 5 * 1, 5 * 1, 5 * 2 }, - { 5 * 0, 5 * 1, 5 * 1, 5 * 3 }, - { 5 * 0, 5 * 1, 5 * 2, 5 * 2 }, - { 5 * 0, 5 * 1, 5 * 2, 5 * 3 }, - { 5 * 0, 5 * 1, 5 * 3, 5 * 3 }, - { 5 * 0, 5 * 2, 5 * 2, 5 * 2 }, - { 5 * 0, 5 * 2, 5 * 2, 5 * 3 }, - { 5 * 0, 5 * 2, 5 * 3, 5 * 3 }, - { 5 * 0, 5 * 3, 5 * 3, 5 * 3 }, - { 5 * 1, 5 * 1, 5 * 1, 5 * 1 }, - { 5 * 1, 5 * 1, 5 * 1, 5 * 2 }, - { 5 * 1, 5 * 1, 5 * 1, 5 * 3 }, - { 5 * 1, 5 * 1, 5 * 2, 5 * 2 }, - { 5 * 1, 5 * 1, 5 * 2, 5 * 3 }, - { 5 * 1, 5 * 1, 5 * 3, 5 * 3 }, - { 5 * 1, 5 * 2, 5 * 2, 5 * 2 }, - { 5 * 1, 5 * 2, 5 * 2, 5 * 3 }, - { 5 * 1, 5 * 2, 5 * 3, 5 * 3 }, - { 5 * 1, 5 * 3, 5 * 3, 5 * 3 }, - { 5 * 2, 5 * 2, 5 * 2, 5 * 2 }, - { 5 * 2, 5 * 2, 5 * 2, 5 * 3 }, - { 5 * 2, 5 * 2, 5 * 3, 5 * 3 }, - { 5 * 2, 5 * 3, 5 * 3, 5 * 3 }, - { 5 * 3, 5 * 3, 5 * 3, 5 * 3 }, - - { 0, 1, 2, 3 }, - { 0, 0, 1, 1 }, - { 0, 0, 0, 1 }, - { 0, 2, 4, 6 }, - { 0, 3, 6, 9 }, - { 0, 4, 8, 12 }, - - { 0, 4, 9, 15 }, - { 0, 6, 11, 15 }, - - { 1, 2, 3, 4 }, - { 1, 3, 5, 7 }, - - { 1, 8, 8, 14 }, -#else - { 5 * 0, 5 * 0, 5 * 1, 5 * 1 }, - { 5 * 0, 5 * 0, 5 * 1, 5 * 2 }, - { 5 * 0, 5 * 0, 5 * 1, 5 * 3 }, - { 5 * 0, 5 * 0, 5 * 2, 5 * 3 }, - { 5 * 0, 5 * 1, 5 * 1, 5 * 1 }, - { 5 * 0, 5 * 1, 5 * 2, 5 * 2 }, - { 5 * 0, 5 * 1, 5 * 2, 5 * 3 }, - { 5 * 0, 5 * 2, 5 * 3, 5 * 3 }, - { 5 * 1, 5 * 2, 5 * 2, 5 * 2 }, -#endif - { 5 * 1, 5 * 2, 5 * 3, 5 * 3 }, - { 8, 8, 8, 8 }, - }; - const uint32_t NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS = sizeof(g_etc1_to_bc7_selector_mappings) / sizeof(g_etc1_to_bc7_selector_mappings[0]); - - static uint8_t g_etc1_to_bc7_selector_mappings_inv[NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS][4]; - - // encoding from LSB to MSB: low8, high8, error16, size is [32*8][NUM_ETC1_TO_BC7_M6_SELECTOR_RANGES][NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS] - extern const uint32_t* g_etc1_to_bc7_m6_table[]; - - const uint16_t s_bptc_table_aWeight4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; - -#if BASISD_WRITE_NEW_BC7_TABLES - static void create_etc1_to_bc7_m6_conversion_table() - { - FILE* pFile = NULL; - - pFile = fopen("basisu_decoder_tables_bc7_m6.inc", "w"); - - for (int inten = 0; inten < 8; inten++) - { - for (uint32_t g = 0; g < 32; g++) - { - color32 block_colors[4]; - decoder_etc_block::get_diff_subblock_colors(block_colors, decoder_etc_block::pack_color5(color32(g, g, g, 255), false), inten); - - fprintf(pFile, "static const uint32_t 
g_etc1_to_bc7_m6_table%u[] = {\n", g + inten * 32); - uint32_t n = 0; - - for (uint32_t sr = 0; sr < NUM_ETC1_TO_BC7_M6_SELECTOR_RANGES; sr++) - { - const uint32_t low_selector = g_etc1_to_bc7_selector_ranges[sr].m_low; - const uint32_t high_selector = g_etc1_to_bc7_selector_ranges[sr].m_high; - - for (uint32_t m = 0; m < NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS; m++) - { - uint32_t best_lo = 0; - uint32_t best_hi = 0; - uint64_t best_err = UINT64_MAX; - - for (uint32_t hi = 0; hi <= 127; hi++) - { - for (uint32_t lo = 0; lo <= 127; lo++) - { - uint32_t bc7_block_colors[16]; - - bc7_block_colors[0] = lo << 1; - bc7_block_colors[15] = (hi << 1) | 1; - - for (uint32_t i = 1; i < 15; i++) - bc7_block_colors[i] = (bc7_block_colors[0] * (64 - s_bptc_table_aWeight4[i]) + bc7_block_colors[15] * s_bptc_table_aWeight4[i] + 32) >> 6; - - uint64_t total_err = 0; - - for (uint32_t s = low_selector; s <= high_selector; s++) - { - int err = (int)block_colors[s].g - (int)bc7_block_colors[g_etc1_to_bc7_selector_mappings[m][s]]; - - total_err += err * err; - } - - if (total_err < best_err) - { - best_err = total_err; - best_lo = lo; - best_hi = hi; - } - } // lo - - } // hi - - best_err = basisu::minimum<uint32_t>(best_err, 0xFFFF); - - const uint32_t index = (g + inten * 32) * (NUM_ETC1_TO_BC7_M6_SELECTOR_RANGES * NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS) + (sr * NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS) + m; - - uint32_t v = best_err | (best_lo << 18) | (best_hi << 25); - - fprintf(pFile, "0x%X,", v); - n++; - if ((n & 31) == 31) - fprintf(pFile, "\n"); - - } // m - } // sr - - fprintf(pFile, "};\n"); - - } // g - } // inten - - fprintf(pFile, "const uint32_t *g_etc1_to_bc7_m6_table[] = {\n"); - - for (uint32_t i = 0; i < 32 * 8; i++) - { - fprintf(pFile, "g_etc1_to_bc7_m6_table%u, ", i); - if ((i & 15) == 15) - fprintf(pFile, "\n"); - } - - fprintf(pFile, "};\n"); - fclose(pFile); - } -#endif -#endif - struct etc1_to_dxt1_56_solution { uint8_t m_lo; @@ -1064,7 +1046,9 @@ namespace basist static const etc1_to_dxt1_56_solution g_etc1_to_dxt_5[32 * 8 * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS * NUM_ETC1_TO_DXT1_SELECTOR_RANGES] = { #include "basisu_transcoder_tables_dxt1_5.inc" }; +#endif // BASISD_SUPPORT_DXT1 +#if BASISD_SUPPORT_DXT1 || BASISD_SUPPORT_UASTC // First saw the idea for optimal BC1 single-color block encoding using lookup tables in ryg_dxt. 
struct bc1_match_entry { @@ -1089,14 +1073,15 @@ namespace basist if (sel == 1) { // Selector 1 - e = abs(((hi_e * 2 + lo_e) / 3) - i) + ((abs(hi_e - lo_e) >> 5)); + e = basisu::iabs(((hi_e * 2 + lo_e) / 3) - i); + e += (basisu::iabs(hi_e - lo_e) * 3) / 100; } else { assert(sel == 0); // Selector 0 - e = abs(hi_e - i); + e = basisu::iabs(hi_e - i); } if (e < lowest_e) @@ -1111,7 +1096,7 @@ namespace basist } // lo } } -#endif // BASISD_SUPPORT_DXT1 +#endif #if BASISD_WRITE_NEW_DXT1_TABLES static void create_etc1_to_dxt1_5_conversion_table() @@ -1268,7 +1253,8 @@ namespace basist } #endif -#if BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_ETC2_EAC_RG11 + +#if BASISD_SUPPORT_UASTC || BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_ETC2_EAC_RG11 static const int8_t g_eac_modifier_table[16][8] = { { -3, -6, -9, -15, 2, 5, 8, 14 }, @@ -1344,6 +1330,9 @@ namespace basist } }; +#endif // #if BASISD_SUPPORT_UASTC BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_ETC2_EAC_RG11 + +#if BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_ETC2_EAC_RG11 static const dxt_selector_range s_etc2_eac_selector_ranges[] = { { 0, 3 }, @@ -1372,8 +1361,8 @@ namespace basist uint32_t m_base; uint32_t m_table; uint32_t m_multiplier; - std::vector<uint8_t> m_selectors; - std::vector<uint8_t> m_selectors_temp; + basisu::vector<uint8_t> m_selectors; + basisu::vector<uint8_t> m_selectors_temp; }; static uint64_t pack_eac_a8_exhaustive(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels) @@ -1769,8 +1758,8 @@ namespace basist uint32_t m_base; uint32_t m_table; uint32_t m_multiplier; - std::vector<uint8_t> m_selectors; - std::vector<uint8_t> m_selectors_temp; + basisu::vector<uint8_t> m_selectors; + basisu::vector<uint8_t> m_selectors_temp; }; static uint64_t pack_eac_r11_exhaustive(pack_eac_r11_results& results, const uint8_t* pPixels, uint32_t num_pixels) @@ -1924,14 +1913,28 @@ namespace basist #if BASISD_SUPPORT_PVRTC2 static void transcoder_init_pvrtc2(); #endif + +#if BASISD_SUPPORT_UASTC + void uastc_init(); +#endif + + static bool g_transcoder_initialized; // Library global initialization. Requires ~9 milliseconds when compiled and executed natively on a Core i7 2.2 GHz. // If this is too slow, these computed tables can easilky be moved to be compiled in. 
void basisu_transcoder_init() { - static bool s_initialized; - if (s_initialized) + if (g_transcoder_initialized) + { + BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Called more than once\n"); return; + } + + BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Initializing (this is not an error)\n"); + +#if BASISD_SUPPORT_UASTC + uastc_init(); +#endif #if BASISD_SUPPORT_ASTC transcoder_init_astc(); @@ -1943,11 +1946,6 @@ namespace basist exit(0); #endif -#if BASISD_WRITE_NEW_BC7_TABLES - create_etc1_to_bc7_m6_conversion_table(); - exit(0); -#endif - #if BASISD_WRITE_NEW_BC7_MODE5_TABLES create_etc1_to_bc7_m5_color_conversion_table(); create_etc1_to_bc7_m5_alpha_conversion_table(); @@ -1975,7 +1973,7 @@ namespace basist exit(0); #endif -#if BASISD_SUPPORT_DXT1 +#if BASISD_SUPPORT_DXT1 || BASISD_SUPPORT_UASTC uint8_t bc1_expand5[32]; for (int i = 0; i < 32; i++) bc1_expand5[i] = static_cast<uint8_t>((i << 3) | (i >> 2)); @@ -1988,6 +1986,17 @@ namespace basist prepare_bc1_single_color_table(g_bc1_match6_equals_1, bc1_expand6, 64, 64, 1); prepare_bc1_single_color_table(g_bc1_match6_equals_0, bc1_expand6, 1, 64, 0); +#if 0 + for (uint32_t i = 0; i < 256; i++) + { + printf("%u %u %u\n", i, (i * 63 + 127) / 255, g_bc1_match6_equals_0[i].m_hi); + } + exit(0); +#endif + +#endif + +#if BASISD_SUPPORT_DXT1 for (uint32_t i = 0; i < NUM_ETC1_TO_DXT1_SELECTOR_RANGES; i++) { uint32_t l = g_etc1_to_dxt1_selector_ranges[i].m_low; @@ -2023,19 +2032,6 @@ namespace basist } #endif -#if BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - for (uint32_t i = 0; i < NUM_ETC1_TO_BC7_M6_SELECTOR_RANGES; i++) - { - uint32_t l = g_etc1_to_bc7_selector_ranges[i].m_low; - uint32_t h = g_etc1_to_bc7_selector_ranges[i].m_high; - g_etc1_to_bc7_m6_selector_range_index[l][h] = i; - } - - for (uint32_t sm = 0; sm < NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS; sm++) - for (uint32_t j = 0; j < 4; j++) - g_etc1_to_bc7_selector_mappings_inv[sm][j] = 15 - g_etc1_to_bc7_selector_mappings[sm][j]; -#endif - #if BASISD_SUPPORT_BC7_MODE5 transcoder_init_bc7_mode5(); #endif @@ -2048,7 +2044,7 @@ namespace basist transcoder_init_pvrtc2(); #endif - s_initialized = true; + g_transcoder_initialized = true; } #if BASISD_SUPPORT_DXT1 @@ -2780,7 +2776,7 @@ namespace basist // PVRTC -#if BASISD_SUPPORT_PVRTC1 +#if BASISD_SUPPORT_PVRTC1 || BASISD_SUPPORT_UASTC static const uint16_t g_pvrtc_swizzle_table[256] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, @@ -3304,6 +3300,7 @@ namespace basist } }; +#if 0 static const uint8_t g_pvrtc_bilinear_weights[16][4] = { { 4, 4, 4, 4 }, { 2, 6, 2, 6 }, { 8, 0, 8, 0 }, { 6, 2, 6, 2 }, @@ -3311,6 +3308,7 @@ namespace basist { 8, 8, 0, 0 }, { 4, 12, 0, 0 }, { 16, 0, 0, 0 }, { 12, 4, 0, 0 }, { 6, 6, 2, 2 }, { 3, 9, 1, 3 }, { 12, 0, 4, 0 }, { 9, 3, 3, 1 }, }; +#endif struct pvrtc1_temp_block { @@ -3402,7 +3400,9 @@ namespace basist color32 c(get_endpoint_8888(endpoints, endpoint_index)); return c.r + c.g + c.b + c.a; } +#endif +#if BASISD_SUPPORT_PVRTC1 // TODO: Support decoding a non-pow2 ETC1S texture into the next larger pow2 PVRTC texture. 
static void fixup_pvrtc1_4_modulation_rgb(const decoder_etc_block* pETC_Blocks, const uint32_t* pPVRTC_endpoints, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y) { @@ -3411,7 +3411,7 @@ namespace basist const uint32_t x_bits = basisu::total_bits(x_mask); const uint32_t y_bits = basisu::total_bits(y_mask); const uint32_t min_bits = basisu::minimum(x_bits, y_bits); - const uint32_t max_bits = basisu::maximum(x_bits, y_bits); + //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; uint32_t block_index = 0; @@ -3592,7 +3592,7 @@ namespace basist const uint32_t x_bits = basisu::total_bits(x_mask); const uint32_t y_bits = basisu::total_bits(y_mask); const uint32_t min_bits = basisu::minimum(x_bits, y_bits); - const uint32_t max_bits = basisu::maximum(x_bits, y_bits); + //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; uint32_t block_index = 0; @@ -3763,257 +3763,6 @@ namespace basist } #endif // BASISD_SUPPORT_PVRTC1 -#if BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - struct bc7_mode_6 - { - struct - { - uint64_t m_mode : 7; - uint64_t m_r0 : 7; - uint64_t m_r1 : 7; - uint64_t m_g0 : 7; - uint64_t m_g1 : 7; - uint64_t m_b0 : 7; - uint64_t m_b1 : 7; - uint64_t m_a0 : 7; - uint64_t m_a1 : 7; - uint64_t m_p0 : 1; - } m_lo; - - union - { - struct - { - uint64_t m_p1 : 1; - uint64_t m_s00 : 3; - uint64_t m_s10 : 4; - uint64_t m_s20 : 4; - uint64_t m_s30 : 4; - - uint64_t m_s01 : 4; - uint64_t m_s11 : 4; - uint64_t m_s21 : 4; - uint64_t m_s31 : 4; - - uint64_t m_s02 : 4; - uint64_t m_s12 : 4; - uint64_t m_s22 : 4; - uint64_t m_s32 : 4; - - uint64_t m_s03 : 4; - uint64_t m_s13 : 4; - uint64_t m_s23 : 4; - uint64_t m_s33 : 4; - - } m_hi; - - uint64_t m_hi_bits; - }; - }; - - static void convert_etc1s_to_bc7_m6(bc7_mode_6* pDst_block, const endpoint *pEndpoint, const selector* pSelector) - { -#if !BASISD_WRITE_NEW_BC7_TABLES - const uint32_t low_selector = pSelector->m_lo_selector; - const uint32_t high_selector = pSelector->m_hi_selector; - - const uint32_t base_color_r = pEndpoint->m_color5.r; - const uint32_t base_color_g = pEndpoint->m_color5.g; - const uint32_t base_color_b = pEndpoint->m_color5.b; - const uint32_t inten_table = pEndpoint->m_inten5; - - if (pSelector->m_num_unique_selectors <= 2) - { - // Only two unique selectors so just switch to block truncation coding (BTC) to avoid quality issues on extreme blocks. 
- pDst_block->m_lo.m_mode = 64; - - pDst_block->m_lo.m_a0 = 127; - pDst_block->m_lo.m_a1 = 127; - - color32 block_colors[4]; - - decoder_etc_block::get_block_colors5(block_colors, color32(base_color_r, base_color_g, base_color_b, 255), inten_table); - - const uint32_t r0 = block_colors[low_selector].r; - const uint32_t g0 = block_colors[low_selector].g; - const uint32_t b0 = block_colors[low_selector].b; - const uint32_t low_bits0 = (r0 & 1) + (g0 & 1) + (b0 & 1); - uint32_t p0 = low_bits0 >= 2; - - const uint32_t r1 = block_colors[high_selector].r; - const uint32_t g1 = block_colors[high_selector].g; - const uint32_t b1 = block_colors[high_selector].b; - const uint32_t low_bits1 = (r1 & 1) + (g1 & 1) + (b1 & 1); - uint32_t p1 = low_bits1 >= 2; - - pDst_block->m_lo.m_r0 = r0 >> 1; - pDst_block->m_lo.m_g0 = g0 >> 1; - pDst_block->m_lo.m_b0 = b0 >> 1; - pDst_block->m_lo.m_p0 = p0; - - pDst_block->m_lo.m_r1 = r1 >> 1; - pDst_block->m_lo.m_g1 = g1 >> 1; - pDst_block->m_lo.m_b1 = b1 >> 1; - - uint32_t output_low_selector = 0; - uint32_t output_bit_offset = 1; - uint64_t output_hi_bits = p1; - - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t s = pSelector->get_selector(x, y); - uint32_t os = (s == low_selector) ? output_low_selector : (15 ^ output_low_selector); - - uint32_t num_bits = 4; - - if ((x | y) == 0) - { - if (os & 8) - { - pDst_block->m_lo.m_r0 = r1 >> 1; - pDst_block->m_lo.m_g0 = g1 >> 1; - pDst_block->m_lo.m_b0 = b1 >> 1; - pDst_block->m_lo.m_p0 = p1; - - pDst_block->m_lo.m_r1 = r0 >> 1; - pDst_block->m_lo.m_g1 = g0 >> 1; - pDst_block->m_lo.m_b1 = b0 >> 1; - - output_hi_bits &= ~1ULL; - output_hi_bits |= p0; - std::swap(p0, p1); - - output_low_selector = 15; - os = 0; - } - - num_bits = 3; - } - - output_hi_bits |= (static_cast<uint64_t>(os) << output_bit_offset); - output_bit_offset += num_bits; - } - } - - pDst_block->m_hi_bits = output_hi_bits; - - assert(pDst_block->m_hi.m_p1 == p1); - - return; - } - - uint32_t selector_range_table = g_etc1_to_bc7_m6_selector_range_index[low_selector][high_selector]; - - const uint32_t* pTable_r = g_etc1_to_bc7_m6_table[base_color_r + inten_table * 32] + (selector_range_table * NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS); - const uint32_t* pTable_g = g_etc1_to_bc7_m6_table[base_color_g + inten_table * 32] + (selector_range_table * NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS); - const uint32_t* pTable_b = g_etc1_to_bc7_m6_table[base_color_b + inten_table * 32] + (selector_range_table * NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS); - -#if 1 - assert(NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS == 48); - - uint32_t best_err0 = UINT_MAX, best_err1 = UINT_MAX; - -#define DO_ITER2(idx) \ - { \ - uint32_t v0 = ((pTable_r[(idx)+0] + pTable_g[(idx)+0] + pTable_b[(idx)+0]) << 14) | ((idx) + 0); if (v0 < best_err0) best_err0 = v0; \ - uint32_t v1 = ((pTable_r[(idx)+1] + pTable_g[(idx)+1] + pTable_b[(idx)+1]) << 14) | ((idx) + 1); if (v1 < best_err1) best_err1 = v1; \ - } -#define DO_ITER4(idx) DO_ITER2(idx); DO_ITER2((idx) + 2); -#define DO_ITER8(idx) DO_ITER4(idx); DO_ITER4((idx) + 4); -#define DO_ITER16(idx) DO_ITER8(idx); DO_ITER8((idx) + 8); - - DO_ITER16(0); - DO_ITER16(16); - DO_ITER16(32); -#undef DO_ITER2 -#undef DO_ITER4 -#undef DO_ITER8 -#undef DO_ITER16 - - uint32_t best_err = basisu::minimum(best_err0, best_err1); - uint32_t best_mapping = best_err & 0xFF; - //best_err >>= 14; -#else - uint32_t best_err = UINT_MAX; - uint32_t best_mapping = 0; - assert((NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS % 2) == 0); - for (uint32_t m = 0; m < 
NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS; m += 2) - { -#define DO_ITER(idx) { uint32_t total_err = (pTable_r[idx] + pTable_g[idx] + pTable_b[idx]) & 0x3FFFF; if (total_err < best_err) { best_err = total_err; best_mapping = idx; } } - DO_ITER(m); - DO_ITER(m + 1); -#undef DO_ITER - } -#endif - - pDst_block->m_lo.m_mode = 64; - - pDst_block->m_lo.m_a0 = 127; - pDst_block->m_lo.m_a1 = 127; - - uint64_t v = 0; - const uint8_t* pSelectors_xlat; - - if (g_etc1_to_bc7_selector_mappings[best_mapping][pSelector->get_selector(0, 0)] & 8) - { - pDst_block->m_lo.m_r1 = (pTable_r[best_mapping] >> 18) & 0x7F; - pDst_block->m_lo.m_g1 = (pTable_g[best_mapping] >> 18) & 0x7F; - pDst_block->m_lo.m_b1 = (pTable_b[best_mapping] >> 18) & 0x7F; - - pDst_block->m_lo.m_r0 = (pTable_r[best_mapping] >> 25) & 0x7F; - pDst_block->m_lo.m_g0 = (pTable_g[best_mapping] >> 25) & 0x7F; - pDst_block->m_lo.m_b0 = (pTable_b[best_mapping] >> 25) & 0x7F; - - pDst_block->m_lo.m_p0 = 1; - pDst_block->m_hi.m_p1 = 0; - - v = 0; - pSelectors_xlat = &g_etc1_to_bc7_selector_mappings_inv[best_mapping][0]; - } - else - { - pDst_block->m_lo.m_r0 = (pTable_r[best_mapping] >> 18) & 0x7F; - pDst_block->m_lo.m_g0 = (pTable_g[best_mapping] >> 18) & 0x7F; - pDst_block->m_lo.m_b0 = (pTable_b[best_mapping] >> 18) & 0x7F; - - pDst_block->m_lo.m_r1 = (pTable_r[best_mapping] >> 25) & 0x7F; - pDst_block->m_lo.m_g1 = (pTable_g[best_mapping] >> 25) & 0x7F; - pDst_block->m_lo.m_b1 = (pTable_b[best_mapping] >> 25) & 0x7F; - - pDst_block->m_lo.m_p0 = 0; - pDst_block->m_hi.m_p1 = 1; - - v = 1; - pSelectors_xlat = &g_etc1_to_bc7_selector_mappings[best_mapping][0]; - } - - uint64_t v1 = 0, v2 = 0, v3 = 0; - -#define DO_X(x, s0, s1, s2, s3) { \ - v |= ((uint64_t)pSelectors_xlat[(pSelector->m_selectors[0] >> ((x) * 2)) & 3] << (s0)); \ - v1 |= ((uint64_t)pSelectors_xlat[(pSelector->m_selectors[1] >> ((x) * 2)) & 3] << (s1)); \ - v2 |= ((uint64_t)pSelectors_xlat[(pSelector->m_selectors[2] >> ((x) * 2)) & 3] << (s2)); \ - v3 |= ((uint64_t)pSelectors_xlat[(pSelector->m_selectors[3] >> ((x) * 2)) & 3] << (s3)); } - - // 1 4 8 12 - // 16 20 24 28 - // 32 36 40 44 - // 48 52 56 60 - - DO_X(0, 1, 16, 32, 48); - DO_X(1, 4, 20, 36, 52); - DO_X(2, 8, 24, 40, 56); - DO_X(3, 12, 28, 44, 60); -#undef DO_X - - pDst_block->m_hi_bits = v | v1 | v2 | v3; -#endif - - } -#endif // BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - #if BASISD_SUPPORT_BC7_MODE5 static dxt_selector_range g_etc1_to_bc7_m5_selector_ranges[] = { @@ -4085,7 +3834,7 @@ namespace basist assert(num_bits < 32); assert(val < (1ULL << num_bits)); - uint32_t mask = (1 << num_bits) - 1; + uint32_t mask = static_cast<uint32_t>((1ULL << num_bits) - 1); while (num_bits) { @@ -4425,9 +4174,11 @@ namespace basist { bc7_mode_5* pDst_block = static_cast<bc7_mode_5*>(pDst); + // First ensure the block is cleared to all 0's static_cast<uint64_t*>(pDst)[0] = 0; static_cast<uint64_t*>(pDst)[1] = 0; + // Set alpha to 255 pDst_block->m_lo.m_mode = 1 << 5; pDst_block->m_lo.m_a0 = 255; pDst_block->m_lo.m_a1_0 = 63; @@ -4690,7 +4441,11 @@ namespace basist set_block_bits((uint8_t*)pDst, output_bits, 31, 97); } #endif // BASISD_SUPPORT_BC7_MODE5 - + +#if BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_UASTC + static const uint8_t g_etc2_eac_a8_sel4[6] = { 0x92, 0x49, 0x24, 0x92, 0x49, 0x24 }; +#endif + #if BASISD_SUPPORT_ETC2_EAC_A8 static void convert_etc1s_to_etc2_eac_a8(eac_block* pDst_block, const endpoint* pEndpoints, const selector* pSelector) { @@ -4712,8 +4467,7 @@ namespace basist pDst_block->m_multiplier = 1; // selectors are all 4's - 
static const uint8_t s_etc2_eac_a8_sel4[6] = { 0x92, 0x49, 0x24, 0x92, 0x49, 0x24 }; - memcpy(pDst_block->m_selectors, s_etc2_eac_a8_sel4, sizeof(s_etc2_eac_a8_sel4)); + memcpy(pDst_block->m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); return; } @@ -5325,31 +5079,30 @@ namespace basist #endif -#if BASISD_SUPPORT_ASTC - struct astc_block_params - { - // 2 groups of 5, but only a max of 8 are used (RRGGBBAA00) - uint8_t m_endpoints[10]; - uint8_t m_weights[32]; - }; - +#if BASISD_SUPPORT_UASTC || BASISD_SUPPORT_ASTC // Table encodes 5 trits to 8 output bits. 3^5 entries. // Inverse of the trit bit manipulation process in https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding - static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39, - 43, 51, 55, 59, 44, 45, 46, 64, 65, 66, 68, 69, 70, 72, 73, 74, 80, 81, 82, 84, 85, 86, 88, 89, 90, 67, 71, 75, 83, 87, 91, 76, 77, 78, 128, 129, 130, 132, 133, 134, 136, 137, 138, 144, 145, 146, 148, 149, 150, 152, 153, 154, - 131, 135, 139, 147, 151, 155, 140, 141, 142, 160, 161, 162, 164, 165, 166, 168, 169, 170, 176, 177, 178, 180, 181, 182, 184, 185, 186, 163, 167, 171, 179, 183, 187, 172, 173, 174, 192, 193, 194, 196, 197, 198, 200, 201, 202, - 208, 209, 210, 212, 213, 214, 216, 217, 218, 195, 199, 203, 211, 215, 219, 204, 205, 206, 96, 97, 98, 100, 101, 102, 104, 105, 106, 112, 113, 114, 116, 117, 118, 120, 121, 122, 99, 103, 107, 115, 119, 123, 108, 109, 110, 224, - 225, 226, 228, 229, 230, 232, 233, 234, 240, 241, 242, 244, 245, 246, 248, 249, 250, 227, 231, 235, 243, 247, 251, 236, 237, 238, 28, 29, 30, 60, 61, 62, 92, 93, 94, 156, 157, 158, 188, 189, 190, 220, 221, 222, 31, 63, 95, 159, + static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39, + 43, 51, 55, 59, 44, 45, 46, 64, 65, 66, 68, 69, 70, 72, 73, 74, 80, 81, 82, 84, 85, 86, 88, 89, 90, 67, 71, 75, 83, 87, 91, 76, 77, 78, 128, 129, 130, 132, 133, 134, 136, 137, 138, 144, 145, 146, 148, 149, 150, 152, 153, 154, + 131, 135, 139, 147, 151, 155, 140, 141, 142, 160, 161, 162, 164, 165, 166, 168, 169, 170, 176, 177, 178, 180, 181, 182, 184, 185, 186, 163, 167, 171, 179, 183, 187, 172, 173, 174, 192, 193, 194, 196, 197, 198, 200, 201, 202, + 208, 209, 210, 212, 213, 214, 216, 217, 218, 195, 199, 203, 211, 215, 219, 204, 205, 206, 96, 97, 98, 100, 101, 102, 104, 105, 106, 112, 113, 114, 116, 117, 118, 120, 121, 122, 99, 103, 107, 115, 119, 123, 108, 109, 110, 224, + 225, 226, 228, 229, 230, 232, 233, 234, 240, 241, 242, 244, 245, 246, 248, 249, 250, 227, 231, 235, 243, 247, 251, 236, 237, 238, 28, 29, 30, 60, 61, 62, 92, 93, 94, 156, 157, 158, 188, 189, 190, 220, 221, 222, 31, 63, 95, 159, 191, 223, 124, 125, 126 }; + // Extracts bits [low,high] + static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high) + { + return (bits >> low) & ((1 << (high - low + 1)) - 1); + } + // Writes bits to output in an endian safe way - static inline void astc_set_bits(uint32_t *pOutput, int &bit_pos, uint32_t value, int total_bits) + static inline void astc_set_bits(uint32_t* pOutput, int& bit_pos, uint32_t value, uint32_t total_bits) { uint8_t* pBytes = reinterpret_cast<uint8_t*>(pOutput); - + while 
(total_bits) { - const uint32_t bits_to_write = std::min(total_bits, 8 - (bit_pos & 7)); + const uint32_t bits_to_write = basisu::minimum<int>(total_bits, 8 - (bit_pos & 7)); pBytes[bit_pos >> 3] |= static_cast<uint8_t>(value << (bit_pos & 7)); @@ -5359,14 +5112,8 @@ namespace basist } } - // Extracts bits [low,high] - static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high) - { - return (bits >> low) & ((1 << (high - low + 1)) - 1); - } - // Encodes 5 values to output, usable for any range that uses trits and bits - static void astc_encode_trits(uint32_t *pOutput, const uint8_t *pValues, int& bit_pos, int n) + static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n) { // First extract the trits and the bits from the 5 input values int trits = 0, bits[5]; @@ -5374,9 +5121,9 @@ namespace basist for (int i = 0; i < 5; i++) { static const int s_muls[5] = { 1, 3, 9, 27, 81 }; - + const int t = pValues[i] >> n; - + trits += t * s_muls[i]; bits[i] = pValues[i] & bit_mask; } @@ -5386,14 +5133,23 @@ namespace basist assert(trits < 243); const int T = g_astc_trit_encode[trits]; - + // Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94. astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2); - astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) | + astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) | (bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6); } +#endif // #if BASISD_SUPPORT_UASTC || BASISD_SUPPORT_ASTC +#if BASISD_SUPPORT_ASTC + struct astc_block_params + { + // 2 groups of 5, but only a max of 8 are used (RRGGBBAA00) + uint8_t m_endpoints[10]; + uint8_t m_weights[32]; + }; + // Packs a single format ASTC block using Color Endpoint Mode 12 (LDR RGBA direct), endpoint BISE range 13, 2-bit weights (range 2). // We're always going to output blocks containing alpha, even if the input doesn't have alpha, for simplicity. // Each block always has 4x4 weights, uses range 13 BISE encoding on the endpoints (0-47), and each weight ranges from 0-3. This encoding should be roughly equal in quality vs. BC1 for color. 
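The hunk above keeps astc_encode_trits() in line with the ASTC integer sequence encoding: each quantized value is split into a trit (the part above its n low bits) and the n pass-through bits, the five trits are folded into a single base-3 index, and g_astc_trit_encode maps that index to the 8 interleaved trit bits described in Table 94 of the Khronos Data Format spec. Below is a minimal standalone sketch of just the split-and-fold step; the helper name fold_trits is illustrative and not part of the transcoder.

```
#include <cassert>
#include <cstdint>

// Fold five quantized values of an ASTC trit range (each value < 3 << n) into
// the base-3 index 0..242 used to look up g_astc_trit_encode, while keeping
// the n low "pass-through" bits of each value, as astc_encode_trits() does.
static uint32_t fold_trits(const uint8_t pValues[5], int n, int pBits[5])
{
    static const int s_muls[5] = { 1, 3, 9, 27, 81 };
    const int bit_mask = (1 << n) - 1;

    uint32_t trits = 0;
    for (int i = 0; i < 5; i++)
    {
        const int t = pValues[i] >> n;    // the trit: 0, 1 or 2
        assert(t < 3);
        trits += t * s_muls[i];           // base-3 packing, first value is the least significant trit
        pBits[i] = pValues[i] & bit_mask; // low bits are emitted unchanged
    }
    assert(trits < 243);                  // 3^5 possible trit combinations
    return trits;
}
```

The returned index selects one of the 243 precomputed 8-bit trit codes; the two astc_set_bits() calls in the hunk above then interleave those 8 bits with the pass-through bits in the order given by Table 94.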
@@ -6255,12 +6011,15 @@ namespace basist static const etc1s_to_atc_solution g_etc1s_to_pvrtc2_45[32 * 8 * NUM_ETC1S_TO_ATC_SELECTOR_MAPPINGS * NUM_ETC1S_TO_ATC_SELECTOR_RANGES] = { #include "basisu_transcoder_tables_pvrtc2_45.inc" }; - + +#if 0 static const etc1s_to_atc_solution g_etc1s_to_pvrtc2_alpha_33[32 * 8 * NUM_ETC1S_TO_ATC_SELECTOR_MAPPINGS * NUM_ETC1S_TO_ATC_SELECTOR_RANGES] = { #include "basisu_transcoder_tables_pvrtc2_alpha_33.inc" }; #endif +#endif + static const etc1s_to_atc_solution g_etc1s_to_atc_55[32 * 8 * NUM_ETC1S_TO_ATC_SELECTOR_MAPPINGS * NUM_ETC1S_TO_ATC_SELECTOR_RANGES] = { #include "basisu_transcoder_tables_atc_55.inc" }; @@ -7167,10 +6926,7 @@ namespace basist } typedef struct { float c[4]; } vec4F; - - static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } - static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } - static inline float saturate(float value) { return clampf(value, 0, 1.0f); } + static inline vec4F* vec4F_set_scalar(vec4F* pV, float x) { pV->c[0] = x; pV->c[1] = x; pV->c[2] = x; pV->c[3] = x; return pV; } static inline vec4F* vec4F_set(vec4F* pV, float x, float y, float z, float w) { pV->c[0] = x; pV->c[1] = y; pV->c[2] = z; pV->c[3] = w; return pV; } static inline vec4F* vec4F_saturate_in_place(vec4F* pV) { pV->c[0] = saturate(pV->c[0]); pV->c[1] = saturate(pV->c[1]); pV->c[2] = saturate(pV->c[2]); pV->c[3] = saturate(pV->c[3]); return pV; } @@ -7188,7 +6944,7 @@ namespace basist } static inline int sq(int x) { return x * x; } - + // PVRTC2 is a slightly borked format for alpha: In Non-Interpolated mode, the way AlphaB8 is exanded from 4 to 8 bits means it can never be 0. // This is actually very bad, because on 100% transparent blocks which have non-trivial color pixels, part of the color channel will leak into alpha! // And there's nothing straightforward we can do because using the other modes is too expensive/complex. I can see why Apple didn't adopt it. 
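The "can never be 0" remark in the comment above follows from the reconstruction visible in the hunks below: endpoint B's stored alpha bits get a hard-wired 1 appended, ((h << 1) | 1), before the 4-to-8 bit nibble replication, so the smallest decodable alpha is 0x11 (17). A standalone illustration of just that arithmetic, under the assumption that this is the AlphaB8 expansion the comment refers to; expand_alpha_b8 is an illustrative name, not a transcoder function.

```
#include <cstdint>
#include <cstdio>

// Reproduce the endpoint-B alpha expansion pattern used in the hunks below:
// append a forced 1 to the stored bits, then replicate the nibble to 8 bits.
static uint32_t expand_alpha_b8(uint32_t h) // h = 0..7 (stored bits)
{
    uint32_t he = (h << 1) | 1; // 4 bits, low bit always set
    he = (he << 4) | he;        // nibble replication: abcd -> abcdabcd
    return he;                  // 0x11, 0x33, ..., 0xFF; never 0
}

int main()
{
    for (uint32_t h = 0; h < 8; h++)
        printf("stored %u -> decoded alpha %u\n", (unsigned)h, (unsigned)expand_alpha_b8(h));
    return 0;
}
```

Since 17 is the floor, pixels referencing endpoint B can never decode to fully transparent, which is the leak on 100% transparent blocks that the comment warns about.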
@@ -7461,7 +7217,7 @@ namespace basist } vec4F_normalize_in_place(&axis); - + if (vec4F_dot(&axis, &axis) < .5f) vec4F_set_scalar(&axis, .5f); @@ -7488,7 +7244,15 @@ namespace basist minColor = vec4F_saturate(&c0); maxColor = vec4F_saturate(&c1); if (minColor.c[3] > maxColor.c[3]) - std::swap(minColor, maxColor); + { + // VS 2019 release Code Generator issue + //std::swap(minColor, maxColor); + + float a = minColor.c[0], b = minColor.c[1], c = minColor.c[2], d = minColor.c[3]; + minColor.c[0] = maxColor.c[0]; minColor.c[1] = maxColor.c[1]; minColor.c[2] = maxColor.c[2]; minColor.c[3] = maxColor.c[3]; + minColor.c[0] = maxColor.c[0]; minColor.c[1] = maxColor.c[1]; minColor.c[2] = maxColor.c[2]; minColor.c[3] = maxColor.c[3]; + maxColor.c[0] = a; maxColor.c[1] = b; maxColor.c[2] = c; maxColor.c[3] = d; + } } else { @@ -7648,7 +7412,7 @@ namespace basist uint32_t m = (le * 5 + he * 3) / 8; - int err = labs((int)v - (int)m); + int err = (int)labs((int)v - (int)m); if (err < lowest_err) { lowest_err = err; @@ -7671,7 +7435,7 @@ namespace basist uint32_t le = (l << 1); le = (le << 4) | le; - int err = labs((int)v - (int)le); + int err = (int)labs((int)v - (int)le); if (err < lowest_err) { lowest_err = err; @@ -7693,7 +7457,7 @@ namespace basist uint32_t he = (h << 1) | 1; he = (he << 4) | he; - int err = labs((int)v - (int)he); + int err = (int)labs((int)v - (int)he); if (err < lowest_err) { lowest_err = err; @@ -7722,7 +7486,7 @@ namespace basist uint32_t m = (le * 5 + he * 3) / 8; - int err = labs((int)v - (int)m); + int err = (int)labs((int)v - (int)m); if (err < lowest_err) { lowest_err = err; @@ -7752,7 +7516,7 @@ namespace basist uint32_t m = (le * 5 + he * 3) / 8; - int err = labs((int)v - (int)m); + int err = (int)labs((int)v - (int)m); if (err < lowest_err) { lowest_err = err; @@ -7768,59 +7532,65 @@ namespace basist } #endif // BASISD_SUPPORT_PVRTC2 - basisu_lowlevel_transcoder::basisu_lowlevel_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook) : + basisu_lowlevel_etc1s_transcoder::basisu_lowlevel_etc1s_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook) : + m_pGlobal_codebook(nullptr), m_pGlobal_sel_codebook(pGlobal_sel_codebook), m_selector_history_buf_size(0) { } - bool basisu_lowlevel_transcoder::decode_palettes( + bool basisu_lowlevel_etc1s_transcoder::decode_palettes( uint32_t num_endpoints, const uint8_t* pEndpoints_data, uint32_t endpoints_data_size, uint32_t num_selectors, const uint8_t* pSelectors_data, uint32_t selectors_data_size) { + if (m_pGlobal_codebook) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 11\n"); + return false; + } bitwise_decoder sym_codec; huffman_decoding_table color5_delta_model0, color5_delta_model1, color5_delta_model2, inten_delta_model; if (!sym_codec.init(pEndpoints_data, endpoints_data_size)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 0\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 0\n"); return false; } if (!sym_codec.read_huffman_table(color5_delta_model0)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 1\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 1\n"); return false; } if (!sym_codec.read_huffman_table(color5_delta_model1)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 1a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 1a\n"); return false; } if 
(!sym_codec.read_huffman_table(color5_delta_model2)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2a\n"); return false; } if (!sym_codec.read_huffman_table(inten_delta_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2b\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2b\n"); return false; } if (!color5_delta_model0.is_valid() || !color5_delta_model1.is_valid() || !color5_delta_model2.is_valid() || !inten_delta_model.is_valid()) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2b\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2b\n"); return false; } const bool endpoints_are_grayscale = sym_codec.get_bits(1) != 0; - m_endpoints.resize(num_endpoints); + m_local_endpoints.resize(num_endpoints); color32 prev_color5(16, 16, 16, 0); uint32_t prev_inten = 0; @@ -7828,8 +7598,8 @@ namespace basist for (uint32_t i = 0; i < num_endpoints; i++) { uint32_t inten_delta = sym_codec.decode_huffman(inten_delta_model); - m_endpoints[i].m_inten5 = static_cast<uint8_t>((inten_delta + prev_inten) & 7); - prev_inten = m_endpoints[i].m_inten5; + m_local_endpoints[i].m_inten5 = static_cast<uint8_t>((inten_delta + prev_inten) & 7); + prev_inten = m_local_endpoints[i].m_inten5; for (uint32_t c = 0; c < (endpoints_are_grayscale ? 1U : 3U); c++) { @@ -7843,25 +7613,25 @@ namespace basist int v = (prev_color5[c] + delta) & 31; - m_endpoints[i].m_color5[c] = static_cast<uint8_t>(v); + m_local_endpoints[i].m_color5[c] = static_cast<uint8_t>(v); prev_color5[c] = static_cast<uint8_t>(v); } if (endpoints_are_grayscale) { - m_endpoints[i].m_color5[1] = m_endpoints[i].m_color5[0]; - m_endpoints[i].m_color5[2] = m_endpoints[i].m_color5[0]; + m_local_endpoints[i].m_color5[1] = m_local_endpoints[i].m_color5[0]; + m_local_endpoints[i].m_color5[2] = m_local_endpoints[i].m_color5[0]; } } sym_codec.stop(); - m_selectors.resize(num_selectors); + m_local_selectors.resize(num_selectors); if (!sym_codec.init(pSelectors_data, selectors_data_size)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 5\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 5\n"); return false; } @@ -7880,12 +7650,12 @@ namespace basist { if (!sym_codec.read_huffman_table(mod_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 6\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 6\n"); return false; } if (!mod_model.is_valid()) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 6a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 6a\n"); return false; } } @@ -7902,7 +7672,7 @@ namespace basist if (pal_index >= m_pGlobal_sel_codebook->size()) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7z\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 7z\n"); return false; } @@ -7911,9 +7681,9 @@ namespace basist // TODO: Optimize this for (uint32_t y = 0; y < 4; y++) for (uint32_t x = 0; x < 4; x++) - m_selectors[i].set_selector(x, y, e[x + y * 4]); + m_local_selectors[i].set_selector(x, y, e[x + y * 4]); - m_selectors[i].init_flags(); + m_local_selectors[i].init_flags(); } } else @@ -7928,12 +7698,12 @@ namespace basist basist::huffman_decoding_table uses_global_cb_bitflags_model; if 
(!sym_codec.read_huffman_table(uses_global_cb_bitflags_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 7\n"); return false; } if (!uses_global_cb_bitflags_model.is_valid()) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 7a\n"); return false; } @@ -7942,12 +7712,12 @@ namespace basist { if (!sym_codec.read_huffman_table(global_mod_indices_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 8\n"); return false; } if (!global_mod_indices_model.is_valid()) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 8a\n"); return false; } } @@ -7975,7 +7745,7 @@ namespace basist if (pal_index >= m_pGlobal_sel_codebook->size()) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8b\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 8b\n"); return false; } @@ -7983,7 +7753,7 @@ namespace basist for (uint32_t y = 0; y < 4; y++) for (uint32_t x = 0; x < 4; x++) - m_selectors[q].set_selector(x, y, e[x + y * 4]); + m_local_selectors[q].set_selector(x, y, e[x + y * 4]); } else { @@ -7992,11 +7762,11 @@ namespace basist uint32_t cur_byte = sym_codec.get_bits(8); for (uint32_t k = 0; k < 4; k++) - m_selectors[q].set_selector(k, j, (cur_byte >> (k * 2)) & 3); + m_local_selectors[q].set_selector(k, j, (cur_byte >> (k * 2)) & 3); } } - m_selectors[q].init_flags(); + m_local_selectors[q].init_flags(); } } else @@ -8012,23 +7782,23 @@ namespace basist uint32_t cur_byte = sym_codec.get_bits(8); for (uint32_t k = 0; k < 4; k++) - m_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3); + m_local_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3); } - m_selectors[i].init_flags(); + m_local_selectors[i].init_flags(); } } else { if (!sym_codec.read_huffman_table(delta_selector_pal_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 10\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 10\n"); return false; } if ((num_selectors > 1) && (!delta_selector_pal_model.is_valid())) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 10a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 10a\n"); return false; } @@ -8044,9 +7814,9 @@ namespace basist prev_bytes[j] = static_cast<uint8_t>(cur_byte); for (uint32_t k = 0; k < 4; k++) - m_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3); + m_local_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3); } - m_selectors[i].init_flags(); + m_local_selectors[i].init_flags(); continue; } @@ -8058,9 +7828,9 @@ namespace basist prev_bytes[j] = static_cast<uint8_t>(cur_byte); for (uint32_t k = 0; k < 4; k++) - m_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3); + m_local_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3); } - m_selectors[i].init_flags(); + m_local_selectors[i].init_flags(); } } } @@ -8071,60 +7841,60 @@ namespace basist return true; } - bool basisu_lowlevel_transcoder::decode_tables(const uint8_t* pTable_data, uint32_t table_data_size) + bool basisu_lowlevel_etc1s_transcoder::decode_tables(const uint8_t* pTable_data, 
uint32_t table_data_size) { basist::bitwise_decoder sym_codec; if (!sym_codec.init(pTable_data, table_data_size)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 0\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 0\n"); return false; } if (!sym_codec.read_huffman_table(m_endpoint_pred_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 1\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 1\n"); return false; } if (m_endpoint_pred_model.get_code_sizes().size() == 0) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 1a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 1a\n"); return false; } if (!sym_codec.read_huffman_table(m_delta_endpoint_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 2\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 2\n"); return false; } if (m_delta_endpoint_model.get_code_sizes().size() == 0) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 2a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 2a\n"); return false; } if (!sym_codec.read_huffman_table(m_selector_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 3\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 3\n"); return false; } if (m_selector_model.get_code_sizes().size() == 0) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 3a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 3a\n"); return false; } if (!sym_codec.read_huffman_table(m_selector_history_buf_rle_model)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 4\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 4\n"); return false; } if (m_selector_history_buf_rle_model.get_code_sizes().size() == 0) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 4a\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_tables: fail 4a\n"); return false; } @@ -8135,27 +7905,37 @@ namespace basist return true; } - bool basisu_lowlevel_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, - uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels, + bool basisu_lowlevel_etc1s_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, bool transcode_alpha, void *pAlpha_blocks, uint32_t output_rows_in_pixels) { - (void)transcode_alpha; - (void)pAlpha_blocks; + // 'pDst_blocks' unused when disabling *all* hardware transcode options + // (and 'bc1_allow_threecolor_blocks' when disabling DXT) + BASISU_NOTE_UNUSED(pDst_blocks); + BASISU_NOTE_UNUSED(bc1_allow_threecolor_blocks); + BASISU_NOTE_UNUSED(transcode_alpha); + BASISU_NOTE_UNUSED(pAlpha_blocks); + + 
assert(g_transcoder_initialized); + if (!g_transcoder_initialized) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: Transcoder not globally initialized.\n"); + return false; + } if (!pState) pState = &m_def_state; - const bool is_video = (header.m_tex_type == cBASISTexTypeVideoFrames); const uint32_t total_blocks = num_blocks_x * num_blocks_y; if (!output_row_pitch_in_blocks_or_pixels) { if (basis_block_format_is_uncompressed(fmt)) - output_row_pitch_in_blocks_or_pixels = slice_desc.m_orig_width; + output_row_pitch_in_blocks_or_pixels = orig_width; else { if (fmt == block_format::cFXT1_RGB) - output_row_pitch_in_blocks_or_pixels = (slice_desc.m_orig_width + 7) / 8; + output_row_pitch_in_blocks_or_pixels = (orig_width + 7) / 8; else output_row_pitch_in_blocks_or_pixels = num_blocks_x; } @@ -8164,23 +7944,23 @@ namespace basist if (basis_block_format_is_uncompressed(fmt)) { if (!output_rows_in_pixels) - output_rows_in_pixels = slice_desc.m_orig_height; + output_rows_in_pixels = orig_height; } - std::vector<uint32_t>* pPrev_frame_indices = nullptr; + basisu::vector<uint32_t>* pPrev_frame_indices = nullptr; if (is_video) { // TODO: Add check to make sure the caller hasn't tried skipping past p-frames - const bool alpha_flag = (slice_desc.m_flags & cSliceDescFlagsIsAlphaData) != 0; - const uint32_t level_index = slice_desc.m_level_index; + //const bool alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + //const uint32_t level_index = slice_desc.m_level_index; if (level_index >= basisu_transcoder_state::cMaxPrevFrameLevels) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: unsupported level_index\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: unsupported level_index\n"); return false; } - pPrev_frame_indices = &pState->m_prev_frame_indices[alpha_flag][level_index]; + pPrev_frame_indices = &pState->m_prev_frame_indices[is_alpha_slice][level_index]; if (pPrev_frame_indices->size() < total_blocks) pPrev_frame_indices->resize(total_blocks); } @@ -8189,14 +7969,12 @@ namespace basist if (!sym_codec.init(pImage_data, image_data_size)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: sym_codec.init failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: sym_codec.init failed\n"); return false; } approx_move_to_front selector_history_buf(m_selector_history_buf_size); - - const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = (uint32_t)m_selectors.size(); - const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = m_selector_history_buf_size + SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX; + uint32_t cur_selector_rle_count = 0; decoder_etc_block block; @@ -8212,7 +7990,7 @@ namespace basist pPVRTC_work_mem = malloc(num_blocks_x * num_blocks_y * (sizeof(decoder_etc_block) + sizeof(uint32_t))); if (!pPVRTC_work_mem) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: malloc failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: malloc failed\n"); return false; } pPVRTC_endpoints = (uint32_t*) & ((decoder_etc_block*)pPVRTC_work_mem)[num_blocks_x * num_blocks_y]; @@ -8228,6 +8006,16 @@ namespace basist int prev_endpoint_pred_sym = 0; int endpoint_pred_repeat_count = 0; uint32_t prev_endpoint_index = 0; + const endpoint_vec& endpoints = m_pGlobal_codebook ? m_pGlobal_codebook->m_local_endpoints : m_local_endpoints; + const selector_vec& selectors = m_pGlobal_codebook ? 
m_pGlobal_codebook->m_local_selectors : m_local_selectors; + if (!endpoints.size() || !selectors.size()) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: global codebooks must be unpacked first\n"); + return false; + } + + const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = (uint32_t)selectors.size(); + const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = m_selector_history_buf_size + SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX; for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) { @@ -8279,7 +8067,7 @@ namespace basist // Left if (!block_x) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (0)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: invalid datastream (0)\n"); if (pPVRTC_work_mem) free(pPVRTC_work_mem); return false; @@ -8292,7 +8080,7 @@ namespace basist // Upper if (!block_y) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (1)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: invalid datastream (1)\n"); if (pPVRTC_work_mem) free(pPVRTC_work_mem); return false; @@ -8314,7 +8102,7 @@ namespace basist // Upper left if ((!block_x) || (!block_y)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (2)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: invalid datastream (2)\n"); if (pPVRTC_work_mem) free(pPVRTC_work_mem); return false; @@ -8329,8 +8117,8 @@ namespace basist const uint32_t delta_sym = sym_codec.decode_huffman(m_delta_endpoint_model); endpoint_index = delta_sym + prev_endpoint_index; - if (endpoint_index >= m_endpoints.size()) - endpoint_index -= (int)m_endpoints.size(); + if (endpoint_index >= endpoints.size()) + endpoint_index -= (int)endpoints.size(); } pState->m_block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_endpoint_index = (uint16_t)endpoint_index; @@ -8345,7 +8133,7 @@ namespace basist { cur_selector_rle_count--; - selector_sym = (int)m_selectors.size(); + selector_sym = (int)selectors.size(); } else { @@ -8363,28 +8151,28 @@ namespace basist if (cur_selector_rle_count > total_blocks) { // The file is corrupted or we've got a bug. - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (3)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: invalid datastream (3)\n"); if (pPVRTC_work_mem) free(pPVRTC_work_mem); return false; } - selector_sym = (int)m_selectors.size(); + selector_sym = (int)selectors.size(); cur_selector_rle_count--; } } - if (selector_sym >= (int)m_selectors.size()) + if (selector_sym >= (int)selectors.size()) { assert(m_selector_history_buf_size > 0); - int history_buf_index = selector_sym - (int)m_selectors.size(); + int history_buf_index = selector_sym - (int)selectors.size(); if (history_buf_index >= (int)selector_history_buf.size()) { // The file is corrupted or we've got a bug. - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (4)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: invalid datastream (4)\n"); if (pPVRTC_work_mem) free(pPVRTC_work_mem); return false; @@ -8404,10 +8192,10 @@ namespace basist } } - if ((endpoint_index >= m_endpoints.size()) || (selector_index >= m_selectors.size())) + if ((endpoint_index >= endpoints.size()) || (selector_index >= selectors.size())) { // The file is corrupted or we've got a bug. 
- BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (5)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: invalid datastream (5)\n"); if (pPVRTC_work_mem) free(pPVRTC_work_mem); return false; @@ -8428,8 +8216,8 @@ namespace basist } #endif - const endpoint* pEndpoints = &m_endpoints[endpoint_index]; - const selector* pSelector = &m_selectors[selector_index]; + const endpoint* pEndpoints = &endpoints[endpoint_index]; + const selector* pSelector = &selectors[selector_index]; switch (fmt) { @@ -8448,9 +8236,8 @@ namespace basist } case block_format::cBC1: { - void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks_or_pixels) * output_block_or_pixel_stride_in_bytes; - #if BASISD_SUPPORT_DXT1 + void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks_or_pixels) * output_block_or_pixel_stride_in_bytes; #if BASISD_ENABLE_DEBUG_FLAGS if (g_debug_flags & (cDebugFlagVisBC1Sels | cDebugFlagVisBC1Endpoints)) convert_etc1s_to_dxt1_vis(static_cast<dxt1_block*>(pDst_block), pEndpoints, pSelector, bc1_allow_threecolor_blocks); @@ -8534,8 +8321,8 @@ namespace basist const uint16_t* pAlpha_block = reinterpret_cast<uint16_t*>(static_cast<uint8_t*>(pAlpha_blocks) + (block_x + block_y * num_blocks_x) * sizeof(uint32_t)); - const endpoint* pAlpha_endpoints = &m_endpoints[pAlpha_block[0]]; - const selector* pAlpha_selector = &m_selectors[pAlpha_block[1]]; + const endpoint* pAlpha_endpoints = &endpoints[pAlpha_block[0]]; + const selector* pAlpha_selector = &selectors[pAlpha_block[1]]; const color32& alpha_base_color = pAlpha_endpoints->m_color5; const uint32_t alpha_inten_table = pAlpha_endpoints->m_inten5; @@ -8559,16 +8346,7 @@ namespace basist break; } - case block_format::cBC7_M6_OPAQUE_ONLY: - { -#if BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks_or_pixels) * output_block_or_pixel_stride_in_bytes; - convert_etc1s_to_bc7_m6(static_cast<bc7_mode_6*>(pDst_block), pEndpoints, pSelector); -#else - assert(0); -#endif - break; - } + case block_format::cBC7: // for more consistency with UASTC case block_format::cBC7_M5_COLOR: { #if BASISD_SUPPORT_BC7_MODE5 @@ -8603,7 +8381,7 @@ namespace basist { #if BASISD_SUPPORT_ASTC void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks_or_pixels) * output_block_or_pixel_stride_in_bytes; - convert_etc1s_to_astc_4x4(pDst_block, pEndpoints, pSelector, transcode_alpha, &m_endpoints[0], &m_selectors[0]); + convert_etc1s_to_astc_4x4(pDst_block, pEndpoints, pSelector, transcode_alpha, &endpoints[0], &selectors[0]); #else assert(0); #endif @@ -8648,8 +8426,8 @@ namespace basist assert(transcode_alpha); void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks_or_pixels) * output_block_or_pixel_stride_in_bytes; - - convert_etc1s_to_pvrtc2_rgba(pDst_block, pEndpoints, pSelector, &m_endpoints[0], &m_selectors[0]); + + convert_etc1s_to_pvrtc2_rgba(pDst_block, pEndpoints, pSelector, &endpoints[0], &selectors[0]); #endif break; } @@ -8665,8 +8443,8 @@ namespace basist assert(sizeof(uint32_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t); - const uint32_t max_x = basisu::minimum<int>(4, 
output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); int colors[4]; decoder_etc_block::get_block_colors5_g(colors, pEndpoints->m_color5, pEndpoints->m_inten5); @@ -8705,8 +8483,8 @@ namespace basist assert(sizeof(uint32_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t); - const uint32_t max_x = basisu::minimum<int>(4, output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); color32 colors[4]; decoder_etc_block::get_block_colors5(colors, pEndpoints->m_color5, pEndpoints->m_inten5); @@ -8734,8 +8512,8 @@ namespace basist assert(sizeof(uint32_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t); - const uint32_t max_x = basisu::minimum<int>(4, output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); color32 colors[4]; decoder_etc_block::get_block_colors5(colors, pEndpoints->m_color5, pEndpoints->m_inten5); @@ -8765,8 +8543,8 @@ namespace basist assert(sizeof(uint16_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint16_t); - const uint32_t max_x = basisu::minimum<int>(4, output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); color32 colors[4]; decoder_etc_block::get_block_colors5(colors, pEndpoints->m_color5, pEndpoints->m_inten5); @@ -8775,12 +8553,20 @@ namespace basist if (fmt == block_format::cRGB565) { for (uint32_t i = 0; i < 4; i++) - packed_colors[i] = static_cast<uint16_t>(((colors[i].r >> 3) << 11) | ((colors[i].g >> 2) << 5) | (colors[i].b >> 3)); + { + packed_colors[i] = static_cast<uint16_t>((mul_8(colors[i].r, 31) << 11) | (mul_8(colors[i].g, 63) << 5) | mul_8(colors[i].b, 31)); + if (BASISD_IS_BIG_ENDIAN) + packed_colors[i] = byteswap_uint16(packed_colors[i]); + } } else { for (uint32_t i = 0; i < 4; i++) - packed_colors[i] = static_cast<uint16_t>(((colors[i].b >> 3) << 11) | ((colors[i].g >> 2) << 5) | (colors[i].r >> 3)); + { + packed_colors[i] = static_cast<uint16_t>((mul_8(colors[i].b, 31) << 11) | (mul_8(colors[i].g, 63) << 5) | mul_8(colors[i].r, 31)); + if (BASISD_IS_BIG_ENDIAN) + packed_colors[i] = 
byteswap_uint16(packed_colors[i]); + } } for (uint32_t y = 0; y < max_y; y++) @@ -8800,15 +8586,17 @@ namespace basist assert(sizeof(uint16_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint16_t); - const uint32_t max_x = basisu::minimum<int>(4, output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); color32 colors[4]; decoder_etc_block::get_block_colors5(colors, pEndpoints->m_color5, pEndpoints->m_inten5); uint16_t packed_colors[4]; for (uint32_t i = 0; i < 4; i++) - packed_colors[i] = static_cast<uint16_t>(((colors[i].r >> 4) << 12) | ((colors[i].g >> 4) << 8) | ((colors[i].b >> 4) << 4)); + { + packed_colors[i] = static_cast<uint16_t>((mul_8(colors[i].r, 15) << 12) | (mul_8(colors[i].g, 15) << 8) | (mul_8(colors[i].b, 15) << 4)); + } for (uint32_t y = 0; y < max_y; y++) { @@ -8817,7 +8605,14 @@ namespace basist for (uint32_t x = 0; x < max_x; x++) { uint16_t cur = reinterpret_cast<uint16_t*>(pDst_pixels)[x]; + if (BASISD_IS_BIG_ENDIAN) + cur = byteswap_uint16(cur); + cur = (cur & 0xF) | packed_colors[(s >> (x * 2)) & 3]; + + if (BASISD_IS_BIG_ENDIAN) + cur = byteswap_uint16(cur); + reinterpret_cast<uint16_t*>(pDst_pixels)[x] = cur; } @@ -8831,15 +8626,19 @@ namespace basist assert(sizeof(uint16_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint16_t); - const uint32_t max_x = basisu::minimum<int>(4, output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); color32 colors[4]; decoder_etc_block::get_block_colors5(colors, pEndpoints->m_color5, pEndpoints->m_inten5); uint16_t packed_colors[4]; for (uint32_t i = 0; i < 4; i++) - packed_colors[i] = static_cast<uint16_t>(((colors[i].r >> 4) << 12) | ((colors[i].g >> 4) << 8) | ((colors[i].b >> 4) << 4) | 0xF); + { + packed_colors[i] = static_cast<uint16_t>((mul_8(colors[i].r, 15) << 12) | (mul_8(colors[i].g, 15) << 8) | (mul_8(colors[i].b, 15) << 4) | 0xF); + if (BASISD_IS_BIG_ENDIAN) + packed_colors[i] = byteswap_uint16(packed_colors[i]); + } for (uint32_t y = 0; y < max_y; y++) { @@ -8858,22 +8657,28 @@ namespace basist assert(sizeof(uint16_t) == output_block_or_pixel_stride_in_bytes); uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint16_t); - const uint32_t max_x = basisu::minimum<int>(4, output_row_pitch_in_blocks_or_pixels - block_x * 4); - const uint32_t max_y = basisu::minimum<int>(4, output_rows_in_pixels - block_y * 4); + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); color32 colors[4]; decoder_etc_block::get_block_colors5(colors, 
pEndpoints->m_color5, pEndpoints->m_inten5); uint16_t packed_colors[4]; for (uint32_t i = 0; i < 4; i++) - packed_colors[i] = colors[i].g >> 4; + { + packed_colors[i] = mul_8(colors[i].g, 15); + if (BASISD_IS_BIG_ENDIAN) + packed_colors[i] = byteswap_uint16(packed_colors[i]); + } for (uint32_t y = 0; y < max_y; y++) { const uint32_t s = pSelector->m_selectors[y]; for (uint32_t x = 0; x < max_x; x++) + { reinterpret_cast<uint16_t*>(pDst_pixels)[x] = packed_colors[(s >> (x * 2)) & 3]; + } pDst_pixels += output_row_pitch_in_blocks_or_pixels * sizeof(uint16_t); } @@ -8903,7 +8708,7 @@ namespace basist if (endpoint_pred_repeat_count != 0) { - BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: endpoint_pred_repeat_count != 0. The file is corrupted or this is a bug\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: endpoint_pred_repeat_count != 0. The file is corrupted or this is a bug\n"); return false; } @@ -8914,7 +8719,7 @@ namespace basist if (fmt == block_format::cPVRTC1_4_RGB) fixup_pvrtc1_4_modulation_rgb((decoder_etc_block*)pPVRTC_work_mem, pPVRTC_endpoints, pDst_blocks, num_blocks_x, num_blocks_y); else if (fmt == block_format::cPVRTC1_4_RGBA) - fixup_pvrtc1_4_modulation_rgba((decoder_etc_block*)pPVRTC_work_mem, pPVRTC_endpoints, pDst_blocks, num_blocks_x, num_blocks_y, pAlpha_blocks, &m_endpoints[0], &m_selectors[0]); + fixup_pvrtc1_4_modulation_rgba((decoder_etc_block*)pPVRTC_work_mem, pPVRTC_endpoints, pDst_blocks, num_blocks_x, num_blocks_y, pAlpha_blocks, &endpoints[0], &selectors[0]); #endif // BASISD_SUPPORT_PVRTC1 if (pPVRTC_work_mem) @@ -8923,8 +8728,1187 @@ namespace basist return true; } + bool basis_validate_output_buffer_size(transcoder_texture_format target_format, + uint32_t output_blocks_buf_size_in_blocks_or_pixels, + uint32_t orig_width, uint32_t orig_height, + uint32_t output_row_pitch_in_blocks_or_pixels, + uint32_t output_rows_in_pixels, + uint32_t total_slice_blocks) + { + if (basis_transcoder_format_is_uncompressed(target_format)) + { + // Assume the output buffer is orig_width by orig_height + if (!output_row_pitch_in_blocks_or_pixels) + output_row_pitch_in_blocks_or_pixels = orig_width; + + if (!output_rows_in_pixels) + output_rows_in_pixels = orig_height; + + // Now make sure the output buffer is large enough, or we'll overwrite memory. 
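+ // (e.g. transcoding a 640x480 level to an uncompressed format with the default pitch/rows needs at least 640 * 480 = 307200 pixel entries)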
+ if (output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n"); + return false; + } + } + else if (target_format == transcoder_texture_format::cTFFXT1_RGB) + { + const uint32_t num_blocks_fxt1_x = (orig_width + 7) / 8; + const uint32_t num_blocks_fxt1_y = (orig_height + 3) / 4; + const uint32_t total_blocks_fxt1 = num_blocks_fxt1_x * num_blocks_fxt1_y; + + if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n"); + return false; + } + } + else + { + if (output_blocks_buf_size_in_blocks_or_pixels < total_slice_blocks) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < transcode_image\n"); + return false; + } + } + return true; + } + + bool basisu_lowlevel_etc1s_transcoder::transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t rgb_offset, uint32_t rgb_length, uint32_t alpha_offset, uint32_t alpha_length, + uint32_t decode_flags, + bool basis_file_has_alpha_slices, + bool is_video, + uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, + uint32_t output_rows_in_pixels) + { + if (((uint64_t)rgb_offset + rgb_length) > (uint64_t)compressed_data_length) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: source data buffer too small (color)\n"); + return false; + } + + if (alpha_length) + { + if (((uint64_t)alpha_offset + alpha_length) > (uint64_t)compressed_data_length) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: source data buffer too small (alpha)\n"); + return false; + } + } + else + { + assert(!basis_file_has_alpha_slices); + } + + if ((target_format == transcoder_texture_format::cTFPVRTC1_4_RGB) || (target_format == transcoder_texture_format::cTFPVRTC1_4_RGBA)) + { + if ((!basisu::is_pow2(num_blocks_x * 4)) || (!basisu::is_pow2(num_blocks_y * 4))) + { + // PVRTC1 only supports power of 2 dimensions + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: PVRTC1 only supports power of 2 dimensions\n"); + return false; + } + } + + if ((target_format == transcoder_texture_format::cTFPVRTC1_4_RGBA) && (!basis_file_has_alpha_slices)) + { + // Switch to PVRTC1 RGB if the input doesn't have alpha. 
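+ // (the cTFPVRTC1_4_RGBA path below requires an alpha slice, so fall back to the opaque variant)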
+ target_format = transcoder_texture_format::cTFPVRTC1_4_RGB; + } + + const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); + const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; + + if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, total_slice_blocks)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output buffer size too small\n"); + return false; + } + + bool status = false; + + const uint8_t* pData = pCompressed_data + rgb_offset; + uint32_t data_len = rgb_length; + bool is_alpha_slice = false; + + // If the caller wants us to transcode the mip level's alpha data, then use the next slice. + if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) + { + pData = pCompressed_data + alpha_offset; + data_len = alpha_length; + is_alpha_slice = true; + } + + switch (target_format) + { + case transcoder_texture_format::cTFETC1_RGB: + { + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cETC1, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC1 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC1_RGB: + { +#if !BASISD_SUPPORT_DXT1 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: BC1/DXT1 unsupported\n"); + return false; +#else + // status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cBC1, bytes_per_block_or_pixel, true, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC1 failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFBC4_R: + { +#if !BASISD_SUPPORT_DXT5A + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: BC4/DXT5A unsupported\n"); + return false; +#else + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + 
BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC4 failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFPVRTC1_4_RGB: + { +#if !BASISD_SUPPORT_PVRTC1 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: PVRTC1 4 unsupported\n"); + return false; +#else + // output_row_pitch_in_blocks_or_pixels is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?) + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to PVRTC1 4 RGB failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + { +#if !BASISD_SUPPORT_PVRTC1 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: PVRTC1 4 unsupported\n"); + return false; +#else + assert(basis_file_has_alpha_slices); + assert(alpha_length); + + // Temp buffer to hold alpha block endpoint/selector indices + basisu::vector<uint32_t> temp_block_indices(total_slice_blocks); + + // First transcode alpha data to temp buffer + //status = transcode_slice(pData, data_size, slice_index + 1, &temp_block_indices[0], total_slice_blocks, block_format::cIndices, sizeof(uint32_t), decode_flags, pSlice_descs[slice_index].m_num_blocks_x, pState); + status = transcode_slice(&temp_block_indices[0], num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, sizeof(uint32_t), false, is_video, true, level_index, orig_width, orig_height, num_blocks_x, pState, false, nullptr, 0); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to PVRTC1 4 RGBA failed (0)\n"); + } + else + { + // output_row_pitch_in_blocks_or_pixels is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?) + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, &temp_block_indices[0]); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, &temp_block_indices[0], 0); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to PVRTC1 4 RGBA failed (1)\n"); + } + } + + break; +#endif + } + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + { +#if !BASISD_SUPPORT_BC7_MODE5 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: BC7 unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + // We used to support transcoding just alpha to BC7 - but is that useful at all? 
+ + // First transcode the color slice. The cBC7_M5_COLOR transcoder will output opaque mode 5 blocks. + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M5_COLOR, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC7_M5_COLOR, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + + if ((status) && (basis_file_has_alpha_slices)) + { + // Now transcode the alpha slice. The cBC7_M5_ALPHA transcoder will now change the opaque mode 5 blocks to blocks with alpha. + //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M5_ALPHA, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC7_M5_ALPHA, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + } + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC7 failed (0)\n"); + } + + break; +#endif + } + case transcoder_texture_format::cTFETC2_RGBA: + { +#if !BASISD_SUPPORT_ETC2_EAC_A8 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: ETC2 EAC A8 unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + + if (basis_file_has_alpha_slices) + { + // First decode the alpha data + //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_A8, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cETC2_EAC_A8, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + } + else + { + //write_opaque_alpha_blocks(pSlice_descs[slice_index].m_num_blocks_x, pSlice_descs[slice_index].m_num_blocks_y, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_A8, 16, output_row_pitch_in_blocks_or_pixels); + basisu_transcoder::write_opaque_alpha_blocks(num_blocks_x, num_blocks_y, pOutput_blocks, block_format::cETC2_EAC_A8, 16, output_row_pitch_in_blocks_or_pixels); + status = true; + } + + if (status) + { + // Now decode the color data + //status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cETC1, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2 RGB failed\n"); 
+ } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2 A failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFBC3_RGBA: + { +#if !BASISD_SUPPORT_DXT1 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: DXT1 unsupported\n"); + return false; +#elif !BASISD_SUPPORT_DXT5A + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: DXT5A unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + + // First decode the alpha data + if (basis_file_has_alpha_slices) + { + //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + } + else + { + basisu_transcoder::write_opaque_alpha_blocks(num_blocks_x, num_blocks_y, pOutput_blocks, block_format::cBC4, 16, output_row_pitch_in_blocks_or_pixels); + status = true; + } + + if (status) + { + // Now decode the color data. Forbid 3 color blocks, which aren't allowed in BC3. + //status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, 16, decode_flags | cDecodeFlagsBC1ForbidThreeColorBlocks, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC1, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC3 RGB failed\n"); + } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC3 A failed\n"); + } + + break; +#endif + } + case transcoder_texture_format::cTFBC5_RG: + { +#if !BASISD_SUPPORT_DXT5A + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: DXT5A unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + + //bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + // uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + // basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0); + + // Decode the R data (actually the green channel of the color data slice in the basis file) + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC4, 
bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (status) + { + if (basis_file_has_alpha_slices) + { + // Decode the G data (actually the green channel of the alpha data slice in the basis file) + //status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC5 1 failed\n"); + } + } + else + { + basisu_transcoder::write_opaque_alpha_blocks(num_blocks_x, num_blocks_y, (uint8_t*)pOutput_blocks + 8, block_format::cBC4, 16, output_row_pitch_in_blocks_or_pixels); + status = true; + } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC5 channel 0 failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFASTC_4x4_RGBA: + { +#if !BASISD_SUPPORT_ASTC + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: ASTC unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + + if (basis_file_has_alpha_slices) + { + // First decode the alpha data to the output (we're using the output texture as a temp buffer here). + //status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cIndices, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (status) + { + // Now decode the color data and transcode to ASTC. The transcoder function will read the alpha selector data from the output texture as it converts and + // transcode both the alpha and color data at the same time to ASTC. 
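+ // (the boolean astc_transcode_alpha argument below takes the place of the old cDecodeFlagsOutputHasAlphaIndices decode flag)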
+ //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, 16, decode_flags | cDecodeFlagsOutputHasAlphaIndices, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cASTC_4x4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, true, nullptr, output_rows_in_pixels); + } + } + else + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cASTC_4x4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ASTC failed (0)\n"); + } + + break; +#endif + } + case transcoder_texture_format::cTFATC_RGB: + { +#if !BASISD_SUPPORT_ATC + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: ATC unsupported\n"); + return false; +#else + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cATC_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cATC_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ATC_RGB failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFATC_RGBA: + { +#if !BASISD_SUPPORT_ATC + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: ATC unsupported\n"); + return false; +#elif !BASISD_SUPPORT_DXT5A + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: DXT5A unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + + // First decode the alpha data + if (basis_file_has_alpha_slices) + { + //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + } + else + { + basisu_transcoder::write_opaque_alpha_blocks(num_blocks_x, num_blocks_y, pOutput_blocks, block_format::cBC4, 16, output_row_pitch_in_blocks_or_pixels); + status = true; + } + + if (status) + { + //status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cATC_RGB, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + 
status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cATC_RGB, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ATC RGB failed\n"); + } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ATC A failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFPVRTC2_4_RGB: + { +#if !BASISD_SUPPORT_PVRTC2 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: PVRTC2 unsupported\n"); + return false; +#else + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to cPVRTC2_4_RGB failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + { +#if !BASISD_SUPPORT_PVRTC2 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: PVRTC2 unsupported\n"); + return false; +#else + if (basis_file_has_alpha_slices) + { + // First decode the alpha data to the output (we're using the output texture as a temp buffer here). + //status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cIndices, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to failed\n"); + } + else + { + // Now decode the color data and transcode to PVRTC2 RGBA. 
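+ // (as in the ASTC path above, the second pass reads back the alpha indices that were just written into the output buffer)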
+ //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGBA, bytes_per_block_or_pixel, decode_flags | cDecodeFlagsOutputHasAlphaIndices, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC2_4_RGBA, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, true, nullptr, output_rows_in_pixels); + } + } + else + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to cPVRTC2_4_RGBA failed\n"); + } + + break; +#endif + } + case transcoder_texture_format::cTFRGBA32: + { + // Raw 32bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. + + // First decode the alpha data + if (basis_file_has_alpha_slices) + //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cA32, sizeof(uint32_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cA32, sizeof(uint32_t), false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + else + status = true; + + if (status) + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, basis_file_has_alpha_slices ? block_format::cRGB32 : block_format::cRGBA32, sizeof(uint32_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, basis_file_has_alpha_slices ? block_format::cRGB32 : block_format::cRGBA32, sizeof(uint32_t), false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGBA32 RGB failed\n"); + } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGBA32 A failed\n"); + } + + break; + } + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + { + // Raw 16bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. + + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, (fmt == transcoder_texture_format::cTFRGB565) ? 
block_format::cRGB565 : block_format::cBGR565, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, (target_format == transcoder_texture_format::cTFRGB565) ? block_format::cRGB565 : block_format::cBGR565, sizeof(uint16_t), false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGB565 RGB failed\n"); + } + + break; + } + case transcoder_texture_format::cTFRGBA4444: + { + // Raw 16bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. + + // First decode the alpha data + if (basis_file_has_alpha_slices) + //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA4444_ALPHA, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cRGBA4444_ALPHA, sizeof(uint16_t), false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + else + status = true; + + if (status) + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, basis_file_has_alpha_slices ? block_format::cRGBA4444_COLOR : block_format::cRGBA4444_COLOR_OPAQUE, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, basis_file_has_alpha_slices ? 
block_format::cRGBA4444_COLOR : block_format::cRGBA4444_COLOR_OPAQUE, sizeof(uint16_t), false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGBA4444 RGB failed\n"); + } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGBA4444 A failed\n"); + } + + break; + } + case transcoder_texture_format::cTFFXT1_RGB: + { +#if !BASISD_SUPPORT_FXT1 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: FXT1 unsupported\n"); + return false; +#else + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cFXT1_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cFXT1_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to FXT1_RGB failed\n"); + } + break; +#endif + } + case transcoder_texture_format::cTFETC2_EAC_R11: + { +#if !BASISD_SUPPORT_ETC2_EAC_RG11 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: EAC_RG11 unsupported\n"); + return false; +#else + //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2_EAC_R11 failed\n"); + } + + break; +#endif + } + case transcoder_texture_format::cTFETC2_EAC_RG11: + { +#if !BASISD_SUPPORT_ETC2_EAC_RG11 + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: EAC_RG11 unsupported\n"); + return false; +#else + assert(bytes_per_block_or_pixel == 16); + + if (basis_file_has_alpha_slices) + { + // First decode the alpha data to G + //status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + } + else + { + basisu_transcoder::write_opaque_alpha_blocks(num_blocks_x, num_blocks_y, (uint8_t*)pOutput_blocks + 8, block_format::cETC2_EAC_R11, 16, output_row_pitch_in_blocks_or_pixels); + status = true; + } + + if (status) + { + // Now decode the color data to R + //status = transcode_slice(pData, 
data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2_EAC_R11 R failed\n"); + } + } + else + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2_EAC_R11 G failed\n"); + } + + break; +#endif + } + default: + { + assert(0); + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: Invalid fmt\n"); + break; + } + } + + return status; + } + + basisu_lowlevel_uastc_transcoder::basisu_lowlevel_uastc_transcoder() + { + } + + bool basisu_lowlevel_uastc_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags) + { + BASISU_NOTE_UNUSED(pState); + BASISU_NOTE_UNUSED(bc1_allow_threecolor_blocks); + + assert(g_transcoder_initialized); + if (!g_transcoder_initialized) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: Transcoder not globally initialized.\n"); + return false; + } + +#if BASISD_SUPPORT_UASTC + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + + if (!output_row_pitch_in_blocks_or_pixels) + { + if (basis_block_format_is_uncompressed(fmt)) + output_row_pitch_in_blocks_or_pixels = orig_width; + else + { + if (fmt == block_format::cFXT1_RGB) + output_row_pitch_in_blocks_or_pixels = (orig_width + 7) / 8; + else + output_row_pitch_in_blocks_or_pixels = num_blocks_x; + } + } + + if (basis_block_format_is_uncompressed(fmt)) + { + if (!output_rows_in_pixels) + output_rows_in_pixels = orig_height; + } + + uint32_t total_expected_block_bytes = sizeof(uastc_block) * total_blocks; + if (image_data_size < total_expected_block_bytes) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); + return false; + } + + const uastc_block* pSource_block = reinterpret_cast<const uastc_block *>(pImage_data); + + const bool high_quality = (decode_flags & cDecodeFlagsHighQuality) != 0; + const bool from_alpha = has_alpha && (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; + + bool status = false; + if ((fmt == block_format::cPVRTC1_4_RGB) || (fmt == block_format::cPVRTC1_4_RGBA)) + { + if (fmt == block_format::cPVRTC1_4_RGBA) + transcode_uastc_to_pvrtc1_4_rgba((const uastc_block*)pImage_data, pDst_blocks, num_blocks_x, num_blocks_y, high_quality); + else + transcode_uastc_to_pvrtc1_4_rgb((const uastc_block *)pImage_data, pDst_blocks, num_blocks_x, num_blocks_y, high_quality, from_alpha); + } + else + { + for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y) + { + void* pDst_block = (uint8_t*)pDst_blocks + block_y * 
output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes; + + for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t *)pDst_block + output_block_or_pixel_stride_in_bytes) + { + switch (fmt) + { + case block_format::cETC1: + { + if (from_alpha) + status = transcode_uastc_to_etc1(*pSource_block, pDst_block, 3); + else + status = transcode_uastc_to_etc1(*pSource_block, pDst_block); + break; + } + case block_format::cETC2_RGBA: + { + status = transcode_uastc_to_etc2_rgba(*pSource_block, pDst_block); + break; + } + case block_format::cBC1: + { + status = transcode_uastc_to_bc1(*pSource_block, pDst_block, high_quality); + break; + } + case block_format::cBC3: + { + status = transcode_uastc_to_bc3(*pSource_block, pDst_block, high_quality); + break; + } + case block_format::cBC4: + { + if (channel0 < 0) + channel0 = 0; + status = transcode_uastc_to_bc4(*pSource_block, pDst_block, high_quality, channel0); + break; + } + case block_format::cBC5: + { + if (channel0 < 0) + channel0 = 0; + if (channel1 < 0) + channel1 = 3; + status = transcode_uastc_to_bc5(*pSource_block, pDst_block, high_quality, channel0, channel1); + break; + } + case block_format::cBC7: + case block_format::cBC7_M5_COLOR: // for consistently with ETC1S + { + status = transcode_uastc_to_bc7(*pSource_block, pDst_block); + break; + } + case block_format::cASTC_4x4: + { + status = transcode_uastc_to_astc(*pSource_block, pDst_block); + break; + } + case block_format::cETC2_EAC_R11: + { + if (channel0 < 0) + channel0 = 0; + status = transcode_uastc_to_etc2_eac_r11(*pSource_block, pDst_block, high_quality, channel0); + break; + } + case block_format::cETC2_EAC_RG11: + { + if (channel0 < 0) + channel0 = 0; + if (channel1 < 0) + channel1 = 3; + status = transcode_uastc_to_etc2_eac_rg11(*pSource_block, pDst_block, high_quality, channel0, channel1); + break; + } + case block_format::cRGBA32: + { + color32 block_pixels[4][4]; + status = unpack_uastc(*pSource_block, (color32 *)block_pixels, false); + + assert(sizeof(uint32_t) == output_block_or_pixel_stride_in_bytes); + uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t); + + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + const color32& c = block_pixels[y][x]; + + pDst_pixels[0 + 4 * x] = c.r; + pDst_pixels[1 + 4 * x] = c.g; + pDst_pixels[2 + 4 * x] = c.b; + pDst_pixels[3 + 4 * x] = c.a; + } + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * sizeof(uint32_t); + } + + break; + } + case block_format::cRGB565: + case block_format::cBGR565: + { + color32 block_pixels[4][4]; + status = unpack_uastc(*pSource_block, (color32*)block_pixels, false); + + assert(sizeof(uint16_t) == output_block_or_pixel_stride_in_bytes); + uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint16_t); + + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + const color32& c = 
block_pixels[y][x]; + + const uint16_t packed = (fmt == block_format::cRGB565) ? static_cast<uint16_t>((mul_8(c.r, 31) << 11) | (mul_8(c.g, 63) << 5) | mul_8(c.b, 31)) : + static_cast<uint16_t>((mul_8(c.b, 31) << 11) | (mul_8(c.g, 63) << 5) | mul_8(c.r, 31)); + + pDst_pixels[x * 2 + 0] = (uint8_t)(packed & 0xFF); + pDst_pixels[x * 2 + 1] = (uint8_t)((packed >> 8) & 0xFF); + } + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * sizeof(uint16_t); + } + + break; + } + case block_format::cRGBA4444: + { + color32 block_pixels[4][4]; + status = unpack_uastc(*pSource_block, (color32*)block_pixels, false); + + assert(sizeof(uint16_t) == output_block_or_pixel_stride_in_bytes); + uint8_t* pDst_pixels = static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint16_t); + + const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + const color32& c = block_pixels[y][x]; + + const uint16_t packed = static_cast<uint16_t>((mul_8(c.r, 15) << 12) | (mul_8(c.g, 15) << 8) | (mul_8(c.b, 15) << 4) | mul_8(c.a, 15)); + + pDst_pixels[x * 2 + 0] = (uint8_t)(packed & 0xFF); + pDst_pixels[x * 2 + 1] = (uint8_t)((packed >> 8) & 0xFF); + } + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * sizeof(uint16_t); + } + break; + } + default: + assert(0); + break; + + } + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: Transcoder failed to unpack a UASTC block - this is a bug, or the data was corrupted\n"); + return false; + } + + } // block_x + + } // block_y + } + + return true; +#else + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: UASTC is unsupported\n"); + + BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(output_rows_in_pixels); + BASISU_NOTE_UNUSED(output_row_pitch_in_blocks_or_pixels); + BASISU_NOTE_UNUSED(output_block_or_pixel_stride_in_bytes); + BASISU_NOTE_UNUSED(fmt); + BASISU_NOTE_UNUSED(image_data_size); + BASISU_NOTE_UNUSED(pImage_data); + BASISU_NOTE_UNUSED(num_blocks_x); + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(pDst_blocks); + + return false; +#endif + } + + bool basisu_lowlevel_uastc_transcoder::transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags, + bool has_alpha, + bool is_video, + uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, + uint32_t output_rows_in_pixels, + int channel0, int channel1) + { + BASISU_NOTE_UNUSED(is_video); + BASISU_NOTE_UNUSED(level_index); + + if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: source data buffer too small\n"); + return false; + } + + if ((target_format == transcoder_texture_format::cTFPVRTC1_4_RGB) || (target_format == transcoder_texture_format::cTFPVRTC1_4_RGBA)) + { + if ((!basisu::is_pow2(num_blocks_x * 4)) || (!basisu::is_pow2(num_blocks_y 
* 4))) + { + // PVRTC1 only supports power of 2 dimensions + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: PVRTC1 only supports power of 2 dimensions\n"); + return false; + } + } + + if ((target_format == transcoder_texture_format::cTFPVRTC1_4_RGBA) && (!has_alpha)) + { + // Switch to PVRTC1 RGB if the input doesn't have alpha. + target_format = transcoder_texture_format::cTFPVRTC1_4_RGB; + } + + const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); + const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; + + if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, total_slice_blocks)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: output buffer size too small\n"); + return false; + } + + bool status = false; + + // UASTC4x4 + switch (target_format) + { + case transcoder_texture_format::cTFETC1_RGB: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC1, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to ETC1 failed\n"); + } + break; + } + case transcoder_texture_format::cTFETC2_RGBA: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_RGBA, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC2_RGBA, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to ETC2 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC1_RGB: + { + // TODO: ETC1S allows BC1 from alpha channel. That doesn't seem actually useful, though. 
+ //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC1, + bytes_per_block_or_pixel, true, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC1 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC3_RGBA: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC3, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC3, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC3 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC4_R: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, + // nullptr, 0, + // ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC4, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, + ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 
3 : 0); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC4 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC5_RG: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC5, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, + // nullptr, 0, + // 0, 3); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC5, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, + 0, 3); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC5 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC7, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC7 failed\n"); + } + break; + } + case transcoder_texture_format::cTFPVRTC1_4_RGB: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cPVRTC1_4_RGB, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to PVRTC1 RGB 4bpp failed\n"); + } + break; + } + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cPVRTC1_4_RGBA, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to PVRTC1 RGBA 4bpp failed\n"); + } + break; + } + case transcoder_texture_format::cTFASTC_4x4_RGBA: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_4x4, + 
bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to ASTC 4x4 failed\n"); + } + break; + } + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFATC_RGBA: + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->ATC currently unsupported\n"); + return false; + } + case transcoder_texture_format::cTFFXT1_RGB: + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->FXT1 currently unsupported\n"); + return false; + } + case transcoder_texture_format::cTFPVRTC2_4_RGB: + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->PVRTC2 currently unsupported\n"); + return false; + } + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->PVRTC2 currently unsupported\n"); + return false; + } + case transcoder_texture_format::cTFETC2_EAC_R11: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, + // nullptr, 0, + // ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC2_EAC_R11, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, + ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 
3 : 0); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to EAC R11 failed\n"); + } + break; + } + case transcoder_texture_format::cTFETC2_EAC_RG11: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_RG11, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, + // nullptr, 0, + // 0, 3); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC2_EAC_RG11, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, + 0, 3); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to EAC RG11 failed\n"); + } + break; + } + case transcoder_texture_format::cTFRGBA32: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA32, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA32, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGBA32 failed\n"); + } + break; + } + case transcoder_texture_format::cTFRGB565: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGB565, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB565, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGB565 failed\n"); + } + break; + } + case transcoder_texture_format::cTFBGR565: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBGR565, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBGR565, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BGR565 failed\n"); + } + break; + } + case transcoder_texture_format::cTFRGBA4444: + { + //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA4444, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA4444, + bytes_per_block_or_pixel, false, has_alpha,
orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGBA4444 failed\n"); + } + break; + } + default: + { + assert(0); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: Invalid format\n"); + break; + } + } + + return status; + } + basisu_transcoder::basisu_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook) : - m_lowlevel_decoder(pGlobal_sel_codebook) + m_lowlevel_etc1s_decoder(pGlobal_sel_codebook), + m_ready_to_transcode(false) { } @@ -9027,22 +10011,33 @@ namespace basist return false; } - if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) { - if (pHeader->m_total_slices & 1) + if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid alpha basis file\n"); + if (pHeader->m_total_slices & 1) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid alpha .basis file\n"); + return false; + } + } + + // This flag dates back to pre-Basis Universal, when .basis supported full ETC1 too. + if ((pHeader->m_flags & cBASISHeaderFlagETC1S) == 0) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: Invalid .basis file (ETC1S check)\n"); return false; } } - - if ((pHeader->m_flags & cBASISHeaderFlagETC1S) == 0) + else { - // We only support ETC1S in basis universal - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (ETC1S flag check)\n"); - return false; + if ((pHeader->m_flags & cBASISHeaderFlagETC1S) != 0) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: Invalid .basis file (ETC1S check)\n"); + return false; + } } - + if ((pHeader->m_slice_desc_file_ofs >= data_size) || ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices)) ) @@ -9103,6 +10098,19 @@ namespace basist return pHeader->m_total_images; } + basis_tex_format basisu_transcoder::get_tex_format(const void* pData, uint32_t data_size) const + { + if (!validate_header_quick(pData, data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header validation failed\n"); + return basis_tex_format::cETC1S; + } + + const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData); + + return (basis_tex_format)(uint32_t)pHeader->m_tex_format; + } + bool basisu_transcoder::get_image_info(const void* pData, uint32_t data_size, basisu_image_info& image_info, uint32_t image_index) const { if (!validate_header_quick(pData, data_size)) @@ -9145,8 +10153,17 @@ namespace basist image_info.m_image_index = image_index; image_info.m_total_levels = total_levels; - image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; + + image_info.m_alpha_flag = false; + + // For ETC1S, if anything has alpha all images have alpha. For UASTC, we only report alpha when the image actually has alpha. 
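+		// (Editor's note, illustrative sketch only, not part of the upstream patch: because of this distinction,
+		// a caller picking an output format per image should consult basisu_image_info::m_alpha_flag rather than
+		// the file-wide cBASISHeaderFlagHasAlphaSlices flag, e.g.
+		//    basisu_image_info info;
+		//    if (transcoder.get_image_info(pData, data_size, info, image_index))
+		//       fmt = info.m_alpha_flag ? transcoder_texture_format::cTFBC3_RGBA
+		//                               : transcoder_texture_format::cTFBC1_RGB;
+		// where 'transcoder', 'pData', 'data_size', 'image_index' and 'fmt' stand in for the caller's own values.)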
+ if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; + else + image_info.m_alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0; + image_info.m_width = slice_desc.m_num_blocks_x * 4; image_info.m_height = slice_desc.m_num_blocks_y * 4; image_info.m_orig_width = slice_desc.m_orig_width; @@ -9264,7 +10281,13 @@ namespace basist image_info.m_image_index = image_index; image_info.m_level_index = level_index; - image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; + + // For ETC1S, if anything has alpha all images have alpha. For UASTC, we only report alpha when the image actually has alpha. + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; + else + image_info.m_alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0; image_info.m_width = slice_desc.m_num_blocks_x * 4; image_info.m_height = slice_desc.m_num_blocks_y * 4; @@ -9275,6 +10298,21 @@ namespace basist image_info.m_total_blocks = image_info.m_num_blocks_x * image_info.m_num_blocks_y; image_info.m_first_slice_index = slice_index; + image_info.m_rgb_file_ofs = slice_desc.m_file_ofs; + image_info.m_rgb_file_len = slice_desc.m_file_size; + image_info.m_alpha_file_ofs = 0; + image_info.m_alpha_file_len = 0; + + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + { + if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) + { + assert((slice_index + 1) < (int)pHeader->m_total_slices); + image_info.m_alpha_file_ofs = pSlice_descs[slice_index + 1].m_file_ofs; + image_info.m_alpha_file_len = pSlice_descs[slice_index + 1].m_file_size; + } + } + return true; } @@ -9294,14 +10332,20 @@ namespace basist file_info.m_total_header_size = sizeof(basis_file_header) + pHeader->m_total_slices * sizeof(basis_slice_desc); file_info.m_total_selectors = pHeader->m_total_selectors; + file_info.m_selector_codebook_ofs = pHeader->m_selector_cb_file_ofs; file_info.m_selector_codebook_size = pHeader->m_selector_cb_file_size; file_info.m_total_endpoints = pHeader->m_total_endpoints; + file_info.m_endpoint_codebook_ofs = pHeader->m_endpoint_cb_file_ofs; file_info.m_endpoint_codebook_size = pHeader->m_endpoint_cb_file_size; + file_info.m_tables_ofs = pHeader->m_tables_file_ofs; file_info.m_tables_size = pHeader->m_tables_file_size; - file_info.m_etc1s = (pHeader->m_flags & cBASISHeaderFlagETC1S) != 0; + file_info.m_tex_format = static_cast<basis_tex_format>(static_cast<int>(pHeader->m_tex_format)); + + file_info.m_etc1s = (pHeader->m_tex_format == (int)basis_tex_format::cETC1S); + file_info.m_y_flipped = (pHeader->m_flags & cBASISHeaderFlagYFlipped) != 0; file_info.m_has_alpha_slices = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; @@ -9346,7 +10390,7 @@ namespace basist slice_info.m_image_index = pSlice_descs[i].m_image_index; slice_info.m_level_index = pSlice_descs[i].m_level_index; slice_info.m_unpacked_slice_crc16 = pSlice_descs[i].m_slice_data_crc16; - slice_info.m_alpha_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsIsAlphaData) != 0; + slice_info.m_alpha_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsHasAlpha) != 0; slice_info.m_iframe_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsFrameIsIFrame) != 0; if 
(pSlice_descs[i].m_image_index >= pHeader->m_total_images) @@ -9366,15 +10410,9 @@ namespace basist return true; } - - bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size) const + + bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size) { - if (m_lowlevel_decoder.m_endpoints.size()) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: already called start_transcoding\n"); - return true; - } - if (!validate_header_quick(pData, data_size)) { BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: header validation failed\n"); @@ -9382,59 +10420,123 @@ namespace basist } const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData); - const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData); - if (!pHeader->m_endpoint_cb_file_size || !pHeader->m_selector_cb_file_size || !pHeader->m_tables_file_size) + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (0)\n"); - } + if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) + { + m_lowlevel_etc1s_decoder.clear(); + } - if ((pHeader->m_endpoint_cb_file_ofs > data_size) || (pHeader->m_selector_cb_file_ofs > data_size) || (pHeader->m_tables_file_ofs > data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (1)\n"); - return false; - } + if (pHeader->m_flags & cBASISHeaderFlagUsesGlobalCodebook) + { + if (!m_lowlevel_etc1s_decoder.get_global_codebooks()) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: File uses global codebooks, but set_global_codebooks() has not been called\n"); + return false; + } + if (!m_lowlevel_etc1s_decoder.get_global_codebooks()->get_endpoints().size()) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: Global codebooks must be unpacked first by calling start_transcoding()\n"); + return false; + } + if ((m_lowlevel_etc1s_decoder.get_global_codebooks()->get_endpoints().size() != pHeader->m_total_endpoints) || + (m_lowlevel_etc1s_decoder.get_global_codebooks()->get_selectors().size() != pHeader->m_total_selectors)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: Global codebook size mismatch (wrong codebooks for file).\n"); + return false; + } + if (!pHeader->m_tables_file_size) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (2)\n"); + return false; + } + if (pHeader->m_tables_file_ofs > data_size) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (4)\n"); + return false; + } + if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (5)\n"); + return false; + } + } + else + { + if (!pHeader->m_endpoint_cb_file_size || !pHeader->m_selector_cb_file_size || !pHeader->m_tables_file_size) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (0)\n"); + return false; + } - if (pHeader->m_endpoint_cb_file_size > (data_size - pHeader->m_endpoint_cb_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (2)\n"); - return false; - } + if ((pHeader->m_endpoint_cb_file_ofs > data_size) || (pHeader->m_selector_cb_file_ofs > data_size) || (pHeader->m_tables_file_ofs > data_size)) + { + 
BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (1)\n"); + return false; + } - if (pHeader->m_selector_cb_file_size > (data_size - pHeader->m_selector_cb_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); - return false; - } + if (pHeader->m_endpoint_cb_file_size > (data_size - pHeader->m_endpoint_cb_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (2)\n"); + return false; + } - if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); - return false; - } + if (pHeader->m_selector_cb_file_size > (data_size - pHeader->m_selector_cb_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); + return false; + } - if (!m_lowlevel_decoder.decode_palettes( - pHeader->m_total_endpoints, pDataU8 + pHeader->m_endpoint_cb_file_ofs, pHeader->m_endpoint_cb_file_size, - pHeader->m_total_selectors, pDataU8 + pHeader->m_selector_cb_file_ofs, pHeader->m_selector_cb_file_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_palettes failed\n"); - return false; - } + if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); + return false; + } - if (!m_lowlevel_decoder.decode_tables(pDataU8 + pHeader->m_tables_file_ofs, pHeader->m_tables_file_size)) + if (!m_lowlevel_etc1s_decoder.decode_palettes( + pHeader->m_total_endpoints, pDataU8 + pHeader->m_endpoint_cb_file_ofs, pHeader->m_endpoint_cb_file_size, + pHeader->m_total_selectors, pDataU8 + pHeader->m_selector_cb_file_ofs, pHeader->m_selector_cb_file_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_palettes failed\n"); + return false; + } + } + + if (!m_lowlevel_etc1s_decoder.decode_tables(pDataU8 + pHeader->m_tables_file_ofs, pHeader->m_tables_file_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_tables failed\n"); + return false; + } + } + else { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_tables failed\n"); - return false; + // Nothing special to do for UASTC. 
+ if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) + { + m_lowlevel_etc1s_decoder.clear(); + } } + + m_ready_to_transcode = true; + + return true; + } + bool basisu_transcoder::stop_transcoding() + { + m_lowlevel_etc1s_decoder.clear(); + + m_ready_to_transcode = false; + return true; } bool basisu_transcoder::transcode_slice(const void* pData, uint32_t data_size, uint32_t slice_index, void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, block_format fmt, - uint32_t output_block_or_pixel_stride_in_bytes, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, void *pAlpha_blocks, uint32_t output_rows_in_pixels) const + uint32_t output_block_or_pixel_stride_in_bytes, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, void *pAlpha_blocks, uint32_t output_rows_in_pixels, int channel0, int channel1) const { - if (!m_lowlevel_decoder.m_endpoints.size()) + if (!m_ready_to_transcode) { BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: must call start_transcoding first\n"); return false; @@ -9529,16 +10631,26 @@ namespace basist BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: invalid slice_desc.m_file_size, or passed in buffer too small\n"); return false; } - - return m_lowlevel_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, - pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, - fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, - (decode_flags & cDecodeFlagsOutputHasAlphaIndices) != 0, pAlpha_blocks, output_rows_in_pixels); + + if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + { + return m_lowlevel_uastc_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + output_rows_in_pixels, channel0, channel1, decode_flags); + } + else + { + return m_lowlevel_etc1s_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + (decode_flags & cDecodeFlagsOutputHasAlphaIndices) != 0, pAlpha_blocks, output_rows_in_pixels); + } } int basisu_transcoder::find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const { - (void)data_size; + BASISU_NOTE_UNUSED(data_size); const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData); const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData); @@ -9576,9 +10688,16 @@ namespace basist const basis_slice_desc& slice_desc = pSlice_descs[slice_iter]; if ((slice_desc.m_image_index == image_index) && (slice_desc.m_level_index == level_index)) { - const bool slice_alpha = (slice_desc.m_flags & cSliceDescFlagsIsAlphaData) != 0; - if (slice_alpha == alpha_data) + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + { + const bool slice_alpha = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + if (slice_alpha == alpha_data) + return 
slice_iter; + } + else + { return slice_iter; + } } } @@ -9587,12 +10706,16 @@ namespace basist return -1; } - static void write_opaque_alpha_blocks( + void basisu_transcoder::write_opaque_alpha_blocks( uint32_t num_blocks_x, uint32_t num_blocks_y, - void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, block_format fmt, + void* pOutput_blocks, block_format fmt, uint32_t block_stride_in_bytes, uint32_t output_row_pitch_in_blocks_or_pixels) { - BASISU_NOTE_UNUSED(output_blocks_buf_size_in_blocks_or_pixels); + // 'num_blocks_y', 'pOutput_blocks' & 'block_stride_in_bytes' unused + // when disabling BASISD_SUPPORT_ETC2_EAC_A8 *and* BASISD_SUPPORT_DXT5A + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(pOutput_blocks); + BASISU_NOTE_UNUSED(block_stride_in_bytes); if (!output_row_pitch_in_blocks_or_pixels) output_row_pitch_in_blocks_or_pixels = num_blocks_x; @@ -9606,8 +10729,7 @@ namespace basist blk.m_table = 13; // Selectors are all 4's - static const uint8_t s_etc2_eac_a8_sel4[6] = { 0x92, 0x49, 0x24, 0x92, 0x49, 0x24 }; - memcpy(&blk.m_selectors, s_etc2_eac_a8_sel4, sizeof(s_etc2_eac_a8_sel4)); + memcpy(&blk.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); for (uint32_t y = 0; y < num_blocks_y; y++) { @@ -9648,9 +10770,9 @@ namespace basist transcoder_texture_format fmt, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state *pState, uint32_t output_rows_in_pixels) const { - const uint32_t bytes_per_block = basis_get_bytes_per_block(fmt); + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(fmt); - if (!m_lowlevel_decoder.m_endpoints.size()) + if (!m_ready_to_transcode) { BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: must call start_transcoding() first\n"); return false; @@ -9693,37 +10815,40 @@ namespace basist fmt = transcoder_texture_format::cTFPVRTC1_4_RGB; } - if (pSlice_descs[slice_index].m_flags & cSliceDescFlagsIsAlphaData) + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has out of order alpha slice\n"); - - // The first slice shouldn't have alpha data in a properly formed basis file - return false; - } - - if (basis_file_has_alpha_slices) - { - // The alpha data should immediately follow the color data, and have the same resolution. 
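+			// (Editor's note, not part of the upstream patch: for an ETC1S file with alpha, each mip level is
+			// stored as a color/alpha slice pair, e.g. [level0 RGB][level0 A][level1 RGB][level1 A]..., which is
+			// why the checks around here look at slice_index + 1 and require matching block dimensions.)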
- if ((slice_index + 1U) >= pHeader->m_total_slices) + if (pSlice_descs[slice_index].m_flags & cSliceDescFlagsHasAlpha) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice\n"); - // basis file is missing the alpha slice - return false; - } + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has out of order alpha slice\n"); - // Basic sanity checks - if ((pSlice_descs[slice_index + 1].m_flags & cSliceDescFlagsIsAlphaData) == 0) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice (flag check)\n"); - // This slice should have alpha data + // The first slice shouldn't have alpha data in a properly formed basis file return false; } - if ((pSlice_descs[slice_index].m_num_blocks_x != pSlice_descs[slice_index + 1].m_num_blocks_x) || (pSlice_descs[slice_index].m_num_blocks_y != pSlice_descs[slice_index + 1].m_num_blocks_y)) + if (basis_file_has_alpha_slices) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file slice dimensions bad\n"); - // Alpha slice should have been the same res as the color slice - return false; + // The alpha data should immediately follow the color data, and have the same resolution. + if ((slice_index + 1U) >= pHeader->m_total_slices) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice\n"); + // basis file is missing the alpha slice + return false; + } + + // Basic sanity checks + if ((pSlice_descs[slice_index + 1].m_flags & cSliceDescFlagsHasAlpha) == 0) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice (flag check)\n"); + // This slice should have alpha data + return false; + } + + if ((pSlice_descs[slice_index].m_num_blocks_x != pSlice_descs[slice_index + 1].m_num_blocks_x) || (pSlice_descs[slice_index].m_num_blocks_y != pSlice_descs[slice_index + 1].m_num_blocks_y)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file slice dimensions bad\n"); + // Alpha slice should have been the same res as the color slice + return false; + } } } @@ -9735,798 +10860,6745 @@ namespace basist { // The transcoder doesn't write beyond total_slice_blocks, so we need to clear the rest ourselves. // For GL usage, PVRTC1 4bpp image size is (max(width, 8)* max(height, 8) * 4 + 7) / 8. - // However, for KTX and internally in Basis this formula isn't used, it's just ((width+3)/4) * ((height+3)/4) * bytes_per_block. This is all the transcoder actually writes to memory. - memset(static_cast<uint8_t*>(pOutput_blocks) + total_slice_blocks * bytes_per_block, 0, (output_blocks_buf_size_in_blocks_or_pixels - total_slice_blocks) * bytes_per_block); + // However, for KTX and internally in Basis this formula isn't used, it's just ((width+3)/4) * ((height+3)/4) * bytes_per_block_or_pixel. This is all the transcoder actually writes to memory. + memset(static_cast<uint8_t*>(pOutput_blocks) + total_slice_blocks * bytes_per_block_or_pixel, 0, (output_blocks_buf_size_in_blocks_or_pixels - total_slice_blocks) * bytes_per_block_or_pixel); } - + + if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + { + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; + + // Use the container independent image transcode method. 
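+			// (Editor's note, not part of the upstream patch: "container independent" means the low-level decoder
+			// is handed the raw compressed bytes plus the slice's explicit file offset, length, block counts and
+			// original dimensions, so it never parses the .basis header itself and the same call can be driven
+			// from other containers as well.)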
+ status = m_lowlevel_uastc_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + } + else + { + // ETC1S + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; + const basis_slice_desc* pAlpha_slice_desc = basis_file_has_alpha_slices ? &pSlice_descs[slice_index + 1] : nullptr; + + assert((pSlice_desc->m_flags & cSliceDescFlagsHasAlpha) == 0); + + if (pAlpha_slice_desc) + { + // Basic sanity checks + assert((pAlpha_slice_desc->m_flags & cSliceDescFlagsHasAlpha) != 0); + assert(pSlice_desc->m_num_blocks_x == pAlpha_slice_desc->m_num_blocks_x); + assert(pSlice_desc->m_num_blocks_y == pAlpha_slice_desc->m_num_blocks_y); + assert(pSlice_desc->m_level_index == pAlpha_slice_desc->m_level_index); + } + + // Use the container independent image transcode method. + status = m_lowlevel_etc1s_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t *)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + (pAlpha_slice_desc != nullptr) ? (uint32_t)pAlpha_slice_desc->m_file_ofs : 0U, (pAlpha_slice_desc != nullptr) ? (uint32_t)pAlpha_slice_desc->m_file_size : 0U, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + + } // if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Returning false\n"); + } + else + { + //BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Returning true\n"); + } + + return status; + } + + uint32_t basis_get_bytes_per_block_or_pixel(transcoder_texture_format fmt) + { switch (fmt) { case transcoder_texture_format::cTFETC1_RGB: + case transcoder_texture_format::cTFBC1_RGB: + case transcoder_texture_format::cTFBC4_R: + case transcoder_texture_format::cTFPVRTC1_4_RGB: + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case transcoder_texture_format::cTFETC2_EAC_R11: + return 8; + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + case transcoder_texture_format::cTFETC2_RGBA: + case transcoder_texture_format::cTFBC3_RGBA: + case transcoder_texture_format::cTFBC5_RG: + case transcoder_texture_format::cTFASTC_4x4_RGBA: + case transcoder_texture_format::cTFATC_RGBA: + case transcoder_texture_format::cTFFXT1_RGB: + case transcoder_texture_format::cTFETC2_EAC_RG11: + return 16; + case transcoder_texture_format::cTFRGBA32: + return sizeof(uint32_t); + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return sizeof(uint16_t); + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; + } + return 0; + } + + const 
char* basis_get_format_name(transcoder_texture_format fmt) + { + switch (fmt) { - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + case transcoder_texture_format::cTFETC1_RGB: return "ETC1_RGB"; + case transcoder_texture_format::cTFBC1_RGB: return "BC1_RGB"; + case transcoder_texture_format::cTFBC4_R: return "BC4_R"; + case transcoder_texture_format::cTFPVRTC1_4_RGB: return "PVRTC1_4_RGB"; + case transcoder_texture_format::cTFPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; + case transcoder_texture_format::cTFBC7_RGBA: return "BC7_RGBA"; + case transcoder_texture_format::cTFBC7_ALT: return "BC7_RGBA"; + case transcoder_texture_format::cTFETC2_RGBA: return "ETC2_RGBA"; + case transcoder_texture_format::cTFBC3_RGBA: return "BC3_RGBA"; + case transcoder_texture_format::cTFBC5_RG: return "BC5_RG"; + case transcoder_texture_format::cTFASTC_4x4_RGBA: return "ASTC_RGBA"; + case transcoder_texture_format::cTFATC_RGB: return "ATC_RGB"; + case transcoder_texture_format::cTFATC_RGBA: return "ATC_RGBA"; + case transcoder_texture_format::cTFRGBA32: return "RGBA32"; + case transcoder_texture_format::cTFRGB565: return "RGB565"; + case transcoder_texture_format::cTFBGR565: return "BGR565"; + case transcoder_texture_format::cTFRGBA4444: return "RGBA4444"; + case transcoder_texture_format::cTFFXT1_RGB: return "FXT1_RGB"; + case transcoder_texture_format::cTFPVRTC2_4_RGB: return "PVRTC2_4_RGB"; + case transcoder_texture_format::cTFPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; + case transcoder_texture_format::cTFETC2_EAC_R11: return "ETC2_EAC_R11"; + case transcoder_texture_format::cTFETC2_EAC_RG11: return "ETC2_EAC_RG11"; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; + } + return ""; + } - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + const char* basis_get_block_format_name(block_format fmt) + { + switch (fmt) + { + case block_format::cETC1: return "ETC1"; + case block_format::cBC1: return "BC1"; + case block_format::cPVRTC1_4_RGB: return "PVRTC1_4_RGB"; + case block_format::cPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; + case block_format::cBC7: return "BC7"; + case block_format::cETC2_RGBA: return "ETC2_RGBA"; + case block_format::cBC3: return "BC3"; + case block_format::cASTC_4x4: return "ASTC_4x4"; + case block_format::cATC_RGB: return "ATC_RGB"; + case block_format::cRGBA32: return "RGBA32"; + case block_format::cRGB565: return "RGB565"; + case block_format::cBGR565: return "BGR565"; + case block_format::cRGBA4444: return "RGBA4444"; + case block_format::cFXT1_RGB: return "FXT1_RGB"; + case block_format::cPVRTC2_4_RGB: return "PVRTC2_4_RGB"; + case block_format::cPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; + case block_format::cETC2_EAC_R11: return "ETC2_EAC_R11"; + case block_format::cETC2_EAC_RG11: return "ETC2_EAC_RG11"; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; + } + return ""; + } + + const char* basis_get_texture_type_name(basis_texture_type tex_type) + { + switch (tex_type) + { + case cBASISTexType2D: return "2D"; + case cBASISTexType2DArray: return "2D array"; + case cBASISTexTypeCubemapArray: return "cubemap 
array"; + case cBASISTexTypeVideoFrames: return "video"; + case cBASISTexTypeVolume: return "3D"; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_texture_type_name: Invalid tex_type\n"); + break; + } + return ""; + } + + bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt) + { + switch (fmt) + { + case transcoder_texture_format::cTFETC2_RGBA: + case transcoder_texture_format::cTFBC3_RGBA: + case transcoder_texture_format::cTFASTC_4x4_RGBA: + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case transcoder_texture_format::cTFATC_RGBA: + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGBA4444: + return true; + default: + break; + } + return false; + } + + basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt) + { + switch (fmt) + { + case transcoder_texture_format::cTFETC1_RGB: return basisu::texture_format::cETC1; + case transcoder_texture_format::cTFBC1_RGB: return basisu::texture_format::cBC1; + case transcoder_texture_format::cTFBC4_R: return basisu::texture_format::cBC4; + case transcoder_texture_format::cTFPVRTC1_4_RGB: return basisu::texture_format::cPVRTC1_4_RGB; + case transcoder_texture_format::cTFPVRTC1_4_RGBA: return basisu::texture_format::cPVRTC1_4_RGBA; + case transcoder_texture_format::cTFBC7_RGBA: return basisu::texture_format::cBC7; + case transcoder_texture_format::cTFBC7_ALT: return basisu::texture_format::cBC7; + case transcoder_texture_format::cTFETC2_RGBA: return basisu::texture_format::cETC2_RGBA; + case transcoder_texture_format::cTFBC3_RGBA: return basisu::texture_format::cBC3; + case transcoder_texture_format::cTFBC5_RG: return basisu::texture_format::cBC5; + case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC4x4; + case transcoder_texture_format::cTFATC_RGB: return basisu::texture_format::cATC_RGB; + case transcoder_texture_format::cTFATC_RGBA: return basisu::texture_format::cATC_RGBA_INTERPOLATED_ALPHA; + case transcoder_texture_format::cTFRGBA32: return basisu::texture_format::cRGBA32; + case transcoder_texture_format::cTFRGB565: return basisu::texture_format::cRGB565; + case transcoder_texture_format::cTFBGR565: return basisu::texture_format::cBGR565; + case transcoder_texture_format::cTFRGBA4444: return basisu::texture_format::cRGBA4444; + case transcoder_texture_format::cTFFXT1_RGB: return basisu::texture_format::cFXT1_RGB; + case transcoder_texture_format::cTFPVRTC2_4_RGB: return basisu::texture_format::cPVRTC2_4_RGBA; + case transcoder_texture_format::cTFPVRTC2_4_RGBA: return basisu::texture_format::cPVRTC2_4_RGBA; + case transcoder_texture_format::cTFETC2_EAC_R11: return basisu::texture_format::cETC2_R11_EAC; + case transcoder_texture_format::cTFETC2_EAC_RG11: return basisu::texture_format::cETC2_RG11_EAC; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; + } + return basisu::texture_format::cInvalidTextureFormat; + } + + bool basis_transcoder_format_is_uncompressed(transcoder_texture_format tex_type) + { + switch (tex_type) + { + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return true; + default: + break; + } + return false; + } + + bool basis_block_format_is_uncompressed(block_format blk_fmt) + { + 
switch (blk_fmt) + { + case block_format::cRGB32: + case block_format::cRGBA32: + case block_format::cA32: + case block_format::cRGB565: + case block_format::cBGR565: + case block_format::cRGBA4444: + case block_format::cRGBA4444_COLOR: + case block_format::cRGBA4444_ALPHA: + case block_format::cRGBA4444_COLOR_OPAQUE: + return true; + default: + break; + } + return false; + } + + uint32_t basis_get_uncompressed_bytes_per_pixel(transcoder_texture_format fmt) + { + switch (fmt) + { + case transcoder_texture_format::cTFRGBA32: + return sizeof(uint32_t); + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return sizeof(uint16_t); + default: + break; + } + return 0; + } + + uint32_t basis_get_block_width(transcoder_texture_format tex_type) + { + switch (tex_type) + { + case transcoder_texture_format::cTFFXT1_RGB: + return 8; + default: + break; + } + return 4; + } + + uint32_t basis_get_block_height(transcoder_texture_format tex_type) + { + BASISU_NOTE_UNUSED(tex_type); + return 4; + } + + bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt) + { + if (fmt == basis_tex_format::cUASTC4x4) + { +#if BASISD_SUPPORT_UASTC + switch (tex_type) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC1 failed\n"); + // These niche formats aren't currently supported for UASTC - everything else is. + case transcoder_texture_format::cTFPVRTC2_4_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFATC_RGBA: + case transcoder_texture_format::cTFFXT1_RGB: + return false; + default: + return true; } - break; +#endif } - case transcoder_texture_format::cTFBC1_RGB: + else { -#if !BASISD_SUPPORT_DXT1 - return false; + switch (tex_type) + { + // ETC1 and uncompressed are always supported. 
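+		// (Editor's note, illustrative usage only, not part of the upstream patch: a caller typically pairs this
+		// query with the file's texture format before choosing a target, falling back to an always-available
+		// format when a build configuration disables a transcoder, e.g.
+		//    basis_tex_format bf = transcoder.get_tex_format(pData, data_size);
+		//    transcoder_texture_format tf = transcoder_texture_format::cTFBC7_RGBA;
+		//    if (!basis_is_format_supported(tf, bf))
+		//       tf = transcoder_texture_format::cTFRGBA32;
+		// where 'transcoder', 'pData' and 'data_size' stand in for the caller's own values.)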
+ case transcoder_texture_format::cTFETC1_RGB: + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return true; +#if BASISD_SUPPORT_DXT1 + case transcoder_texture_format::cTFBC1_RGB: + return true; +#endif +#if BASISD_SUPPORT_DXT5A + case transcoder_texture_format::cTFBC4_R: + case transcoder_texture_format::cTFBC5_RG: + return true; +#endif +#if BASISD_SUPPORT_DXT1 && BASISD_SUPPORT_DXT5A + case transcoder_texture_format::cTFBC3_RGBA: + return true; +#endif +#if BASISD_SUPPORT_PVRTC1 + case transcoder_texture_format::cTFPVRTC1_4_RGB: + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + return true; +#endif +#if BASISD_SUPPORT_BC7_MODE5 + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + return true; +#endif +#if BASISD_SUPPORT_ETC2_EAC_A8 + case transcoder_texture_format::cTFETC2_RGBA: + return true; +#endif +#if BASISD_SUPPORT_ASTC + case transcoder_texture_format::cTFASTC_4x4_RGBA: + return true; +#endif +#if BASISD_SUPPORT_ATC + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFATC_RGBA: + return true; +#endif +#if BASISD_SUPPORT_FXT1 + case transcoder_texture_format::cTFFXT1_RGB: + return true; +#endif +#if BASISD_SUPPORT_PVRTC2 + case transcoder_texture_format::cTFPVRTC2_4_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + return true; #endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; +#if BASISD_SUPPORT_ETC2_EAC_RG11 + case transcoder_texture_format::cTFETC2_EAC_R11: + case transcoder_texture_format::cTFETC2_EAC_RG11: + return true; +#endif + default: + break; + } + } - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + return false; + } + + // ------------------------------------------------------------------------------------------------------ + // UASTC + // ------------------------------------------------------------------------------------------------------ + +#if BASISD_SUPPORT_UASTC + const astc_bc7_common_partition2_desc g_astc_bc7_common_partitions2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2] = + { + { 0, 28, false }, { 1, 20, false }, { 2, 16, true }, { 3, 29, false }, + { 4, 91, true }, { 5, 9, false }, { 6, 107, true }, { 7, 72, true }, + { 8, 149, false }, { 9, 204, true }, { 10, 50, false }, { 11, 114, true }, + { 12, 496, true }, { 13, 17, true }, { 14, 78, false }, { 15, 39, true }, + { 17, 252, true }, { 18, 828, true }, { 19, 43, false }, { 20, 156, false }, + { 21, 116, false }, { 22, 210, true }, { 23, 476, true }, { 24, 273, false }, + { 25, 684, true }, { 26, 359, false }, { 29, 246, true }, { 32, 195, true }, + { 33, 694, true }, { 52, 524, true } + }; + + const bc73_astc2_common_partition_desc g_bc7_3_astc2_common_partitions[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS] = + { + { 10, 36, 4 }, { 11, 48, 4 }, { 0, 61, 3 }, { 2, 137, 4 }, + { 8, 161, 5 }, { 13, 183, 4 }, { 1, 226, 2 }, { 33, 281, 2 }, + { 40, 302, 3 }, { 20, 307, 4 }, { 21, 479, 0 }, { 58, 495, 3 }, + { 3, 593, 0 }, { 32, 594, 2 }, { 59, 605, 1 }, { 34, 799, 3 }, + { 20, 812, 1 }, { 14, 988, 4 }, { 31, 993, 3 } + 
}; + + const astc_bc7_common_partition3_desc g_astc_bc7_common_partitions3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3] = + { + { 4, 260, 0 }, { 8, 74, 5 }, { 9, 32, 5 }, { 10, 156, 2 }, + { 11, 183, 2 }, { 12, 15, 0 }, { 13, 745, 4 }, { 20, 0, 1 }, + { 35, 335, 1 }, { 36, 902, 5 }, { 57, 254, 0 } + }; + + const uint8_t g_astc_to_bc7_partition_index_perm_tables[6][3] = { { 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 }, { 2, 1, 0 }, { 0, 2, 1 }, { 1, 0, 2 } }; + + const uint8_t g_bc7_to_astc_partition_index_perm_tables[6][3] = { { 0, 1, 2 }, { 2, 0, 1 }, { 1, 2, 0 }, { 2, 1, 0 }, { 0, 2, 1 }, { 1, 0, 2 } }; + + uint32_t bc7_convert_partition_index_3_to_2(uint32_t p, uint32_t k) + { + assert(k < 6); + switch (k >> 1) + { + case 0: + if (p <= 1) + p = 0; + else + p = 1; + break; + case 1: + if (p == 0) + p = 0; + else + p = 1; + break; + case 2: + if ((p == 0) || (p == 2)) + p = 0; + else + p = 1; + break; + } + if (k & 1) + p = 1 - p; + return p; + } + + static const uint8_t g_zero_pattern[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + const uint8_t g_astc_bc7_patterns2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][16] = + { + { 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1 }, { 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1 }, { 1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0 }, { 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1 }, + { 1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,0 }, { 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1 }, { 1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0 }, { 1,1,1,1,1,1,1,0,1,1,0,0,1,0,0,0 }, + { 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1 }, { 1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0 }, + { 1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 }, { 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0 }, + { 1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1 }, { 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0 }, { 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 }, + { 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0 }, { 1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1 }, { 1,0,0,0,1,1,0,0,1,1,0,0,1,1,1,0 }, { 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0 }, + { 1,1,1,1,0,1,1,1,0,1,1,1,0,0,1,1 }, { 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0 }, { 1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1 }, { 1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0 }, + { 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0 }, { 1,0,0,1,0,0,1,1,0,1,1,0,1,1,0,0 } + }; + + const uint8_t g_astc_bc7_patterns3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][16] = + { + { 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2 }, { 1,1,1,1,1,1,1,1,0,0,0,0,2,2,2,2 }, { 1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2 }, { 1,1,1,1,2,2,2,2,0,0,0,0,0,0,0,0 }, + { 1,1,2,0,1,1,2,0,1,1,2,0,1,1,2,0 }, { 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2 }, { 0,2,1,1,0,2,1,1,0,2,1,1,0,2,1,1 }, { 2,0,0,0,2,0,0,0,2,1,1,1,2,1,1,1 }, + { 2,0,1,2,2,0,1,2,2,0,1,2,2,0,1,2 }, { 1,1,1,1,0,0,0,0,2,2,2,2,1,1,1,1 }, { 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2 } + }; + + const uint8_t g_bc7_3_astc2_patterns2[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][16] = + { + { 0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0 }, { 1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1 }, + { 1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1 }, { 0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0 }, { 0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1 }, { 0,1,1,1,0,0,1,1,0,0,1,1,0,0,1,1 }, + { 1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0 }, { 0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,0 }, { 1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0 }, + { 0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0 }, { 1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0 }, + { 1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0 }, { 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0 }, { 1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 
} + }; + + const uint8_t g_astc_bc7_pattern2_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][3] = + { + { 0, 2 }, { 0, 3 }, { 1, 0 }, { 0, 3 }, { 7, 0 }, { 0, 2 }, { 3, 0 }, { 7, 0 }, + { 0, 11 }, { 2, 0 }, { 0, 7 }, { 11, 0 }, { 3, 0 }, { 8, 0 }, { 0, 4 }, { 12, 0 }, + { 1, 0 }, { 8, 0 }, { 0, 1 }, { 0, 2 }, { 0, 4 }, { 8, 0 }, { 1, 0 }, { 0, 2 }, + { 4, 0 }, { 0, 1 }, { 4, 0 }, { 1, 0 }, { 4, 0 }, { 1, 0 } + }; + + const uint8_t g_astc_bc7_pattern3_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][3] = + { + { 0, 8, 10 }, { 8, 0, 12 }, { 4, 0, 12 }, { 8, 0, 4 }, { 3, 0, 2 }, { 0, 1, 3 }, { 0, 2, 1 }, { 1, 9, 0 }, { 1, 2, 0 }, { 4, 0, 8 }, { 0, 6, 2 } + }; + + const uint8_t g_bc7_3_astc2_patterns2_anchors[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][3] = + { + { 0, 4 }, { 0, 2 }, { 2, 0 }, { 0, 7 }, { 8, 0 }, { 0, 1 }, { 0, 3 }, { 0, 1 }, { 2, 0 }, { 0, 1 }, { 0, 8 }, { 2, 0 }, { 0, 1 }, { 0, 7 }, { 12, 0 }, { 2, 0 }, { 9, 0 }, { 0, 2 }, { 4, 0 } + }; + + const uint32_t g_uastc_mode_huff_codes[TOTAL_UASTC_MODES + 1][2] = + { + { 0x1, 4 }, + { 0x35, 6 }, + { 0x1D, 5 }, + { 0x3, 5 }, + + { 0x13, 5 }, + { 0xB, 5 }, + { 0x1B, 5 }, + { 0x7, 5 }, + + { 0x17, 5 }, + { 0xF, 5 }, + { 0x2, 3 }, + { 0x0, 2 }, + + { 0x6, 3 }, + { 0x1F, 5 }, + { 0xD, 5 }, + { 0x5, 7 }, + + { 0x15, 6 }, + { 0x25, 6 }, + { 0x9, 4 }, + { 0x45, 7 } // future expansion + }; + + // If g_uastc_mode_huff_codes[] changes this table must be updated! + static const uint8_t g_uastc_huff_modes[128] = + { + 11,0,10,3,11,15,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,16,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11,17,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,1,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11, + 19,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,16,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11,17,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,1,12,8,11,18,10,6,11,2,12,13 + }; + + const uint8_t g_uastc_mode_weight_bits[TOTAL_UASTC_MODES] = { 4, 2, 3, 2, 2, 3, 2, 2, 0, 2, 4, 2, 3, 1, 2, 4, 2, 2, 5 }; + const uint8_t g_uastc_mode_weight_ranges[TOTAL_UASTC_MODES] = { 8, 2, 5, 2, 2, 5, 2, 2, 0, 2, 8, 2, 5, 0, 2, 8, 2, 2, 11 }; + const uint8_t g_uastc_mode_endpoint_ranges[TOTAL_UASTC_MODES] = { 19, 20, 8, 7, 12, 20, 18, 12, 0, 8, 13, 13, 19, 20, 20, 20, 20, 20, 11 }; + const uint8_t g_uastc_mode_subsets[TOTAL_UASTC_MODES] = { 1, 1, 2, 3, 2, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1 }; + const uint8_t g_uastc_mode_planes[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1 }; + const uint8_t g_uastc_mode_comps[TOTAL_UASTC_MODES] = { 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 3 }; + const uint8_t g_uastc_mode_has_etc1_bias[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1 }; + const uint8_t g_uastc_mode_has_bc1_hint0[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + const uint8_t g_uastc_mode_has_bc1_hint1[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1 }; + const uint8_t g_uastc_mode_cem[TOTAL_UASTC_MODES] = { 8, 8, 8, 8, 8, 8, 8, 8, 0, 12, 12, 12, 12, 12, 12, 4, 4, 4, 8 }; + const uint8_t g_uastc_mode_has_alpha[TOTAL_UASTC_MODES] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + const uint8_t g_uastc_mode_is_la[TOTAL_UASTC_MODES] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0 }; + const uint8_t g_uastc_mode_total_hint_bits[TOTAL_UASTC_MODES] = { 15, 15, 15, 15, 15, 15, 15, 15, 0, 23, 17, 17, 17, 23, 23, 23, 23, 23, 15 }; + + // bits, trits, quints + const int g_astc_bise_range_table[TOTAL_ASTC_RANGES][3] = + { + { 1, 0, 0 }, // 
0-1 0 + { 0, 1, 0 }, // 0-2 1 + { 2, 0, 0 }, // 0-3 2 + { 0, 0, 1 }, // 0-4 3 + + { 1, 1, 0 }, // 0-5 4 + { 3, 0, 0 }, // 0-7 5 + { 1, 0, 1 }, // 0-9 6 + { 2, 1, 0 }, // 0-11 7 + + { 4, 0, 0 }, // 0-15 8 + { 2, 0, 1 }, // 0-19 9 + { 3, 1, 0 }, // 0-23 10 + { 5, 0, 0 }, // 0-31 11 + + { 3, 0, 1 }, // 0-39 12 + { 4, 1, 0 }, // 0-47 13 + { 6, 0, 0 }, // 0-63 14 + { 4, 0, 1 }, // 0-79 15 + + { 5, 1, 0 }, // 0-95 16 + { 7, 0, 0 }, // 0-127 17 + { 5, 0, 1 }, // 0-159 18 + { 6, 1, 0 }, // 0-191 19 + + { 8, 0, 0 }, // 0-255 20 + }; + + int astc_get_levels(int range) + { + assert(range < (int)BC7ENC_TOTAL_ASTC_RANGES); + return (1 + 2 * g_astc_bise_range_table[range][1] + 4 * g_astc_bise_range_table[range][2]) << g_astc_bise_range_table[range][0]; + } + + // g_astc_unquant[] is the inverse of g_astc_sorted_order_unquant[] + astc_quant_bin g_astc_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [ASTC encoded endpoint index] + + // Taken right from the ASTC spec. + static struct + { + const char* m_pB_str; + uint32_t m_c; + } g_astc_endpoint_unquant_params[BC7ENC_TOTAL_ASTC_RANGES] = + { + { "", 0 }, + { "", 0 }, + { "", 0 }, + { "", 0 }, + { "000000000", 204, }, // 0-5 + { "", 0 }, + { "000000000", 113, }, // 0-9 + { "b000b0bb0", 93 }, // 0-11 + { "", 0 }, + { "b0000bb00", 54 }, // 0-19 + { "cb000cbcb", 44 }, // 0-23 + { "", 0 }, + { "cb0000cbc", 26 }, // 0-39 + { "dcb000dcb", 22 }, // 0-47 + { "", 0 }, + { "dcb0000dc", 13 }, // 0-79 + { "edcb000ed", 11 }, // 0-95 + { "", 0 }, + { "edcb0000e", 6 }, // 0-159 + { "fedcb000f", 5 }, // 0-191 + { "", 0 }, + }; + + bool astc_is_valid_endpoint_range(uint32_t range) + { + if ((g_astc_bise_range_table[range][1] == 0) && (g_astc_bise_range_table[range][2] == 0)) + return true; + + return g_astc_endpoint_unquant_params[range].m_c != 0; + } + + uint32_t unquant_astc_endpoint(uint32_t packed_bits, uint32_t packed_trits, uint32_t packed_quints, uint32_t range) + { + assert(range < BC7ENC_TOTAL_ASTC_RANGES); + + const uint32_t bits = g_astc_bise_range_table[range][0]; + const uint32_t trits = g_astc_bise_range_table[range][1]; + const uint32_t quints = g_astc_bise_range_table[range][2]; + + uint32_t val = 0; + if ((!trits) && (!quints)) + { + assert(!packed_trits && !packed_quints); + + int bits_left = 8; + while (bits_left > 0) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC1 failed\n"); + uint32_t v = packed_bits; + + int n = basisu::minimumi(bits_left, bits); + if (n < (int)bits) + v >>= (bits - n); + + assert(v < (1U << n)); + + val |= (v << (bits_left - n)); + bits_left -= n; } - break; } - case transcoder_texture_format::cTFBC4_R: + else { -#if !BASISD_SUPPORT_DXT5A - return false; -#endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + const uint32_t A = (packed_bits & 1) ? 511 : 0; + const uint32_t C = g_astc_endpoint_unquant_params[range].m_c; + const uint32_t D = trits ? 
packed_trits : packed_quints; - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + assert(C); + + uint32_t B = 0; + for (uint32_t i = 0; i < 9; i++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC4 failed\n"); + B <<= 1; + + char c = g_astc_endpoint_unquant_params[range].m_pB_str[i]; + if (c != '0') + { + c -= 'a'; + B |= ((packed_bits >> c) & 1); + } } - break; + + val = D * C + B; + val = val ^ A; + val = (A & 0x80) | (val >> 2); } - case transcoder_texture_format::cTFPVRTC1_4_RGB: + + return val; + } + + uint32_t unquant_astc_endpoint_val(uint32_t packed_val, uint32_t range) + { + assert(range < BC7ENC_TOTAL_ASTC_RANGES); + assert(packed_val < (uint32_t)astc_get_levels(range)); + + const uint32_t bits = g_astc_bise_range_table[range][0]; + const uint32_t trits = g_astc_bise_range_table[range][1]; + const uint32_t quints = g_astc_bise_range_table[range][2]; + + if ((!trits) && (!quints)) + return unquant_astc_endpoint(packed_val, 0, 0, range); + else if (trits) + return unquant_astc_endpoint(packed_val & ((1 << bits) - 1), packed_val >> bits, 0, range); + else + return unquant_astc_endpoint(packed_val & ((1 << bits) - 1), 0, packed_val >> bits, range); + } + + // BC7 - Various BC7 tables/helpers + const uint32_t g_bc7_weights1[2] = { 0, 64 }; + const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 }; + const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; + const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + const uint32_t g_astc_weights4[16] = { 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }; + const uint32_t g_astc_weights5[32] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64 }; + const uint32_t g_astc_weights_3levels[3] = { 0, 32, 64 }; + + const uint8_t g_bc7_partition1[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + + const uint8_t g_bc7_partition2[64 * 16] = + { + 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, + 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, + 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, + 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 
0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, + 0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, + 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 + }; + + const uint8_t g_bc7_partition3[64 * 16] = + { + 0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1, + 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0, + 0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2, 0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0, 0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2, 0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1, 0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2, 0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1, 0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2, 0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0, + 0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0, 0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2, 0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0, 0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1, 0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2, 0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2, 0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1, 0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1, + 0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2, 0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1, 0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2, 0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0, 0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0, 0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0, 0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0, 0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1, + 0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1, 0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1, 0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2, 0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1, 0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1, 0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1, 0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1, + 0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2, 0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1, 0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2, 0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2, 0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2, 0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2, 0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2, + 0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0, + }; + + const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 }; + + const uint8_t g_bc7_table_anchor_index_third_subset_1[64] = + { + 3, 3,15,15, 8, 3,15,15, 8, 8, 6, 6, 6, 5, 3, 3, 3, 3, 8,15, 3, 3, 6,10, 5, 8, 8, 6, 8, 5,15,15, 8,15, 3, 5, 6,10, 8,15, 15, 3,15, 5,15,15,15,15, 3,15, 5, 5, 5, 8, 5,10, 5,10, 8,13,15,12, 3, 3 + }; + + const uint8_t g_bc7_table_anchor_index_third_subset_2[64] = + { + 15, 
8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8 + }; + + const uint8_t g_bc7_num_subsets[8] = { 3, 2, 3, 2, 1, 1, 1, 2 }; + const uint8_t g_bc7_partition_bits[8] = { 4, 6, 6, 6, 0, 0, 0, 6 }; + const uint8_t g_bc7_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 }; + + const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 }; + const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 }; + const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 }; + const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 }; + + const uint8_t g_bc7_alpha_index_bitcount[8] = { 0, 0, 0, 0, 3, 2, 4, 2 }; + + endpoint_err g_bc7_mode_6_optimal_endpoints[256][2]; // [c][pbit] + endpoint_err g_bc7_mode_5_optimal_endpoints[256]; // [c] + + static inline void bc7_set_block_bits(uint8_t* pBytes, uint32_t val, uint32_t num_bits, uint32_t* pCur_ofs) + { + assert((num_bits <= 32) && (val < (1ULL << num_bits))); + while (num_bits) { -#if !BASISD_SUPPORT_PVRTC1 - return false; -#endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + const uint32_t n = basisu::minimumu(8 - (*pCur_ofs & 7), num_bits); + pBytes[*pCur_ofs >> 3] |= (uint8_t)(val << (*pCur_ofs & 7)); + val >>= n; + num_bits -= n; + *pCur_ofs += n; + } + assert(*pCur_ofs <= 128); + } - // output_row_pitch_in_blocks_or_pixels is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?) - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGB, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + // TODO: Optimize this. + void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults) + { + const uint32_t best_mode = pResults->m_mode; + + const uint32_t total_subsets = g_bc7_num_subsets[best_mode]; + const uint32_t total_partitions = 1 << g_bc7_partition_bits[best_mode]; + //const uint32_t num_rotations = 1 << g_bc7_rotation_bits[best_mode]; + //const uint32_t num_index_selectors = (best_mode == 4) ? 
2 : 1; + + const uint8_t* pPartition; + if (total_subsets == 1) + pPartition = &g_bc7_partition1[0]; + else if (total_subsets == 2) + pPartition = &g_bc7_partition2[pResults->m_partition * 16]; + else + pPartition = &g_bc7_partition3[pResults->m_partition * 16]; + + uint8_t color_selectors[16]; + memcpy(color_selectors, pResults->m_selectors, 16); + + uint8_t alpha_selectors[16]; + memcpy(alpha_selectors, pResults->m_alpha_selectors, 16); + + color_quad_u8 low[3], high[3]; + memcpy(low, pResults->m_low, sizeof(low)); + memcpy(high, pResults->m_high, sizeof(high)); + + uint32_t pbits[3][2]; + memcpy(pbits, pResults->m_pbits, sizeof(pbits)); + + int anchor[3] = { -1, -1, -1 }; + + for (uint32_t k = 0; k < total_subsets; k++) + { + uint32_t anchor_index = 0; + if (k) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to PVRTC1 4 RGB failed\n"); + if ((total_subsets == 3) && (k == 1)) + anchor_index = g_bc7_table_anchor_index_third_subset_1[pResults->m_partition]; + else if ((total_subsets == 3) && (k == 2)) + anchor_index = g_bc7_table_anchor_index_third_subset_2[pResults->m_partition]; + else + anchor_index = g_bc7_table_anchor_index_second_subset[pResults->m_partition]; + } + + anchor[k] = anchor_index; + + const uint32_t color_index_bits = get_bc7_color_index_size(best_mode, pResults->m_index_selector); + const uint32_t num_color_indices = 1 << color_index_bits; + + if (color_selectors[anchor_index] & (num_color_indices >> 1)) + { + for (uint32_t i = 0; i < 16; i++) + if (pPartition[i] == k) + color_selectors[i] = (uint8_t)((num_color_indices - 1) - color_selectors[i]); + + if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + { + for (uint32_t q = 0; q < 3; q++) + { + uint8_t t = low[k].m_c[q]; + low[k].m_c[q] = high[k].m_c[q]; + high[k].m_c[q] = t; + } + } + else + { + color_quad_u8 tmp = low[k]; + low[k] = high[k]; + high[k] = tmp; + } + + if (!g_bc7_mode_has_shared_p_bits[best_mode]) + { + uint32_t t = pbits[k][0]; + pbits[k][0] = pbits[k][1]; + pbits[k][1] = t; + } + } + + if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + { + const uint32_t alpha_index_bits = get_bc7_alpha_index_size(best_mode, pResults->m_index_selector); + const uint32_t num_alpha_indices = 1 << alpha_index_bits; + + if (alpha_selectors[anchor_index] & (num_alpha_indices >> 1)) + { + for (uint32_t i = 0; i < 16; i++) + if (pPartition[i] == k) + alpha_selectors[i] = (uint8_t)((num_alpha_indices - 1) - alpha_selectors[i]); + + uint8_t t = low[k].m_c[3]; + low[k].m_c[3] = high[k].m_c[3]; + high[k].m_c[3] = t; + } } - break; } - case transcoder_texture_format::cTFPVRTC1_4_RGBA: + + uint8_t* pBlock_bytes = (uint8_t*)(pBlock); + memset(pBlock_bytes, 0, BC7ENC_BLOCK_SIZE); + + uint32_t cur_bit_ofs = 0; + bc7_set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs); + + if ((best_mode == 4) || (best_mode == 5)) + bc7_set_block_bits(pBlock_bytes, pResults->m_rotation, 2, &cur_bit_ofs); + + if (best_mode == 4) + bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector, 1, &cur_bit_ofs); + + if (total_partitions > 1) + bc7_set_block_bits(pBlock_bytes, pResults->m_partition, (total_partitions == 64) ? 6 : 4, &cur_bit_ofs); + + const uint32_t total_comps = (best_mode >= 4) ? 4 : 3; + for (uint32_t comp = 0; comp < total_comps; comp++) { -#if !BASISD_SUPPORT_PVRTC1 - return false; -#endif - assert(basis_file_has_alpha_slices); + for (uint32_t subset = 0; subset < total_subsets; subset++) + { + bc7_set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? 
g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + bc7_set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + } + } - // Temp buffer to hold alpha block endpoint/selector indices - std::vector<uint32_t> temp_block_indices(total_slice_blocks); + if (g_bc7_mode_has_p_bits[best_mode]) + { + for (uint32_t subset = 0; subset < total_subsets; subset++) + { + bc7_set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs); + if (!g_bc7_mode_has_shared_p_bits[best_mode]) + bc7_set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs); + } + } - // First transcode alpha data to temp buffer - status = transcode_slice(pData, data_size, slice_index + 1, &temp_block_indices[0], total_slice_blocks, block_format::cIndices, sizeof(uint32_t), decode_flags, pSlice_descs[slice_index].m_num_blocks_x, pState); - if (!status) + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to PVRTC1 4 RGBA failed (0)\n"); + int idx = x + y * 4; + + uint32_t n = pResults->m_index_selector ? get_bc7_alpha_index_size(best_mode, pResults->m_index_selector) : get_bc7_color_index_size(best_mode, pResults->m_index_selector); + + if ((idx == anchor[0]) || (idx == anchor[1]) || (idx == anchor[2])) + n--; + + bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector ? alpha_selectors[idx] : color_selectors[idx], n, &cur_bit_ofs); } - else + } + + if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + { + for (uint32_t y = 0; y < 4; y++) { - // output_row_pitch_in_blocks_or_pixels is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?) - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGBA, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, &temp_block_indices[0]); - if (!status) + for (uint32_t x = 0; x < 4; x++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to PVRTC1 4 RGBA failed (1)\n"); + int idx = x + y * 4; + + uint32_t n = pResults->m_index_selector ? get_bc7_color_index_size(best_mode, pResults->m_index_selector) : get_bc7_alpha_index_size(best_mode, pResults->m_index_selector); + + if ((idx == anchor[0]) || (idx == anchor[1]) || (idx == anchor[2])) + n--; + + bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector ? 
color_selectors[idx] : alpha_selectors[idx], n, &cur_bit_ofs); } } + } - break; + assert(cur_bit_ofs == 128); + } + + // ASTC + static inline void astc_set_bits_1_to_9(uint32_t* pDst, int& bit_offset, uint32_t code, uint32_t codesize) + { + uint8_t* pBuf = reinterpret_cast<uint8_t*>(pDst); + + assert(codesize <= 9); + if (codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t val = code << byte_bit_offset; + + uint32_t index = bit_offset >> 3; + pBuf[index] |= (uint8_t)val; + + if (codesize > (8 - byte_bit_offset)) + pBuf[index + 1] |= (uint8_t)(val >> 8); + + bit_offset += codesize; } - case transcoder_texture_format::cTFBC7_M6_RGB: + } + + void pack_astc_solid_block(void* pDst_block, const color32& color) + { + uint32_t r = color[0], g = color[1], b = color[2]; + uint32_t a = color[3]; + + uint32_t* pOutput = static_cast<uint32_t*>(pDst_block); + uint8_t* pBytes = reinterpret_cast<uint8_t*>(pDst_block); + + pBytes[0] = 0xfc; pBytes[1] = 0xfd; pBytes[2] = 0xff; pBytes[3] = 0xff; + + pOutput[1] = 0xffffffff; + pOutput[2] = 0; + pOutput[3] = 0; + + int bit_pos = 64; + astc_set_bits(reinterpret_cast<uint32_t*>(pDst_block), bit_pos, r | (r << 8), 16); + astc_set_bits(reinterpret_cast<uint32_t*>(pDst_block), bit_pos, g | (g << 8), 16); + astc_set_bits(reinterpret_cast<uint32_t*>(pDst_block), bit_pos, b | (b << 8), 16); + astc_set_bits(reinterpret_cast<uint32_t*>(pDst_block), bit_pos, a | (a << 8), 16); + } + + // See 23.21 https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_partition_pattern_generation +#ifdef _DEBUG + static inline uint32_t astc_hash52(uint32_t v) + { + uint32_t p = v; + p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; + p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; + p ^= p << 6; p ^= p >> 17; + return p; + } + + int astc_compute_texel_partition(int seed, int x, int y, int z, int partitioncount, bool small_block) + { + if (small_block) + { + x <<= 1; y <<= 1; z <<= 1; + } + seed += (partitioncount - 1) * 1024; + uint32_t rnum = astc_hash52(seed); + uint8_t seed1 = rnum & 0xF; + uint8_t seed2 = (rnum >> 4) & 0xF; + uint8_t seed3 = (rnum >> 8) & 0xF; + uint8_t seed4 = (rnum >> 12) & 0xF; + uint8_t seed5 = (rnum >> 16) & 0xF; + uint8_t seed6 = (rnum >> 20) & 0xF; + uint8_t seed7 = (rnum >> 24) & 0xF; + uint8_t seed8 = (rnum >> 28) & 0xF; + uint8_t seed9 = (rnum >> 18) & 0xF; + uint8_t seed10 = (rnum >> 22) & 0xF; + uint8_t seed11 = (rnum >> 26) & 0xF; + uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF; + + seed1 *= seed1; seed2 *= seed2; + seed3 *= seed3; seed4 *= seed4; + seed5 *= seed5; seed6 *= seed6; + seed7 *= seed7; seed8 *= seed8; + seed9 *= seed9; seed10 *= seed10; + seed11 *= seed11; seed12 *= seed12; + + int sh1, sh2, sh3; + if (seed & 1) + { + sh1 = (seed & 2 ? 4 : 5); sh2 = (partitioncount == 3 ? 6 : 5); + } + else { -#if !BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - return false; + sh1 = (partitioncount == 3 ? 6 : 5); sh2 = (seed & 2 ? 4 : 5); + } + sh3 = (seed & 0x10) ? 
sh1 : sh2; + + seed1 >>= sh1; seed2 >>= sh2; seed3 >>= sh1; seed4 >>= sh2; + seed5 >>= sh1; seed6 >>= sh2; seed7 >>= sh1; seed8 >>= sh2; + seed9 >>= sh3; seed10 >>= sh3; seed11 >>= sh3; seed12 >>= sh3; + + int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + + a &= 0x3F; b &= 0x3F; c &= 0x3F; d &= 0x3F; + + if (partitioncount < 4) d = 0; + if (partitioncount < 3) c = 0; + + if (a >= b && a >= c && a >= d) + return 0; + else if (b >= c && b >= d) + return 1; + else if (c >= d) + return 2; + else + return 3; + } #endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M6_OPAQUE_ONLY, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + static const uint8_t g_astc_quint_encode[125] = + { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57, + 58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104, + 105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54, + 126, 127, 94, 95, 62, 39, 47, 55, 63, 31 + }; + + // Encodes 3 values to output, usable for any range that uses quints and bits + static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n) + { + // First extract the trits and the bits from the 5 input values + int quints = 0, bits[3]; + const uint32_t bit_mask = (1 << n) - 1; + for (int i = 0; i < 3; i++) + { + static const int s_muls[3] = { 1, 5, 25 }; + + const int t = pValues[i] >> n; + + quints += t * s_muls[i]; + bits[i] = pValues[i] & bit_mask; + } + + // Encode the quints, by inverting the bit manipulations done by the decoder, converting 3 quints into 7-bits. + // See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding + + assert(quints < 125); + const int T = g_astc_quint_encode[quints]; + + // Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96. + astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) | + (bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3); + } + + // Packs values using ASTC's BISE to output buffer. + static void astc_pack_bise(uint32_t* pDst, const uint8_t* pSrc_vals, int bit_pos, int num_vals, int range) + { + uint32_t temp[5] = { 0, 0, 0, 0, 0 }; + + const int num_bits = g_astc_bise_range_table[range][0]; + + int group_size = 0; + if (g_astc_bise_range_table[range][1]) + group_size = 5; + else if (g_astc_bise_range_table[range][2]) + group_size = 3; + + if (group_size) + { + // Range has trits or quints - pack each group of 5 or 3 values + const int total_groups = (group_size == 5) ? 
((num_vals + 4) / 5) : ((num_vals + 2) / 3); + + for (int group_index = 0; group_index < total_groups; group_index++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC7 m6 opaque only failed\n"); + uint8_t vals[5] = { 0, 0, 0, 0, 0 }; + + const int limit = basisu::minimum(group_size, num_vals - group_index * group_size); + for (int i = 0; i < limit; i++) + vals[i] = pSrc_vals[group_index * group_size + i]; + + if (group_size == 5) + astc_encode_trits(temp, vals, bit_pos, num_bits); + else + astc_encode_quints(temp, vals, bit_pos, num_bits); + } + } + else + { + for (int i = 0; i < num_vals; i++) + astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits); + } + + pDst[0] |= temp[0]; pDst[1] |= temp[1]; + pDst[2] |= temp[2]; pDst[3] |= temp[3]; + } + + const uint32_t ASTC_BLOCK_MODE_BITS = 11; + const uint32_t ASTC_PART_BITS = 2; + const uint32_t ASTC_CEM_BITS = 4; + const uint32_t ASTC_PARTITION_INDEX_BITS = 10; + const uint32_t ASTC_CCS_BITS = 2; + + const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 }; + + bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode) + { + assert(uastc_mode < TOTAL_UASTC_MODES); + uint8_t* pDst_bytes = reinterpret_cast<uint8_t*>(pDst); + + const int total_weights = pBlock->m_dual_plane ? 32 : 16; + + // Set mode bits - see Table 146-147 + uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode]; + pDst_bytes[0] = (uint8_t)mode; + pDst_bytes[1] = (uint8_t)(mode >> 8); + + memset(pDst_bytes + 2, 0, 16 - 2); + + int bit_pos = ASTC_BLOCK_MODE_BITS; + + // We only support 1-5 bit weight indices + assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]); + const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0]; + + // See table 143 - PART + astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS); + + if (pBlock->m_subsets == 1) + astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS); + else + { + // See table 145 + astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS); + + // Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM + astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2); + } + + if (pBlock->m_dual_plane) + { + const int total_weight_bits = total_weights * bits_per_weight; + + // See Illegal Encodings 23.24 + // https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings + assert((total_weight_bits >= 24) && (total_weight_bits <= 96)); + + int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS; + astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS); + } + + const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets; + assert(num_cem_pairs <= 9); + + astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]); + + // Write the weight bits in reverse bit order. 
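// Illustrative sketch (hypothetical helpers, not part of the upstream sources): the
// specialized cases in the switch that follows all perform the same packing step.
// Each N-bit weight is bit-reversed and stored from the top of the 128-bit ASTC block
// downward, so weight i lands at bit offset 128 - N * (i + 1); the s_reverse_bits2/3/4/5
// tables below are just precomputed forms of reverse_n_bits().
#include <cstdint>
#include <cassert>

static inline uint32_t reverse_n_bits(uint32_t v, uint32_t n)
{
	uint32_t r = 0;
	for (uint32_t b = 0; b < n; b++)
		r |= ((v >> b) & 1u) << (n - 1u - b);
	return r;
}

static void pack_weights_reversed(uint8_t block_bytes[16], const uint8_t* pWeights, int total_weights, uint32_t n)
{
	for (int i = 0; i < total_weights; i++)
	{
		const uint32_t ofs = 128 - n * (uint32_t)(i + 1); // same offset as the unrolled cases
		const uint32_t rev = reverse_n_bits(pWeights[i], n) << (ofs & 7);

		uint32_t index = ofs >> 3;
		assert(index < 16);
		block_bytes[index] |= (uint8_t)(rev & 0xFF);
		if ((index + 1) < 16)
			block_bytes[index + 1] |= (uint8_t)(rev >> 8);
	}
}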
+ switch (bits_per_weight) + { + case 1: + { + const uint32_t N = 1; + for (int i = 0; i < total_weights; i++) + { + const uint32_t ofs = 128 - N - i; + assert((ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (pBlock->m_weights[i] << (ofs & 7)); } break; } - case transcoder_texture_format::cTFBC7_M5_RGBA: + case 2: { -#if !BASISD_SUPPORT_BC7_MODE5 - return false; -#else - assert(bytes_per_block == 16); + const uint32_t N = 2; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 }; + const uint32_t ofs = 128 - N - (i * N); + assert((ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlock->m_weights[i]] << (ofs & 7)); + } + break; + } + case 3: + { + const uint32_t N = 3; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits3[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; - // First transcode the color slice. The cBC7_M5_COLOR transcoder will output opaque mode 5 blocks. - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M5_COLOR, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + const uint32_t ofs = 128 - N - (i * N); + const uint32_t rev = s_reverse_bits3[pBlock->m_weights[i]] << (ofs & 7); - if ((status) && (basis_file_has_alpha_slices)) + uint32_t index = ofs >> 3; + assert(index < 16); + pDst_bytes[index++] |= rev & 0xFF; + if (index < 16) + pDst_bytes[index++] |= (rev >> 8); + } + break; + } + case 4: + { + const uint32_t N = 4; + for (int i = 0; i < total_weights; i++) { - // Now transcode the alpha slice. The cBC7_M5_ALPHA transcoder will now change the opaque mode 5 blocks to blocks with alpha. - status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M5_ALPHA, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + const int ofs = 128 - N - (i * N); + assert(ofs >= 0 && (ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlock->m_weights[i]] << (ofs & 7)); + } + break; + } + case 5: + { + const uint32_t N = 5; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits5[32] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 }; + + const uint32_t ofs = 128 - N - (i * N); + const uint32_t rev = s_reverse_bits5[pBlock->m_weights[i]] << (ofs & 7); + + uint32_t index = ofs >> 3; + assert(index < 16); + pDst_bytes[index++] |= rev & 0xFF; + if (index < 16) + pDst_bytes[index++] |= (rev >> 8); } break; -#endif } - case transcoder_texture_format::cTFETC2_RGBA: + default: + assert(0); + break; + } + + return true; + } + + const uint8_t* get_anchor_indices(uint32_t subsets, uint32_t mode, uint32_t common_pattern, const uint8_t*& pPartition_pattern) + { + const uint8_t* pSubset_anchor_indices = g_zero_pattern; + pPartition_pattern = g_zero_pattern; + + if (subsets >= 2) { -#if !BASISD_SUPPORT_ETC2_EAC_A8 - return false; + if (subsets == 3) + { + pPartition_pattern = &g_astc_bc7_patterns3[common_pattern][0]; + pSubset_anchor_indices = &g_astc_bc7_pattern3_anchors[common_pattern][0]; + } + else if (mode == 7) + { + pPartition_pattern = &g_bc7_3_astc2_patterns2[common_pattern][0]; + pSubset_anchor_indices = &g_bc7_3_astc2_patterns2_anchors[common_pattern][0]; + } + else + { + pPartition_pattern = 
&g_astc_bc7_patterns2[common_pattern][0]; + pSubset_anchor_indices = &g_astc_bc7_pattern2_anchors[common_pattern][0]; + } + } + + return pSubset_anchor_indices; + } + + static inline uint32_t read_bit(const uint8_t* pBuf, uint32_t& bit_offset) + { + uint32_t byte_bits = pBuf[bit_offset >> 3] >> (bit_offset & 7); + bit_offset += 1; + return byte_bits & 1; + } + + static inline uint32_t read_bits1_to_9(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 9); + if (!codesize) + return 0; + + if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS) || (bit_offset >= 112)) + { + const uint8_t* pBytes = &pBuf[bit_offset >> 3U]; + + uint32_t byte_bit_offset = bit_offset & 7U; + + uint32_t bits = pBytes[0] >> byte_bit_offset; + uint32_t bits_read = basisu::minimum<int>(codesize, 8 - byte_bit_offset); + + uint32_t bits_remaining = codesize - bits_read; + if (bits_remaining) + bits |= ((uint32_t)pBytes[1]) << bits_read; + + bit_offset += codesize; + + return bits & ((1U << codesize) - 1U); + } + + uint32_t byte_bit_offset = bit_offset & 7U; + const uint16_t w = *(const uint16_t *)(&pBuf[bit_offset >> 3U]); + bit_offset += codesize; + return (w >> byte_bit_offset) & ((1U << codesize) - 1U); + } + + inline uint64_t read_bits64(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 64U); + uint64_t bits = 0; + uint32_t total_bits = 0; + + while (total_bits < codesize) + { + uint32_t byte_bit_offset = bit_offset & 7U; + uint32_t bits_to_read = basisu::minimum<int>(codesize - total_bits, 8U - byte_bit_offset); + + uint32_t byte_bits = pBuf[bit_offset >> 3U] >> byte_bit_offset; + byte_bits &= ((1U << bits_to_read) - 1U); + + bits |= ((uint64_t)(byte_bits) << total_bits); + + total_bits += bits_to_read; + bit_offset += bits_to_read; + } + + return bits; + } + + static inline uint32_t read_bits1_to_9_fst(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 9); + if (!codesize) + return 0; + assert(bit_offset < 112); + + if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS)) + { + const uint8_t* pBytes = &pBuf[bit_offset >> 3U]; + + uint32_t byte_bit_offset = bit_offset & 7U; + + uint32_t bits = pBytes[0] >> byte_bit_offset; + uint32_t bits_read = basisu::minimum<int>(codesize, 8 - byte_bit_offset); + + uint32_t bits_remaining = codesize - bits_read; + if (bits_remaining) + bits |= ((uint32_t)pBytes[1]) << bits_read; + + bit_offset += codesize; + + return bits & ((1U << codesize) - 1U); + } + + uint32_t byte_bit_offset = bit_offset & 7U; + const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]); + bit_offset += codesize; + return (w >> byte_bit_offset)& ((1U << codesize) - 1U); + } + + bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool blue_contract_check, bool read_hints) + { + //memset(&unpacked, 0, sizeof(unpacked)); + +#if 0 + uint8_t table[128]; + memset(table, 0xFF, sizeof(table)); + + { + for (uint32_t mode = 0; mode <= TOTAL_UASTC_MODES; mode++) + { + const uint32_t code = g_uastc_mode_huff_codes[mode][0]; + const uint32_t codesize = g_uastc_mode_huff_codes[mode][1]; + + table[code] = mode; + + uint32_t bits_left = 7 - codesize; + for (uint32_t i = 0; i < (1 << bits_left); i++) + table[code | (i << codesize)] = mode; + } + + for (uint32_t i = 0; i < 128; i++) + printf("%u,", table[i]); + exit(0); + } #endif - assert(bytes_per_block == 16); - if (basis_file_has_alpha_slices) + const int mode = g_uastc_huff_modes[blk.m_bytes[0] & 127]; + if (mode >= 
(int)TOTAL_UASTC_MODES) + return false; + + unpacked.m_mode = mode; + + uint32_t bit_ofs = g_uastc_mode_huff_codes[mode][1]; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + unpacked.m_solid_color.r = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + unpacked.m_solid_color.g = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + unpacked.m_solid_color.b = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + unpacked.m_solid_color.a = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + + if (read_hints) { - // First decode the alpha data - status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_A8, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + unpacked.m_etc1_flip = false; + unpacked.m_etc1_diff = read_bit(blk.m_bytes, bit_ofs) != 0; + unpacked.m_etc1_inten0 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); + unpacked.m_etc1_inten1 = 0; + unpacked.m_etc1_selector = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 2); + unpacked.m_etc1_r = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + unpacked.m_etc1_g = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + unpacked.m_etc1_b = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + unpacked.m_etc1_bias = 0; + unpacked.m_etc2_hints = 0; } + + return true; + } + + if (read_hints) + { + if (g_uastc_mode_has_bc1_hint0[mode]) + unpacked.m_bc1_hint0 = read_bit(blk.m_bytes, bit_ofs) != 0; + else + unpacked.m_bc1_hint0 = false; + + if (g_uastc_mode_has_bc1_hint1[mode]) + unpacked.m_bc1_hint1 = read_bit(blk.m_bytes, bit_ofs) != 0; else + unpacked.m_bc1_hint1 = false; + + unpacked.m_etc1_flip = read_bit(blk.m_bytes, bit_ofs) != 0; + unpacked.m_etc1_diff = read_bit(blk.m_bytes, bit_ofs) != 0; + unpacked.m_etc1_inten0 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); + unpacked.m_etc1_inten1 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); + + if (g_uastc_mode_has_etc1_bias[mode]) + unpacked.m_etc1_bias = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + else + unpacked.m_etc1_bias = 0; + + if (g_uastc_mode_has_alpha[mode]) { - write_opaque_alpha_blocks(pSlice_descs[slice_index].m_num_blocks_x, pSlice_descs[slice_index].m_num_blocks_y, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_A8, 16, output_row_pitch_in_blocks_or_pixels); - status = true; + unpacked.m_etc2_hints = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + //assert(unpacked.m_etc2_hints > 0); } + else + unpacked.m_etc2_hints = 0; + } + else + bit_ofs += g_uastc_mode_total_hint_bits[mode]; + + uint32_t subsets = 1; + switch (mode) + { + case 2: + case 4: + case 7: + case 9: + case 16: + unpacked.m_common_pattern = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + subsets = 2; + break; + case 3: + unpacked.m_common_pattern = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 4); + subsets = 3; + break; + default: + break; + } - if (status) + uint32_t part_seed = 0; + switch (mode) + { + case 2: + case 4: + case 9: + case 16: + if (unpacked.m_common_pattern >= TOTAL_ASTC_BC7_COMMON_PARTITIONS2) + return false; + + part_seed = g_astc_bc7_common_partitions2[unpacked.m_common_pattern].m_astc; + break; + case 3: + if (unpacked.m_common_pattern >= TOTAL_ASTC_BC7_COMMON_PARTITIONS3) + return false; + + part_seed = g_astc_bc7_common_partitions3[unpacked.m_common_pattern].m_astc; + break; + case 7: + if (unpacked.m_common_pattern >= 
TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS) + return false; + + part_seed = g_bc7_3_astc2_common_partitions[unpacked.m_common_pattern].m_astc2; + break; + default: + break; + } + + uint32_t total_planes = 1; + switch (mode) + { + case 6: + case 11: + case 13: + unpacked.m_astc.m_ccs = (int)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 2); + total_planes = 2; + break; + case 17: + unpacked.m_astc.m_ccs = 3; + total_planes = 2; + break; + default: + break; + } + + unpacked.m_astc.m_dual_plane = (total_planes == 2); + + unpacked.m_astc.m_subsets = subsets; + unpacked.m_astc.m_partition_seed = part_seed; + + const uint32_t total_comps = g_uastc_mode_comps[mode]; + + const uint32_t weight_bits = g_uastc_mode_weight_bits[mode]; + + unpacked.m_astc.m_weight_range = g_uastc_mode_weight_ranges[mode]; + + const uint32_t total_values = total_comps * 2 * subsets; + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + + const uint32_t cem = g_uastc_mode_cem[mode]; + unpacked.m_astc.m_cem = cem; + + const uint32_t ep_bits = g_astc_bise_range_table[endpoint_range][0]; + const uint32_t ep_trits = g_astc_bise_range_table[endpoint_range][1]; + const uint32_t ep_quints = g_astc_bise_range_table[endpoint_range][2]; + + uint32_t total_tqs = 0; + uint32_t bundle_size = 0, mul = 0; + if (ep_trits) + { + total_tqs = (total_values + 4) / 5; + bundle_size = 5; + mul = 3; + } + else if (ep_quints) + { + total_tqs = (total_values + 2) / 3; + bundle_size = 3; + mul = 5; + } + + uint32_t tq_values[8]; + for (uint32_t i = 0; i < total_tqs; i++) + { + uint32_t num_bits = ep_trits ? 8 : 7; + if (i == (total_tqs - 1)) { - // Now decode the color data - status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + uint32_t num_remaining = total_values - (total_tqs - 1) * bundle_size; + if (ep_trits) + { + switch (num_remaining) + { + case 1: num_bits = 2; break; + case 2: num_bits = 4; break; + case 3: num_bits = 5; break; + case 4: num_bits = 7; break; + default: break; + } + } + else if (ep_quints) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC2 RGB failed\n"); + switch (num_remaining) + { + case 1: num_bits = 3; break; + case 2: num_bits = 5; break; + default: break; + } } } - else + + tq_values[i] = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, num_bits); + } // i + + uint32_t accum = 0; + uint32_t accum_remaining = 0; + uint32_t next_tq_index = 0; + + for (uint32_t i = 0; i < total_values; i++) + { + uint32_t value = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, ep_bits); + + if (total_tqs) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC2 A failed\n"); + if (!accum_remaining) + { + assert(next_tq_index < total_tqs); + accum = tq_values[next_tq_index++]; + accum_remaining = bundle_size; + } + + // TODO: Optimize with tables + uint32_t v = accum % mul; + accum /= mul; + accum_remaining--; + + value |= (v << ep_bits); } - break; + + unpacked.m_astc.m_endpoints[i] = (uint8_t)value; } - case transcoder_texture_format::cTFBC3_RGBA: + + const uint8_t* pPartition_pattern; + const uint8_t* pSubset_anchor_indices = get_anchor_indices(subsets, mode, unpacked.m_common_pattern, pPartition_pattern); + +#ifdef _DEBUG + for (uint32_t i = 0; i < 16; i++) + assert(pPartition_pattern[i] == astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true)); + + 
for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) { -#if !BASISD_SUPPORT_DXT1 - return false; -#endif -#if !BASISD_SUPPORT_DXT5A - return false; + uint32_t anchor_index = 0; + + for (uint32_t i = 0; i < 16; i++) + { + if (pPartition_pattern[i] == subset_index) + { + anchor_index = i; + break; + } + } + + assert(pSubset_anchor_indices[subset_index] == anchor_index); + } #endif - assert(bytes_per_block == 16); - // First decode the alpha data - if (basis_file_has_alpha_slices) +#if 0 + const uint32_t total_planes_shift = total_planes - 1; + for (uint32_t i = 0; i < 16 * total_planes; i++) + { + uint32_t num_bits = weight_bits; + for (uint32_t s = 0; s < subsets; s++) { - status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + if (pSubset_anchor_indices[s] == (i >> total_planes_shift)) + { + num_bits--; + break; + } } + + unpacked.m_astc.m_weights[i] = (uint8_t)read_bits1_to_9(blk.m_bytes, bit_ofs, num_bits); + } +#endif + + if (mode == 18) + { + // Mode 18 is the only mode with more than 64 weight bits. + for (uint32_t i = 0; i < 16; i++) + unpacked.m_astc.m_weights[i] = (uint8_t)read_bits1_to_9(blk.m_bytes, bit_ofs, i ? weight_bits : (weight_bits - 1)); + } + else + { + // All other modes have <= 64 weight bits. + uint64_t bits; + + // Read the weight bits + if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS)) + bits = read_bits64(blk.m_bytes, bit_ofs, basisu::minimum<int>(64, 128 - (int)bit_ofs)); else { - write_opaque_alpha_blocks(pSlice_descs[slice_index].m_num_blocks_x, pSlice_descs[slice_index].m_num_blocks_y, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, output_row_pitch_in_blocks_or_pixels); - status = true; +#ifdef __EMSCRIPTEN__ + bits = blk.m_dwords[2]; + bits |= (((uint64_t)blk.m_dwords[3]) << 32U); +#else + bits = blk.m_qwords[1]; +#endif + + if (bit_ofs >= 64U) + bits >>= (bit_ofs - 64U); + else + { + assert(bit_ofs >= 56U); + + uint32_t bits_needed = 64U - bit_ofs; + bits <<= bits_needed; + bits |= (blk.m_bytes[7] >> (8U - bits_needed)); + } } + + bit_ofs = 0; - if (status) + const uint32_t mask = (1U << weight_bits) - 1U; + const uint32_t anchor_mask = (1U << (weight_bits - 1U)) - 1U; + + if (total_planes == 2) { - // Now decode the color data. Forbid 3 color blocks, which aren't allowed in BC3. - status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, 16, decode_flags | cDecodeFlagsBC1ForbidThreeColorBlocks, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + // Dual plane modes always have a single subset, and the first 2 weights are anchors. + + unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + + unpacked.m_astc.m_weights[1] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + + for (uint32_t i = 2; i < 32; i++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC3 RGB failed\n"); + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); + bit_ofs += weight_bits; } } else { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC3 A failed\n"); - } + if (subsets == 1) + { + // Specialize the single subset case. 
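// Illustrative note (not part of the upstream sources): with a single subset and 4-bit
// weights, the first texel is the anchor and stores only 3 bits (its top bit is implied
// to be zero), which is why weight 0 below is masked with 7 while weights 1..15 are plain
// 4-bit fields read at bit offsets 3, 3 + 4, 3 + 8, ... of the 64-bit `bits` word.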
+ if (weight_bits == 4) + { + assert(bit_ofs == 0); + + // Specialize the most common case: 4-bit weights. + unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits) & 7); + unpacked.m_astc.m_weights[1] = (uint8_t)((uint32_t)(bits >> 3) & 15); + unpacked.m_astc.m_weights[2] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 1)) & 15); + unpacked.m_astc.m_weights[3] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 2)) & 15); + + unpacked.m_astc.m_weights[4] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 3)) & 15); + unpacked.m_astc.m_weights[5] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 4)) & 15); + unpacked.m_astc.m_weights[6] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 5)) & 15); + unpacked.m_astc.m_weights[7] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 6)) & 15); + + unpacked.m_astc.m_weights[8] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 7)) & 15); + unpacked.m_astc.m_weights[9] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 8)) & 15); + unpacked.m_astc.m_weights[10] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 9)) & 15); + unpacked.m_astc.m_weights[11] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 10)) & 15); + + unpacked.m_astc.m_weights[12] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 11)) & 15); + unpacked.m_astc.m_weights[13] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 12)) & 15); + unpacked.m_astc.m_weights[14] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 13)) & 15); + unpacked.m_astc.m_weights[15] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 14)) & 15); + } + else + { + // First weight is always an anchor. + unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); - break; + for (uint32_t i = 1; i < 16; i++) + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); + bit_ofs += weight_bits; + } + } + } + else + { + const uint32_t a0 = pSubset_anchor_indices[0], a1 = pSubset_anchor_indices[1], a2 = pSubset_anchor_indices[2]; + + for (uint32_t i = 0; i < 16; i++) + { + if ((i == a0) || (i == a1) || (i == a2)) + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + } + else + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); + bit_ofs += weight_bits; + } + } + } + } } - case transcoder_texture_format::cTFBC5_RG: + + if ((blue_contract_check) && (total_comps >= 3)) { -#if !BASISD_SUPPORT_DXT5A - return false; -#endif - assert(bytes_per_block == 16); + // We only need to disable ASTC Blue Contraction when we'll be packing to ASTC. The other transcoders don't care. 
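// Background sketch (hypothetical helper, not part of the upstream sources): an ASTC
// decoder applies "blue contraction" to a direct RGB endpoint pair whenever the RGB sum
// of endpoint 1 is smaller than that of endpoint 0. The code below avoids triggering it
// by swapping the offending subset's endpoints and inverting its weights
// (w -> weight_mask - w); interpolation is symmetric, so the decoded colors are unchanged.
static inline bool astc_would_blue_contract(
	int r0, int g0, int b0, // unquantized low endpoint
	int r1, int g1, int b1) // unquantized high endpoint
{
	// Same condition as the "s1 < s0" comparison computed below.
	return (r1 + g1 + b1) < (r0 + g0 + b0);
}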
+ bool invert_subset[3] = { false, false, false }; + bool any_flag = false; - // Decode the R data (actually the green channel of the color data slice in the basis file) - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (status) + for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) { - if (basis_file_has_alpha_slices) + const int s0 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 0]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 2]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 4]].m_unquant; + + const int s1 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 1]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 3]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 5]].m_unquant; + + if (s1 < s0) { - // Decode the G data (actually the green channel of the alpha data slice in the basis file) - status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + for (uint32_t c = 0; c < total_comps; c++) + std::swap(unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 0], unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 1]); + + invert_subset[subset_index] = true; + any_flag = true; + } + } + + if (any_flag) + { + const uint32_t weight_mask = (1 << weight_bits) - 1; + + for (uint32_t i = 0; i < 16; i++) + { + uint32_t subset = pPartition_pattern[i]; + + if (invert_subset[subset]) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC5 1 failed\n"); + unpacked.m_astc.m_weights[i * total_planes] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes]); + + if (total_planes == 2) + unpacked.m_astc.m_weights[i * total_planes + 1] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes + 1]); } } + } + } + + return true; + } + + static const uint32_t* g_astc_weight_tables[6] = { nullptr, g_bc7_weights1, g_bc7_weights2, g_bc7_weights3, g_astc_weights4, g_astc_weights5 }; + + bool unpack_uastc(uint32_t mode, uint32_t common_pattern, const color32& solid_color, const astc_block_desc& astc, color32* pPixels, bool srgb) + { + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + for (uint32_t i = 0; i < 16; i++) + pPixels[i] = solid_color; + return true; + } + + color32 endpoints[3][2]; + + const uint32_t total_subsets = g_uastc_mode_subsets[mode]; + const uint32_t total_comps = basisu::minimum<uint32_t>(4U, g_uastc_mode_comps[mode]); + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + const uint32_t total_planes = g_uastc_mode_planes[mode]; + const uint32_t weight_bits = g_uastc_mode_weight_bits[mode]; + const uint32_t weight_levels = 1 << weight_bits; + + for (uint32_t subset_index = 0; subset_index < total_subsets; subset_index++) + { + if (total_comps == 2) + { + const uint32_t ll = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 0 * 2 + 0]].m_unquant; + const uint32_t lh = 
g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 0 * 2 + 1]].m_unquant; + + const uint32_t al = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 1 * 2 + 0]].m_unquant; + const uint32_t ah = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 1 * 2 + 1]].m_unquant; + + endpoints[subset_index][0].set_noclamp_rgba(ll, ll, ll, al); + endpoints[subset_index][1].set_noclamp_rgba(lh, lh, lh, ah); + } + else + { + for (uint32_t comp_index = 0; comp_index < total_comps; comp_index++) + { + endpoints[subset_index][0][comp_index] = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + comp_index * 2 + 0]].m_unquant; + endpoints[subset_index][1][comp_index] = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + comp_index * 2 + 1]].m_unquant; + } + for (uint32_t comp_index = total_comps; comp_index < 4; comp_index++) + { + endpoints[subset_index][0][comp_index] = 255; + endpoints[subset_index][1][comp_index] = 255; + } + } + } + + color32 block_colors[3][32]; + + const uint32_t* pWeights = g_astc_weight_tables[weight_bits]; + + for (uint32_t subset_index = 0; subset_index < total_subsets; subset_index++) + { + for (uint32_t l = 0; l < weight_levels; l++) + { + if (total_comps == 2) + { + const uint8_t lc = (uint8_t)astc_interpolate(endpoints[subset_index][0][0], endpoints[subset_index][1][0], pWeights[l], srgb); + const uint8_t ac = (uint8_t)astc_interpolate(endpoints[subset_index][0][3], endpoints[subset_index][1][3], pWeights[l], srgb); + + block_colors[subset_index][l].set(lc, lc, lc, ac); + } else { - write_opaque_alpha_blocks(pSlice_descs[slice_index].m_num_blocks_x, pSlice_descs[slice_index].m_num_blocks_y, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, output_row_pitch_in_blocks_or_pixels); - status = true; + uint32_t comp_index; + for (comp_index = 0; comp_index < total_comps; comp_index++) + block_colors[subset_index][l][comp_index] = (uint8_t)astc_interpolate(endpoints[subset_index][0][comp_index], endpoints[subset_index][1][comp_index], pWeights[l], srgb); + + for (; comp_index < 4; comp_index++) + block_colors[subset_index][l][comp_index] = 255; } } + } + + const uint8_t* pPartition_pattern = g_zero_pattern; + + if (total_subsets >= 2) + { + if (total_subsets == 3) + pPartition_pattern = &g_astc_bc7_patterns3[common_pattern][0]; + else if (mode == 7) + pPartition_pattern = &g_bc7_3_astc2_patterns2[common_pattern][0]; else + pPartition_pattern = &g_astc_bc7_patterns2[common_pattern][0]; + +#ifdef _DEBUG + for (uint32_t i = 0; i < 16; i++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC5 channel 0 failed\n"); + assert(pPartition_pattern[i] == (uint8_t)astc_compute_texel_partition(astc.m_partition_seed, i & 3, i >> 2, 0, total_subsets, true)); } - break; +#endif } - case transcoder_texture_format::cTFASTC_4x4_RGBA: + + if (total_planes == 1) { -#if !BASISD_SUPPORT_ASTC + if (total_subsets == 1) + { + for (uint32_t i = 0; i < 16; i++) + { + assert(astc.m_weights[i] < weight_levels); + pPixels[i] = block_colors[0][astc.m_weights[i]]; + } + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + assert(astc.m_weights[i] < weight_levels); + pPixels[i] = block_colors[pPartition_pattern[i]][astc.m_weights[i]]; + } + } + } + else + { + assert(total_subsets == 1); + + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset_index = 0; // pPartition_pattern[i]; 
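// Illustrative note (not part of the upstream sources): in dual-plane mode the weights are
// stored interleaved, two per texel. The component selected by m_ccs is interpolated with
// the second plane's weight (weight_index1) and the other three components with the first
// plane's weight (weight_index0), as the per-component selection below shows.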
+ + const uint32_t weight_index0 = astc.m_weights[i * 2]; + const uint32_t weight_index1 = astc.m_weights[i * 2 + 1]; + + assert(weight_index0 < weight_levels && weight_index1 < weight_levels); + + color32& c = pPixels[i]; + for (uint32_t comp = 0; comp < 4; comp++) + { + if ((int)comp == astc.m_ccs) + c[comp] = block_colors[subset_index][weight_index1][comp]; + else + c[comp] = block_colors[subset_index][weight_index0][comp]; + } + } + } + + return true; + } + + bool unpack_uastc(const unpacked_uastc_block& unpacked_blk, color32* pPixels, bool srgb) + { + return unpack_uastc(unpacked_blk.m_mode, unpacked_blk.m_common_pattern, unpacked_blk.m_solid_color, unpacked_blk.m_astc, pPixels, srgb); + } + + bool unpack_uastc(const uastc_block& blk, color32* pPixels, bool srgb) + { + unpacked_uastc_block unpacked_blk; + + if (!unpack_uastc(blk, unpacked_blk, false, false)) return false; -#endif - assert(bytes_per_block == 16); - if (basis_file_has_alpha_slices) + return unpack_uastc(unpacked_blk, pPixels, srgb); + } + + // Determines the best shared pbits to use to encode xl/xh + static void determine_shared_pbits( + uint32_t total_comps, uint32_t comp_bits, float xl[4], float xh[4], + color_quad_u8& bestMinColor, color_quad_u8& bestMaxColor, uint32_t best_pbits[2]) + { + const uint32_t total_bits = comp_bits + 1; + assert(total_bits >= 4 && total_bits <= 8); + + const int iscalep = (1 << total_bits) - 1; + const float scalep = (float)iscalep; + + float best_err = 1e+9f; + + for (int p = 0; p < 2; p++) + { + color_quad_u8 xMinColor, xMaxColor; + for (uint32_t c = 0; c < 4; c++) { - // First decode the alpha data to the output (we're using the output texture as a temp buffer here). - status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cIndices, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + + color_quad_u8 scaledLow, scaledHigh; + + for (uint32_t i = 0; i < 4; i++) + { + scaledLow.m_c[i] = (xMinColor.m_c[i] << (8 - total_bits)); + scaledLow.m_c[i] |= (scaledLow.m_c[i] >> total_bits); + assert(scaledLow.m_c[i] <= 255); + + scaledHigh.m_c[i] = (xMaxColor.m_c[i] << (8 - total_bits)); + scaledHigh.m_c[i] |= (scaledHigh.m_c[i] >> total_bits); + assert(scaledHigh.m_c[i] <= 255); + } + + float err = 0; + for (uint32_t i = 0; i < total_comps; i++) + err += basisu::squaref((scaledLow.m_c[i] / 255.0f) - xl[i]) + basisu::squaref((scaledHigh.m_c[i] / 255.0f) - xh[i]); + + if (err < best_err) + { + best_err = err; + best_pbits[0] = p; + best_pbits[1] = p; + for (uint32_t j = 0; j < 4; j++) + { + bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; + bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; + } + } + } + } + + // Determines the best unique pbits to use to encode xl/xh + static void determine_unique_pbits( + uint32_t total_comps, uint32_t comp_bits, float xl[4], float xh[4], + color_quad_u8& bestMinColor, color_quad_u8& bestMaxColor, uint32_t best_pbits[2]) + { + const uint32_t total_bits = comp_bits + 1; + const int iscalep = (1 << total_bits) - 1; + const float scalep = (float)iscalep; + + float best_err0 = 1e+9f; + float best_err1 = 1e+9f; + + for (int p = 0; p < 2; p++) + { + color_quad_u8 xMinColor, xMaxColor; + + for (uint32_t c = 0; c < 4; c++) + { + 
xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + + color_quad_u8 scaledLow, scaledHigh; + for (uint32_t i = 0; i < 4; i++) + { + scaledLow.m_c[i] = (xMinColor.m_c[i] << (8 - total_bits)); + scaledLow.m_c[i] |= (scaledLow.m_c[i] >> total_bits); + assert(scaledLow.m_c[i] <= 255); + + scaledHigh.m_c[i] = (xMaxColor.m_c[i] << (8 - total_bits)); + scaledHigh.m_c[i] |= (scaledHigh.m_c[i] >> total_bits); + assert(scaledHigh.m_c[i] <= 255); + } + + float err0 = 0, err1 = 0; + for (uint32_t i = 0; i < total_comps; i++) + { + err0 += basisu::squaref(scaledLow.m_c[i] - xl[i] * 255.0f); + err1 += basisu::squaref(scaledHigh.m_c[i] - xh[i] * 255.0f); + } + + if (err0 < best_err0) + { + best_err0 = err0; + best_pbits[0] = p; + + bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; + bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1; + bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1; + bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1; + } + + if (err1 < best_err1) + { + best_err1 = err1; + best_pbits[1] = p; + + bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1; + bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1; + bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1; + bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1; + } + } + } + + bool transcode_uastc_to_astc(const uastc_block& src_blk, void* pDst) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, true, false)) + return false; + + bool success = false; + if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + pack_astc_solid_block(pDst, unpacked_src_blk.m_solid_color); + success = true; + } + else + { + success = pack_astc_block(static_cast<uint32_t*>(pDst), &unpacked_src_blk.m_astc, unpacked_src_blk.m_mode); + } + + return success; + } + + bool transcode_uastc_to_bc7(const unpacked_uastc_block& unpacked_src_blk, bc7_optimization_results& dst_blk) + { + memset(&dst_blk, 0, sizeof(dst_blk)); + + const uint32_t mode = unpacked_src_blk.m_mode; + + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + const uint32_t total_comps = g_uastc_mode_comps[mode]; + + switch (mode) + { + case 0: + case 5: + case 10: + case 12: + case 14: + case 15: + case 18: + { + // MODE 0: DualPlane: 0, WeightRange: 8 (16), Subsets: 1, EndpointRange: 19 (192) - BC7 MODE6 RGB + // MODE 5: DualPlane: 0, WeightRange : 5 (8), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE6 RGB + // MODE 10 DualPlane: 0, WeightRange: 8 (16), Subsets: 1, EndpointRange: 13 (48) - BC7 MODE6 + // MODE 12: DualPlane: 0, WeightRange : 5 (8), Subsets : 1, EndpointRange : 19 (192) - BC7 MODE6 + // MODE 14: DualPlane: 0, WeightRange : 2 (4), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE6 + // MODE 18: DualPlane: 0, WeightRange : 11 (32), Subsets : 1, CEM : 8, EndpointRange : 11 (32) - BC7 MODE6 + // MODE 15: DualPlane: 0, WeightRange : 8 (16), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256) - BC7 MODE6 + dst_blk.m_mode = 6; + + float xl[4], xh[4]; + if (total_comps == 2) + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant / 255.0f; + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant / 255.0f; + + xl[1] = xl[0]; + xh[1] = xh[0]; + + xl[2] = xl[0]; + xh[2] = xh[0]; + + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant / 255.0f; + xh[3] = 
g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant / 255.0f; + } + else + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant / 255.0f; + xl[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant / 255.0f; + xl[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[4]].m_unquant / 255.0f; + + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant / 255.0f; + xh[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant / 255.0f; + xh[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[5]].m_unquant / 255.0f; + + if (total_comps == 4) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to failed\n"); + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[6]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[7]].m_unquant / 255.0f; } else { - // Now decode the color data and transcode to ASTC. The transcoder function will read the alpha selector data from the output texture as it converts and - // transcode both the alpha and color data at the same time to ASTC. - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, 16, decode_flags | cDecodeFlagsOutputHasAlphaIndices, output_row_pitch_in_blocks_or_pixels, pState); + xl[3] = 1.0f; + xh[3] = 1.0f; } } + + uint32_t best_pbits[2]; + color_quad_u8 bestMinColor, bestMaxColor; + determine_unique_pbits((total_comps == 2) ? 4 : total_comps, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + dst_blk.m_low[0] = bestMinColor; + dst_blk.m_high[0] = bestMaxColor; + + if (total_comps == 3) + { + dst_blk.m_low[0].m_c[3] = 127; + dst_blk.m_high[0].m_c[3] = 127; + } + + dst_blk.m_pbits[0][0] = best_pbits[0]; + dst_blk.m_pbits[0][1] = best_pbits[1]; + + if (mode == 18) + { + const uint8_t s_bc7_5_to_4[32] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15 }; + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = s_bc7_5_to_4[unpacked_src_blk.m_astc.m_weights[i]]; + } + else if (mode == 14) + { + const uint8_t s_bc7_2_to_4[4] = { 0, 5, 10, 15 }; + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = s_bc7_2_to_4[unpacked_src_blk.m_astc.m_weights[i]]; + } + else if ((mode == 5) || (mode == 12)) + { + const uint8_t s_bc7_3_to_4[8] = { 0, 2, 4, 6, 9, 11, 13, 15 }; + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = s_bc7_3_to_4[unpacked_src_blk.m_astc.m_weights[i]]; + } else - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + { + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + } break; } - case transcoder_texture_format::cTFATC_RGB: + case 1: { -#if !BASISD_SUPPORT_ATC - return false; -#endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. 
- if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + // DualPlane: 0, WeightRange : 2 (4), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE3 + // Mode 1 uses endpoint range 20 - no need to use ASTC dequant tables. + dst_blk.m_mode = 3; - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cATC_RGB, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + float xl[4], xh[4]; + xl[0] = unpacked_src_blk.m_astc.m_endpoints[0] / 255.0f; + xl[1] = unpacked_src_blk.m_astc.m_endpoints[2] / 255.0f; + xl[2] = unpacked_src_blk.m_astc.m_endpoints[4] / 255.0f; + xl[3] = 1.0f; + + xh[0] = unpacked_src_blk.m_astc.m_endpoints[1] / 255.0f; + xh[1] = unpacked_src_blk.m_astc.m_endpoints[3] / 255.0f; + xh[2] = unpacked_src_blk.m_astc.m_endpoints[5] / 255.0f; + xh[3] = 1.0f; + + uint32_t best_pbits[2]; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_unique_pbits(3, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + for (uint32_t i = 0; i < 3; i++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ATC_RGB failed\n"); + dst_blk.m_low[0].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[0].m_c[i] = bestMaxColor.m_c[i]; + dst_blk.m_low[1].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[1].m_c[i] = bestMaxColor.m_c[i]; } + dst_blk.m_pbits[0][0] = best_pbits[0]; + dst_blk.m_pbits[0][1] = best_pbits[1]; + dst_blk.m_pbits[1][0] = best_pbits[0]; + dst_blk.m_pbits[1][1] = best_pbits[1]; + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + break; } - case transcoder_texture_format::cTFATC_RGBA: + case 2: { -#if !BASISD_SUPPORT_ATC - return false; -#endif -#if !BASISD_SUPPORT_DXT5A - return false; -#endif - assert(bytes_per_block == 16); + // 2. DualPlane: 0, WeightRange : 5 (8), Subsets : 2, EndpointRange : 8 (16) - BC7 MODE1 + dst_blk.m_mode = 1; + dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; - // First decode the alpha data - if (basis_file_has_alpha_slices) + const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + + float xl[4], xh[4]; + xl[3] = 1.0f; + xh[3] = 1.0f; + + for (uint32_t subset = 0; subset < 2; subset++) { - status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + for (uint32_t i = 0; i < 3; i++) + { + uint32_t v = unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6]; + v = (v << 4) | v; + xl[i] = v / 255.0f; + + v = unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6 + 1]; + v = (v << 4) | v; + xh[i] = v / 255.0f; + } + + uint32_t best_pbits[2] = { 0, 0 }; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_shared_pbits(3, 6, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + const uint32_t bc7_subset_index = invert_partition ? 
(1 - subset) : subset; + + for (uint32_t i = 0; i < 3; i++) + { + dst_blk.m_low[bc7_subset_index].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[bc7_subset_index].m_c[i] = bestMaxColor.m_c[i]; + } + + dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; + } // subset + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 3: + { + // DualPlane: 0, WeightRange : 2 (4), Subsets : 3, EndpointRange : 7 (12) - BC7 MODE2 + dst_blk.m_mode = 2; + dst_blk.m_partition = g_astc_bc7_common_partitions3[unpacked_src_blk.m_common_pattern].m_bc7; + + const uint32_t perm = g_astc_bc7_common_partitions3[unpacked_src_blk.m_common_pattern].m_astc_to_bc7_perm; + + for (uint32_t subset = 0; subset < 3; subset++) + { + for (uint32_t comp = 0; comp < 3; comp++) + { + uint32_t lo = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[comp * 2 + 0 + subset * 6]].m_unquant; + uint32_t hi = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[comp * 2 + 1 + subset * 6]].m_unquant; + + // TODO: I think this can be improved by using tables like Basis Universal does with ETC1S conversion. + lo = (lo * 31 + 127) / 255; + hi = (hi * 31 + 127) / 255; + + const uint32_t bc7_subset_index = g_astc_to_bc7_partition_index_perm_tables[perm][subset]; + + dst_blk.m_low[bc7_subset_index].m_c[comp] = (uint8_t)lo; + dst_blk.m_high[bc7_subset_index].m_c[comp] = (uint8_t)hi; + } + } + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 4: + { + // 4. DualPlane: 0, WeightRange: 2 (4), Subsets: 2, EndpointRange: 12 (40) - BC7 MODE3 + dst_blk.m_mode = 3; + dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; + + const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + + float xl[4], xh[4]; + xl[3] = 1.0f; + xh[3] = 1.0f; + + for (uint32_t subset = 0; subset < 2; subset++) + { + for (uint32_t i = 0; i < 3; i++) + { + xl[i] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6]].m_unquant / 255.0f; + xh[i] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6 + 1]].m_unquant / 255.0f; + } + + uint32_t best_pbits[2] = { 0, 0 }; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_unique_pbits(3, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + const uint32_t bc7_subset_index = invert_partition ? 
(1 - subset) : subset; + + for (uint32_t i = 0; i < 3; i++) + { + dst_blk.m_low[bc7_subset_index].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[bc7_subset_index].m_c[i] = bestMaxColor.m_c[i]; + } + dst_blk.m_low[bc7_subset_index].m_c[3] = 127; + dst_blk.m_high[bc7_subset_index].m_c[3] = 127; + + dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; + dst_blk.m_pbits[bc7_subset_index][1] = best_pbits[1]; + + } // subset + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 6: + case 11: + case 13: + case 17: + { + // MODE 6: DualPlane: 1, WeightRange : 2 (4), Subsets : 1, EndpointRange : 18 (160) - BC7 MODE5 RGB + // MODE 11: DualPlane: 1, WeightRange: 2 (4), Subsets: 1, EndpointRange: 13 (48) - BC7 MODE5 + // MODE 13: DualPlane: 1, WeightRange: 0 (2), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE5 + // MODE 17: DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 4 (LA Direct), EndpointRange: 20 (256) - BC7 MODE5 + dst_blk.m_mode = 5; + dst_blk.m_rotation = (unpacked_src_blk.m_astc.m_ccs + 1) & 3; + + if (total_comps == 2) + { + assert(unpacked_src_blk.m_astc.m_ccs == 3); + + dst_blk.m_low->m_c[0] = (uint8_t)((g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant * 127 + 127) / 255); + dst_blk.m_high->m_c[0] = (uint8_t)((g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant * 127 + 127) / 255); + + dst_blk.m_low->m_c[1] = dst_blk.m_low->m_c[0]; + dst_blk.m_high->m_c[1] = dst_blk.m_high->m_c[0]; + + dst_blk.m_low->m_c[2] = dst_blk.m_low->m_c[0]; + dst_blk.m_high->m_c[2] = dst_blk.m_high->m_c[0]; + + dst_blk.m_low->m_c[3] = (uint8_t)(g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant); + dst_blk.m_high->m_c[3] = (uint8_t)(g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant); } else { - write_opaque_alpha_blocks(pSlice_descs[slice_index].m_num_blocks_x, pSlice_descs[slice_index].m_num_blocks_y, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, output_row_pitch_in_blocks_or_pixels); - status = true; + for (uint32_t astc_comp = 0; astc_comp < 4; astc_comp++) + { + uint32_t bc7_comp = astc_comp; + // ASTC and BC7 handle dual plane component rotations differently: + // ASTC: 2nd plane separately interpolates the CCS channel. + // BC7: 2nd plane channel is swapped with alpha, 2nd plane controls alpha interpolation, then we swap alpha with the desired channel. 
+ if (astc_comp == (uint32_t)unpacked_src_blk.m_astc.m_ccs) + bc7_comp = 3; + else if (astc_comp == 3) + bc7_comp = unpacked_src_blk.m_astc.m_ccs; + + uint32_t l = 255, h = 255; + if (astc_comp < total_comps) + { + l = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[astc_comp * 2 + 0]].m_unquant; + h = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[astc_comp * 2 + 1]].m_unquant; + } + + if (bc7_comp < 3) + { + l = (l * 127 + 127) / 255; + h = (h * 127 + 127) / 255; + } + + dst_blk.m_low->m_c[bc7_comp] = (uint8_t)l; + dst_blk.m_high->m_c[bc7_comp] = (uint8_t)h; + } } - if (status) + if (mode == 13) { - status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cATC_RGB, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + for (uint32_t i = 0; i < 16; i++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ATC RGB failed\n"); + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2] ? 3 : 0; + dst_blk.m_alpha_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2 + 1] ? 3 : 0; } } else { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ATC A failed\n"); + for (uint32_t i = 0; i < 16; i++) + { + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2]; + dst_blk.m_alpha_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2 + 1]; + } } + break; } - case transcoder_texture_format::cTFPVRTC2_4_RGB: + case 7: { -#if !BASISD_SUPPORT_PVRTC2 - return false; -#endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + // DualPlane: 0, WeightRange : 2 (4), Subsets : 2, EndpointRange : 12 (40) - BC7 MODE2 + dst_blk.m_mode = 2; + dst_blk.m_partition = g_bc7_3_astc2_common_partitions[unpacked_src_blk.m_common_pattern].m_bc73; - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGB, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[unpacked_src_blk.m_common_pattern].k; + + for (uint32_t bc7_part = 0; bc7_part < 3; bc7_part++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to cPVRTC2_4_RGB failed\n"); + const uint32_t astc_part = bc7_convert_partition_index_3_to_2(bc7_part, common_pattern_k); + + for (uint32_t c = 0; c < 3; c++) + { + dst_blk.m_low[bc7_part].m_c[c] = (g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[c * 2 + 0 + astc_part * 6]].m_unquant * 31 + 127) / 255; + dst_blk.m_high[bc7_part].m_c[c] = (g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[c * 2 + 1 + astc_part * 6]].m_unquant * 31 + 127) / 255; + } } + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + break; } - case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case UASTC_MODE_INDEX_SOLID_COLOR: { -#if !BASISD_SUPPORT_PVRTC2 - return false; -#endif - if (basis_file_has_alpha_slices) + // Void-Extent: Solid Color RGBA (BC7 MODE5 or MODE6) + const color32& solid_color = unpacked_src_blk.m_solid_color; + + uint32_t best_err0 = 
g_bc7_mode_6_optimal_endpoints[solid_color.r][0].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.g][0].m_error + + g_bc7_mode_6_optimal_endpoints[solid_color.b][0].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.a][0].m_error; + + uint32_t best_err1 = g_bc7_mode_6_optimal_endpoints[solid_color.r][1].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.g][1].m_error + + g_bc7_mode_6_optimal_endpoints[solid_color.b][1].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.a][1].m_error; + + if (best_err0 > 0 && best_err1 > 0) { - // First decode the alpha data to the output (we're using the output texture as a temp buffer here). - status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cIndices, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + dst_blk.m_mode = 5; + + for (uint32_t c = 0; c < 3; c++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to failed\n"); + dst_blk.m_low[0].m_c[c] = g_bc7_mode_5_optimal_endpoints[solid_color.c[c]].m_lo; + dst_blk.m_high[0].m_c[c] = g_bc7_mode_5_optimal_endpoints[solid_color.c[c]].m_hi; } - else + + memset(dst_blk.m_selectors, BC7ENC_MODE_5_OPTIMAL_INDEX, 16); + + dst_blk.m_low[0].m_c[3] = solid_color.c[3]; + dst_blk.m_high[0].m_c[3] = solid_color.c[3]; + + //memset(dst_blk.m_alpha_selectors, 0, 16); + } + else + { + dst_blk.m_mode = 6; + + uint32_t best_p = 0; + if (best_err1 < best_err0) + best_p = 1; + + for (uint32_t c = 0; c < 4; c++) { - // Now decode the color data and transcode to PVRTC2 RGBA. - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGBA, bytes_per_block, decode_flags | cDecodeFlagsOutputHasAlphaIndices, output_row_pitch_in_blocks_or_pixels, pState); + dst_blk.m_low[0].m_c[c] = g_bc7_mode_6_optimal_endpoints[solid_color.c[c]][best_p].m_lo; + dst_blk.m_high[0].m_c[c] = g_bc7_mode_6_optimal_endpoints[solid_color.c[c]][best_p].m_hi; } + + dst_blk.m_pbits[0][0] = best_p; + dst_blk.m_pbits[0][1] = best_p; + memset(dst_blk.m_selectors, BC7ENC_MODE_6_OPTIMAL_INDEX, 16); } - else - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGB, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + break; + } + case 9: + case 16: + { + // 9. DualPlane: 0, WeightRange : 2 (4), Subsets : 2, EndpointRange : 8 (16) - BC7 MODE7 + // 16. 
DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 4 (LA Direct), EndpointRange: 20 (256) - BC7 MODE7 + + dst_blk.m_mode = 7; + dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; + + const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + + for (uint32_t astc_subset = 0; astc_subset < 2; astc_subset++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to cPVRTC2_4_RGBA failed\n"); - } + float xl[4], xh[4]; + + if (total_comps == 2) + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0 + astc_subset * 4]].m_unquant / 255.0f; + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1 + astc_subset * 4]].m_unquant / 255.0f; + + xl[1] = xl[0]; + xh[1] = xh[0]; + + xl[2] = xl[0]; + xh[2] = xh[0]; + + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2 + astc_subset * 4]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3 + astc_subset * 4]].m_unquant / 255.0f; + } + else + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0 + astc_subset * 8]].m_unquant / 255.0f; + xl[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2 + astc_subset * 8]].m_unquant / 255.0f; + xl[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[4 + astc_subset * 8]].m_unquant / 255.0f; + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[6 + astc_subset * 8]].m_unquant / 255.0f; + + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1 + astc_subset * 8]].m_unquant / 255.0f; + xh[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3 + astc_subset * 8]].m_unquant / 255.0f; + xh[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[5 + astc_subset * 8]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[7 + astc_subset * 8]].m_unquant / 255.0f; + } + + uint32_t best_pbits[2] = { 0, 0 }; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_unique_pbits(4, 5, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + const uint32_t bc7_subset_index = invert_partition ? 
(1 - astc_subset) : astc_subset; + + dst_blk.m_low[bc7_subset_index] = bestMinColor; + dst_blk.m_high[bc7_subset_index] = bestMaxColor; + + dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; + dst_blk.m_pbits[bc7_subset_index][1] = best_pbits[1]; + } // astc_subset + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; break; } - case transcoder_texture_format::cTFRGBA32: + default: + return false; + } + + return true; + } + + bool transcode_uastc_to_bc7(const uastc_block& src_blk, bc7_optimization_results& dst_blk) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false, false)) + return false; + + return transcode_uastc_to_bc7(unpacked_src_blk, dst_blk); + } + + bool transcode_uastc_to_bc7(const uastc_block& src_blk, void* pDst) + { + bc7_optimization_results temp; + if (!transcode_uastc_to_bc7(src_blk, temp)) + return false; + + encode_bc7_block(pDst, &temp); + return true; + } + + color32 apply_etc1_bias(const color32 &block_color, uint32_t bias, uint32_t limit, uint32_t subblock) + { + color32 result; + + for (uint32_t c = 0; c < 3; c++) { - // Raw 32bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. + static const int s_divs[3] = { 1, 3, 9 }; - // First decode the alpha data - if (basis_file_has_alpha_slices) - status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cA32, sizeof(uint32_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + int delta = 0; + + switch (bias) + { + case 2: delta = subblock ? 0 : ((c == 0) ? -1 : 0); break; + case 5: delta = subblock ? 0 : ((c == 1) ? -1 : 0); break; + case 6: delta = subblock ? 0 : ((c == 2) ? -1 : 0); break; + + case 7: delta = subblock ? 0 : ((c == 0) ? 1 : 0); break; + case 11: delta = subblock ? 0 : ((c == 1) ? 1 : 0); break; + case 15: delta = subblock ? 0 : ((c == 2) ? 1 : 0); break; + + case 18: delta = subblock ? ((c == 0) ? -1 : 0) : 0; break; + case 19: delta = subblock ? ((c == 1) ? -1 : 0) : 0; break; + case 20: delta = subblock ? ((c == 2) ? -1 : 0) : 0; break; + + case 21: delta = subblock ? ((c == 0) ? 1 : 0) : 0; break; + case 24: delta = subblock ? ((c == 1) ? 1 : 0) : 0; break; + case 8: delta = subblock ? ((c == 2) ? 1 : 0) : 0; break; + + case 10: delta = -2; break; + + case 27: delta = subblock ? 0 : -1; break; + case 28: delta = subblock ? -1 : 1; break; + case 29: delta = subblock ? 1 : 0; break; + case 30: delta = subblock ? -1 : 0; break; + case 31: delta = subblock ? 
0 : 1; break; + + default: + delta = ((bias / s_divs[c]) % 3) - 1; + break; + } + + int v = block_color[c]; + if (v == 0) + { + if (delta == -2) + v += 3; + else + v += delta + 1; + } + else if (v == (int)limit) + { + v += (delta - 1); + } else - status = true; + { + v += delta; + if ((v < 0) || (v > (int)limit)) + v = (v - delta) - delta; + } - if (status) + assert(v >= 0); + assert(v <= (int)limit); + + result[c] = (uint8_t)v; + } + + return result; + } + + static void etc1_determine_selectors(decoder_etc_block& dst_blk, const color32* pSource_pixels, uint32_t first_subblock, uint32_t last_subblock) + { + static const uint8_t s_tran[4] = { 1, 0, 2, 3 }; + + uint16_t l_bitmask = 0; + uint16_t h_bitmask = 0; + + for (uint32_t subblock = first_subblock; subblock < last_subblock; subblock++) + { + color32 block_colors[4]; + dst_blk.get_block_colors(block_colors, subblock); + + uint32_t block_y[4]; + for (uint32_t i = 0; i < 4; i++) + block_y[i] = block_colors[i][0] * 54 + block_colors[i][1] * 183 + block_colors[i][2] * 19; + + const uint32_t block_y01 = block_y[0] + block_y[1]; + const uint32_t block_y12 = block_y[1] + block_y[2]; + const uint32_t block_y23 = block_y[2] + block_y[3]; + + // X0 X0 X0 X0 X1 X1 X1 X1 X2 X2 X2 X2 X3 X3 X3 X3 + // Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 + + if (dst_blk.get_flip_bit()) { - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, basis_file_has_alpha_slices ? block_format::cRGB32 : block_format::cRGBA32, sizeof(uint32_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - if (!status) + uint32_t ofs = subblock * 2; + + for (uint32_t y = 0; y < 2; y++) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to RGBA32 RGB failed\n"); + for (uint32_t x = 0; x < 4; x++) + { + const color32& c = pSource_pixels[x + (subblock * 2 + y) * 4]; + const uint32_t l = c[0] * 108 + c[1] * 366 + c[2] * 38; + + uint32_t t = s_tran[(l < block_y01) + (l < block_y12) + (l < block_y23)]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + + ofs = (int)ofs + 1 - 4 * 4; } } else { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to RGBA32 A failed\n"); + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) + { + for (uint32_t y = 0; y < 4; y++) + { + const color32& c = pSource_pixels[subblock * 2 + x + y * 4]; + const uint32_t l = c[0] * 108 + c[1] * 366 + c[2] * 38; + + uint32_t t = s_tran[(l < block_y01) + (l < block_y12) + (l < block_y23)]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } + } } + } - break; + dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); + dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); + dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); + dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + } + + static const uint8_t s_etc1_solid_selectors[4][4] = { { 255, 255, 255, 255 }, { 255, 255, 0, 0 }, { 0, 0, 0, 0 }, {0, 0, 255, 255 } }; + + struct etc_coord2 + { + uint8_t m_x, m_y; + }; + + // [flip][subblock][pixel_index] + const etc_coord2 g_etc1_pixel_coords[2][2][8] = + { + { + { + { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, + { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 } + }, + { + { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, + { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 } + } + }, + { + { + { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, + { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } + }, + { + { 0, 2 }, { 1, 
2 }, { 2, 2 }, { 3, 2 }, + { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 } + }, } - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: + }; + + void transcode_uastc_to_etc1(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst) + { + decoder_etc_block& dst_blk = *static_cast<decoder_etc_block*>(pDst); + + if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) { - // Raw 16bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. - - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + dst_blk.m_bytes[3] = (uint8_t)((unpacked_src_blk.m_etc1_diff << 1) | (unpacked_src_blk.m_etc1_inten0 << 5) | (unpacked_src_blk.m_etc1_inten0 << 2)); - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, (fmt == transcoder_texture_format::cTFRGB565) ? block_format::cRGB565 : block_format::cBGR565, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - if (!status) + if (unpacked_src_blk.m_etc1_diff) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to RGB565 RGB failed\n"); + dst_blk.m_bytes[0] = (uint8_t)(unpacked_src_blk.m_etc1_r << 3); + dst_blk.m_bytes[1] = (uint8_t)(unpacked_src_blk.m_etc1_g << 3); + dst_blk.m_bytes[2] = (uint8_t)(unpacked_src_blk.m_etc1_b << 3); + } + else + { + dst_blk.m_bytes[0] = (uint8_t)(unpacked_src_blk.m_etc1_r | (unpacked_src_blk.m_etc1_r << 4)); + dst_blk.m_bytes[1] = (uint8_t)(unpacked_src_blk.m_etc1_g | (unpacked_src_blk.m_etc1_g << 4)); + dst_blk.m_bytes[2] = (uint8_t)(unpacked_src_blk.m_etc1_b | (unpacked_src_blk.m_etc1_b << 4)); } - break; + memcpy(dst_blk.m_bytes + 4, &s_etc1_solid_selectors[unpacked_src_blk.m_etc1_selector][0], 4); + + return; } - case transcoder_texture_format::cTFRGBA4444: + + const bool flip = unpacked_src_blk.m_etc1_flip != 0; + const bool diff = unpacked_src_blk.m_etc1_diff != 0; + + dst_blk.m_bytes[3] = (uint8_t)((int)flip | (diff << 1) | (unpacked_src_blk.m_etc1_inten0 << 5) | (unpacked_src_blk.m_etc1_inten1 << 2)); + + const uint32_t limit = diff ? 31 : 15; + + color32 block_colors[2]; + + for (uint32_t subset = 0; subset < 2; subset++) { - // Raw 16bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. 
+ uint32_t avg_color[3]; + memset(avg_color, 0, sizeof(avg_color)); - // First decode the alpha data - if (basis_file_has_alpha_slices) - status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA4444_ALPHA, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); + for (uint32_t j = 0; j < 8; j++) + { + const etc_coord2& c = g_etc1_pixel_coords[flip][subset][j]; + + avg_color[0] += block_pixels[c.m_y][c.m_x].r; + avg_color[1] += block_pixels[c.m_y][c.m_x].g; + avg_color[2] += block_pixels[c.m_y][c.m_x].b; + } // j + + block_colors[subset][0] = (uint8_t)((avg_color[0] * limit + 1020) / (8 * 255)); + block_colors[subset][1] = (uint8_t)((avg_color[1] * limit + 1020) / (8 * 255)); + block_colors[subset][2] = (uint8_t)((avg_color[2] * limit + 1020) / (8 * 255)); + block_colors[subset][3] = 0; + + if (g_uastc_mode_has_etc1_bias[unpacked_src_blk.m_mode]) + { + block_colors[subset] = apply_etc1_bias(block_colors[subset], unpacked_src_blk.m_etc1_bias, limit, subset); + } + + } // subset + + if (diff) + { + int dr = block_colors[1].r - block_colors[0].r; + int dg = block_colors[1].g - block_colors[0].g; + int db = block_colors[1].b - block_colors[0].b; + + dr = basisu::clamp<int>(dr, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + dg = basisu::clamp<int>(dg, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + db = basisu::clamp<int>(db, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + + if (dr < 0) dr += 8; + if (dg < 0) dg += 8; + if (db < 0) db += 8; + + dst_blk.m_bytes[0] = (uint8_t)((block_colors[0].r << 3) | dr); + dst_blk.m_bytes[1] = (uint8_t)((block_colors[0].g << 3) | dg); + dst_blk.m_bytes[2] = (uint8_t)((block_colors[0].b << 3) | db); + } + else + { + dst_blk.m_bytes[0] = (uint8_t)(block_colors[1].r | (block_colors[0].r << 4)); + dst_blk.m_bytes[1] = (uint8_t)(block_colors[1].g | (block_colors[0].g << 4)); + dst_blk.m_bytes[2] = (uint8_t)(block_colors[1].b | (block_colors[0].b << 4)); + } + + etc1_determine_selectors(dst_blk, &block_pixels[0][0], 0, 2); + } + + bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + color32 block_pixels[4][4]; + if (unpacked_src_blk.m_mode != UASTC_MODE_INDEX_SOLID_COLOR) + { + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + } + + transcode_uastc_to_etc1(unpacked_src_blk, block_pixels, pDst); + + return true; + } + + static inline int gray_distance2(const uint8_t c, int y) + { + int gray_dist = (int)c - y; + return gray_dist * gray_dist; + } + + static bool pack_etc1_y_estimate_flipped(const uint8_t* pSrc_pixels, + int& upper_avg, int& lower_avg, int& left_avg, int& right_avg) + { + int sums[2][2]; + +#define GET_XY(x, y) pSrc_pixels[(x) + ((y) * 4)] + + sums[0][0] = GET_XY(0, 0) + GET_XY(0, 1) + GET_XY(1, 0) + GET_XY(1, 1); + sums[1][0] = GET_XY(2, 0) + GET_XY(2, 1) + GET_XY(3, 0) + GET_XY(3, 1); + sums[0][1] = GET_XY(0, 2) + GET_XY(0, 3) + GET_XY(1, 2) + GET_XY(1, 3); + sums[1][1] = GET_XY(2, 2) + GET_XY(2, 3) + GET_XY(3, 2) + GET_XY(3, 3); + + upper_avg = (sums[0][0] + sums[1][0] + 4) / 8; + lower_avg = (sums[0][1] + sums[1][1] + 4) / 8; + left_avg = (sums[0][0] + sums[0][1] + 4) / 8; + right_avg = (sums[1][0] + sums[1][1] + 4) / 8; + +#undef GET_XY +#define GET_XY(x, y, a) gray_distance2(pSrc_pixels[(x) + ((y) * 4)], a) + + int 
upper_gray_dist = 0, lower_gray_dist = 0, left_gray_dist = 0, right_gray_dist = 0; + for (uint32_t i = 0; i < 4; i++) + { + for (uint32_t j = 0; j < 2; j++) + { + upper_gray_dist += GET_XY(i, j, upper_avg); + lower_gray_dist += GET_XY(i, 2 + j, lower_avg); + left_gray_dist += GET_XY(j, i, left_avg); + right_gray_dist += GET_XY(2 + j, i, right_avg); + } + } + +#undef GET_XY + + int upper_lower_sum = upper_gray_dist + lower_gray_dist; + int left_right_sum = left_gray_dist + right_gray_dist; + + return upper_lower_sum < left_right_sum; + } + + // Base Sel Table + // XXXXX XX XXX + static const uint16_t g_etc1_y_solid_block_configs[256] = + { + 0,781,64,161,260,192,33,131,96,320,65,162,261,193,34,291,97,224,66,163,262,194,35,549,98,4,67,653,164,195,523,36,99,5,578,68,165,353,196,37,135,100,324,69,166,354,197,38,295,101,228,70,167, + 355,198,39,553,102,8,71,608,168,199,527,40,103,9,582,72,169,357,200,41,139,104,328,73,170,358,201,42,299,105,232,74,171,359,202,43,557,106,12,75,612,172,203,531,44,107,13,586,76,173,361, + 204,45,143,108,332,77,174,362,205,46,303,109,236,78,175,363,206,47,561,110,16,79,616,176,207,535,48,111,17,590,80,177,365,208,49,147,112,336,81,178,366,209,50,307,113,240,82,179,367,210, + 51,565,114,20,83,620,180,211,539,52,115,21,594,84,181,369,212,53,151,116,340,85,182,370,213,54,311,117,244,86,183,371,214,55,569,118,24,87,624,184,215,543,56,119,25,598,88,185,373,216,57, + 155,120,344,89,186,374,217,58,315,121,248,90,187,375,218,59,573,122,28,91,628,188,219,754,60,123,29,602,92,189,377,220,61,159,124,348,93,190,378,221,62,319,125,252,94,191,379,222,63,882,126 + }; + + // individual + // table base sel0 sel1 sel2 sel3 + static const uint16_t g_etc1_y_solid_block_4i_configs[256] = + { + 0xA000,0xA800,0x540B,0xAA01,0xAA01,0xFE00,0xFF00,0xFF00,0x8,0x5515,0x5509,0x5509,0xAA03,0x5508,0x5508,0x9508,0xA508,0xA908,0xAA08,0x5513,0xAA09,0xAA09,0xAA05,0xFF08,0xFF08,0x10,0x551D,0x5511,0x5511, + 0xAA0B,0x5510,0x5510,0x9510,0xA510,0xA910,0xAA10,0x551B,0xAA11,0xAA11,0xAA0D,0xFF10,0xFF10,0x18,0x5525,0x5519,0x5519,0xAA13,0x5518,0x5518,0x9518,0xA518,0xA918,0xAA18,0x5523,0xAA19,0xAA19,0xAA15, + 0xFF18,0xFF18,0x20,0x552D,0x5521,0x5521,0xAA1B,0x5520,0x5520,0x9520,0xA520,0xA920,0xAA20,0x552B,0xAA21,0xAA21,0xAA1D,0xFF20,0xFF20,0x28,0x5535,0x5529,0x5529,0xAA23,0x5528,0x5528,0x9528,0xA528,0xA928, + 0xAA28,0x5533,0xAA29,0xAA29,0xAA25,0xFF28,0xFF28,0x30,0x553D,0x5531,0x5531,0xAA2B,0x5530,0x5530,0x9530,0xA530,0xA930,0xAA30,0x553B,0xAA31,0xAA31,0xAA2D,0xFF30,0xFF30,0x38,0x5545,0x5539,0x5539,0xAA33, + 0x5538,0x5538,0x9538,0xA538,0xA938,0xAA38,0x5543,0xAA39,0xAA39,0xAA35,0xFF38,0xFF38,0x40,0x554D,0x5541,0x5541,0xAA3B,0x5540,0x5540,0x9540,0xA540,0xA940,0xAA40,0x554B,0xAA41,0xAA41,0xAA3D,0xFF40,0xFF40, + 0x48,0x5555,0x5549,0x5549,0xAA43,0x5548,0x5548,0x9548,0xA548,0xA948,0xAA48,0x5553,0xAA49,0xAA49,0xAA45,0xFF48,0xFF48,0x50,0x555D,0x5551,0x5551,0xAA4B,0x5550,0x5550,0x9550,0xA550,0xA950,0xAA50,0x555B, + 0xAA51,0xAA51,0xAA4D,0xFF50,0xFF50,0x58,0x5565,0x5559,0x5559,0xAA53,0x5558,0x5558,0x9558,0xA558,0xA958,0xAA58,0x5563,0xAA59,0xAA59,0xAA55,0xFF58,0xFF58,0x60,0x556D,0x5561,0x5561,0xAA5B,0x5560,0x5560, + 0x9560,0xA560,0xA960,0xAA60,0x556B,0xAA61,0xAA61,0xAA5D,0xFF60,0xFF60,0x68,0x5575,0x5569,0x5569,0xAA63,0x5568,0x5568,0x9568,0xA568,0xA968,0xAA68,0x5573,0xAA69,0xAA69,0xAA65,0xFF68,0xFF68,0x70,0x557D, + 0x5571,0x5571,0xAA6B,0x5570,0x5570,0x9570,0xA570,0xA970,0xAA70,0x557B,0xAA71,0xAA71,0xAA6D,0xFF70,0xFF70,0x78,0x78,0x5579,0x5579,0xAA73,0x5578,0x9578,0x2578,0xE6E,0x278 + }; + + static const uint16_t 
g_etc1_y_solid_block_2i_configs[256] = + { + 0x416,0x800,0xA00,0x50B,0xA01,0xA01,0xF00,0xF00,0xF00,0x8,0x515,0x509,0x509,0xA03,0x508,0x508,0xF01,0xF01,0xA08,0xA08,0x513,0xA09,0xA09,0xA05,0xF08,0xF08,0x10,0x51D,0x511,0x511,0xA0B,0x510,0x510,0xF09, + 0xF09,0xA10,0xA10,0x51B,0xA11,0xA11,0xA0D,0xF10,0xF10,0x18,0x525,0x519,0x519,0xA13,0x518,0x518,0xF11,0xF11,0xA18,0xA18,0x523,0xA19,0xA19,0xA15,0xF18,0xF18,0x20,0x52D,0x521,0x521,0xA1B,0x520,0x520,0xF19, + 0xF19,0xA20,0xA20,0x52B,0xA21,0xA21,0xA1D,0xF20,0xF20,0x28,0x535,0x529,0x529,0xA23,0x528,0x528,0xF21,0xF21,0xA28,0xA28,0x533,0xA29,0xA29,0xA25,0xF28,0xF28,0x30,0x53D,0x531,0x531,0xA2B,0x530,0x530,0xF29, + 0xF29,0xA30,0xA30,0x53B,0xA31,0xA31,0xA2D,0xF30,0xF30,0x38,0x545,0x539,0x539,0xA33,0x538,0x538,0xF31,0xF31,0xA38,0xA38,0x543,0xA39,0xA39,0xA35,0xF38,0xF38,0x40,0x54D,0x541,0x541,0xA3B,0x540,0x540,0xF39, + 0xF39,0xA40,0xA40,0x54B,0xA41,0xA41,0xA3D,0xF40,0xF40,0x48,0x555,0x549,0x549,0xA43,0x548,0x548,0xF41,0xF41,0xA48,0xA48,0x553,0xA49,0xA49,0xA45,0xF48,0xF48,0x50,0x55D,0x551,0x551,0xA4B,0x550,0x550,0xF49, + 0xF49,0xA50,0xA50,0x55B,0xA51,0xA51,0xA4D,0xF50,0xF50,0x58,0x565,0x559,0x559,0xA53,0x558,0x558,0xF51,0xF51,0xA58,0xA58,0x563,0xA59,0xA59,0xA55,0xF58,0xF58,0x60,0x56D,0x561,0x561,0xA5B,0x560,0x560,0xF59, + 0xF59,0xA60,0xA60,0x56B,0xA61,0xA61,0xA5D,0xF60,0xF60,0x68,0x575,0x569,0x569,0xA63,0x568,0x568,0xF61,0xF61,0xA68,0xA68,0x573,0xA69,0xA69,0xA65,0xF68,0xF68,0x70,0x57D,0x571,0x571,0xA6B,0x570,0x570,0xF69, + 0xF69,0xA70,0xA70,0x57B,0xA71,0xA71,0xA6D,0xF70,0xF70,0x78,0x78,0x579,0x579,0xA73,0x578,0x578,0xE6E,0x278 + }; + + static const uint16_t g_etc1_y_solid_block_1i_configs[256] = + { + 0x0,0x116,0x200,0x200,0x10B,0x201,0x201,0x300,0x300,0x8,0x115,0x109,0x109,0x203,0x108,0x108,0x114,0x301,0x204,0x208,0x208,0x113,0x209,0x209,0x205,0x308,0x10,0x11D,0x111,0x111,0x20B,0x110,0x110,0x11C,0x309, + 0x20C,0x210,0x210,0x11B,0x211,0x211,0x20D,0x310,0x18,0x125,0x119,0x119,0x213,0x118,0x118,0x124,0x311,0x214,0x218,0x218,0x123,0x219,0x219,0x215,0x318,0x20,0x12D,0x121,0x121,0x21B,0x120,0x120,0x12C,0x319,0x21C, + 0x220,0x220,0x12B,0x221,0x221,0x21D,0x320,0x28,0x135,0x129,0x129,0x223,0x128,0x128,0x134,0x321,0x224,0x228,0x228,0x133,0x229,0x229,0x225,0x328,0x30,0x13D,0x131,0x131,0x22B,0x130,0x130,0x13C,0x329,0x22C,0x230, + 0x230,0x13B,0x231,0x231,0x22D,0x330,0x38,0x145,0x139,0x139,0x233,0x138,0x138,0x144,0x331,0x234,0x238,0x238,0x143,0x239,0x239,0x235,0x338,0x40,0x14D,0x141,0x141,0x23B,0x140,0x140,0x14C,0x339,0x23C,0x240,0x240, + 0x14B,0x241,0x241,0x23D,0x340,0x48,0x155,0x149,0x149,0x243,0x148,0x148,0x154,0x341,0x244,0x248,0x248,0x153,0x249,0x249,0x245,0x348,0x50,0x15D,0x151,0x151,0x24B,0x150,0x150,0x15C,0x349,0x24C,0x250,0x250,0x15B, + 0x251,0x251,0x24D,0x350,0x58,0x165,0x159,0x159,0x253,0x158,0x158,0x164,0x351,0x254,0x258,0x258,0x163,0x259,0x259,0x255,0x358,0x60,0x16D,0x161,0x161,0x25B,0x160,0x160,0x16C,0x359,0x25C,0x260,0x260,0x16B,0x261, + 0x261,0x25D,0x360,0x68,0x175,0x169,0x169,0x263,0x168,0x168,0x174,0x361,0x264,0x268,0x268,0x173,0x269,0x269,0x265,0x368,0x70,0x17D,0x171,0x171,0x26B,0x170,0x170,0x17C,0x369,0x26C,0x270,0x270,0x17B,0x271,0x271, + 0x26D,0x370,0x78,0x78,0x179,0x179,0x273,0x178,0x178,0x26E,0x278 + }; + + // We don't have any useful hints to accelerate single channel ETC1, so we need to real-time encode from scratch. 
+ bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst, uint32_t channel) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + +#if 0 + for (uint32_t individ = 0; individ < 2; individ++) + { + uint32_t overall_error = 0; + + for (uint32_t c = 0; c < 256; c++) + { + uint32_t best_err = UINT32_MAX; + uint32_t best_individ = 0; + uint32_t best_base = 0; + uint32_t best_sels[4] = { 0,0,0,0 }; + uint32_t best_table = 0; + + const uint32_t limit = individ ? 16 : 32; + + for (uint32_t table = 0; table < 8; table++) + { + for (uint32_t base = 0; base < limit; base++) + { + uint32_t total_e = 0; + uint32_t sels[4] = { 0,0,0,0 }; + + const uint32_t N = 4; + for (uint32_t i = 0; i < basisu::minimum<uint32_t>(N, (256 - c)); i++) + { + uint32_t best_sel_e = UINT32_MAX; + uint32_t best_sel = 0; + + for (uint32_t sel = 0; sel < 4; sel++) + { + int val = individ ? ((base << 4) | base) : ((base << 3) | (base >> 2)); + val = clamp255(val + g_etc1_inten_tables[table][sel]); + + int e = iabs(val - clamp255(c + i)); + if (e < best_sel_e) + { + best_sel_e = e; + best_sel = sel; + } + + } // sel + + sels[i] = best_sel; + total_e += best_sel_e * best_sel_e; + + } // i + + if (total_e < best_err) + { + best_err = total_e; + best_individ = individ; + best_base = base; + memcpy(best_sels, sels, sizeof(best_sels)); + best_table = table; + } + + } // base + } // table + + //printf("%u: %u,%u,%u,%u,%u,%u,%u,%u\n", c, best_err, best_individ, best_table, best_base, best_sels[0], best_sels[1], best_sels[2], best_sels[3]); + + uint32_t encoded = best_table | (best_base << 3) | + (best_sels[0] << 8) | + (best_sels[1] << 10) | + (best_sels[2] << 12) | + (best_sels[3] << 14); + + printf("0x%X,", encoded); + + overall_error += best_err; + } // c + + printf("\n"); + printf("Overall error: %u\n", overall_error); + + } // individ + + exit(0); +#endif + +#if 0 + for (uint32_t individ = 0; individ < 2; individ++) + { + uint32_t overall_error = 0; + + for (uint32_t c = 0; c < 256; c++) + { + uint32_t best_err = UINT32_MAX; + uint32_t best_individ = 0; + uint32_t best_base = 0; + uint32_t best_sels[4] = { 0,0,0,0 }; + uint32_t best_table = 0; + + const uint32_t limit = individ ? 16 : 32; + + for (uint32_t table = 0; table < 8; table++) + { + for (uint32_t base = 0; base < limit; base++) + { + uint32_t total_e = 0; + uint32_t sels[4] = { 0,0,0,0 }; + + const uint32_t N = 1; + for (uint32_t i = 0; i < basisu::minimum<uint32_t>(N, (256 - c)); i++) + { + uint32_t best_sel_e = UINT32_MAX; + uint32_t best_sel = 0; + + for (uint32_t sel = 0; sel < 4; sel++) + { + int val = individ ? 
((base << 4) | base) : ((base << 3) | (base >> 2)); + val = clamp255(val + g_etc1_inten_tables[table][sel]); + + int e = iabs(val - clamp255(c + i)); + if (e < best_sel_e) + { + best_sel_e = e; + best_sel = sel; + } + + } // sel + + sels[i] = best_sel; + total_e += best_sel_e * best_sel_e; + + } // i + + if (total_e < best_err) + { + best_err = total_e; + best_individ = individ; + best_base = base; + memcpy(best_sels, sels, sizeof(best_sels)); + best_table = table; + } + + } // base + } // table + + //printf("%u: %u,%u,%u,%u,%u,%u,%u,%u\n", c, best_err, best_individ, best_table, best_base, best_sels[0], best_sels[1], best_sels[2], best_sels[3]); + + uint32_t encoded = best_table | (best_base << 3) | + (best_sels[0] << 8) | + (best_sels[1] << 10) | + (best_sels[2] << 12) | + (best_sels[3] << 14); + + printf("0x%X,", encoded); + + overall_error += best_err; + } // c + + printf("\n"); + printf("Overall error: %u\n", overall_error); + + } // individ + + exit(0); +#endif + + decoder_etc_block& dst_blk = *static_cast<decoder_etc_block*>(pDst); + + if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + const uint32_t y = unpacked_src_blk.m_solid_color[channel]; + const uint32_t encoded_config = g_etc1_y_solid_block_configs[y]; + + const uint32_t base = encoded_config & 31; + const uint32_t sel = (encoded_config >> 5) & 3; + const uint32_t table = encoded_config >> 7; + + dst_blk.m_bytes[3] = (uint8_t)(2 | (table << 5) | (table << 2)); + + dst_blk.m_bytes[0] = (uint8_t)(base << 3); + dst_blk.m_bytes[1] = (uint8_t)(base << 3); + dst_blk.m_bytes[2] = (uint8_t)(base << 3); + + memcpy(dst_blk.m_bytes + 4, &s_etc1_solid_selectors[sel][0], 4); + return true; + } + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + uint8_t block_y[4][4]; + for (uint32_t i = 0; i < 16; i++) + ((uint8_t*)block_y)[i] = ((color32*)block_pixels)[i][channel]; + + int upper_avg, lower_avg, left_avg, right_avg; + bool flip = pack_etc1_y_estimate_flipped(&block_y[0][0], upper_avg, lower_avg, left_avg, right_avg); + + // non-flipped: | | + // vs. + // flipped: -- + // -- + + uint32_t low[2] = { 255, 255 }, high[2] = { 0, 0 }; + + if (flip) + { + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t v = block_y[y][x]; + low[0] = basisu::minimum(low[0], v); + high[0] = basisu::maximum(high[0], v); + } + } + for (uint32_t y = 2; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t v = block_y[y][x]; + low[1] = basisu::minimum(low[1], v); + high[1] = basisu::maximum(high[1], v); + } + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const uint32_t v = block_y[y][x]; + low[0] = basisu::minimum(low[0], v); + high[0] = basisu::maximum(high[0], v); + } + } + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 2; x < 4; x++) + { + const uint32_t v = block_y[y][x]; + low[1] = basisu::minimum(low[1], v); + high[1] = basisu::maximum(high[1], v); + } + } + } + + const uint32_t range[2] = { high[0] - low[0], high[1] - low[1] }; + + dst_blk.m_bytes[3] = (uint8_t)((int)flip); + + if ((range[0] <= 3) && (range[1] <= 3)) + { + // This is primarily for better gradients. + dst_blk.m_bytes[0] = 0; + dst_blk.m_bytes[1] = 0; + dst_blk.m_bytes[2] = 0; + + uint16_t l_bitmask = 0, h_bitmask = 0; + + for (uint32_t subblock = 0; subblock < 2; subblock++) + { + const uint32_t encoded = (range[subblock] == 0) ? 
g_etc1_y_solid_block_1i_configs[low[subblock]] : ((range[subblock] < 2) ? g_etc1_y_solid_block_2i_configs[low[subblock]] : g_etc1_y_solid_block_4i_configs[low[subblock]]); + + const uint32_t table = encoded & 7; + const uint32_t base = (encoded >> 3) & 31; + assert(base <= 15); + const uint32_t sels[4] = { (encoded >> 8) & 3, (encoded >> 10) & 3, (encoded >> 12) & 3, (encoded >> 14) & 3 }; + + dst_blk.m_bytes[3] |= (uint8_t)(table << (subblock ? 2 : 5)); + + const uint32_t sv = base << (subblock ? 0 : 4); + dst_blk.m_bytes[0] |= (uint8_t)(sv); + dst_blk.m_bytes[1] |= (uint8_t)(sv); + dst_blk.m_bytes[2] |= (uint8_t)(sv); + + if (flip) + { + uint32_t ofs = subblock * 2; + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t t = block_y[y + subblock * 2][x]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + + ofs = (int)ofs + 1 - 4 * 4; + } + } + else + { + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) + { + for (uint32_t y = 0; y < 4; y++) + { + uint32_t t = block_y[y][x + subblock * 2]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } + } + } + } // subblock + + dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); + dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); + dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); + dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + + return true; + } + + uint32_t y0 = ((flip ? upper_avg : left_avg) * 31 + 127) / 255; + uint32_t y1 = ((flip ? lower_avg : right_avg) * 31 + 127) / 255; + + bool diff = true; + + int dy = y1 - y0; + + if ((dy < cETC1ColorDeltaMin) || (dy > cETC1ColorDeltaMax)) + { + diff = false; + + y0 = ((flip ? upper_avg : left_avg) * 15 + 127) / 255; + y1 = ((flip ? lower_avg : right_avg) * 15 + 127) / 255; + + dst_blk.m_bytes[0] = (uint8_t)(y1 | (y0 << 4)); + dst_blk.m_bytes[1] = (uint8_t)(y1 | (y0 << 4)); + dst_blk.m_bytes[2] = (uint8_t)(y1 | (y0 << 4)); + } + else + { + dy = basisu::clamp<int>(dy, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + + y1 = y0 + dy; + + if (dy < 0) dy += 8; + + dst_blk.m_bytes[0] = (uint8_t)((y0 << 3) | dy); + dst_blk.m_bytes[1] = (uint8_t)((y0 << 3) | dy); + dst_blk.m_bytes[2] = (uint8_t)((y0 << 3) | dy); + + dst_blk.m_bytes[3] |= 2; + } + + const uint32_t base_y[2] = { diff ? ((y0 << 3) | (y0 >> 2)) : ((y0 << 4) | y0), diff ? ((y1 << 3) | (y1 >> 2)) : ((y1 << 4) | y1) }; + + uint32_t enc_range[2]; + for (uint32_t subset = 0; subset < 2; subset++) + { + const int pos = basisu::iabs((int)high[subset] - (int)base_y[subset]); + const int neg = basisu::iabs((int)base_y[subset] - (int)low[subset]); + + enc_range[subset] = basisu::maximum(pos, neg); + } + + uint16_t l_bitmask = 0, h_bitmask = 0; + for (uint32_t subblock = 0; subblock < 2; subblock++) + { + if ((!diff) && (range[subblock] <= 3)) + { + const uint32_t encoded = (range[subblock] == 0) ? g_etc1_y_solid_block_1i_configs[low[subblock]] : ((range[subblock] < 2) ? 
g_etc1_y_solid_block_2i_configs[low[subblock]] : g_etc1_y_solid_block_4i_configs[low[subblock]]); + + const uint32_t table = encoded & 7; + const uint32_t base = (encoded >> 3) & 31; + assert(base <= 15); + const uint32_t sels[4] = { (encoded >> 8) & 3, (encoded >> 10) & 3, (encoded >> 12) & 3, (encoded >> 14) & 3 }; + + dst_blk.m_bytes[3] |= (uint8_t)(table << (subblock ? 2 : 5)); + + const uint32_t mask = ~(0xF << (subblock ? 0 : 4)); + + dst_blk.m_bytes[0] &= mask; + dst_blk.m_bytes[1] &= mask; + dst_blk.m_bytes[2] &= mask; + + const uint32_t sv = base << (subblock ? 0 : 4); + dst_blk.m_bytes[0] |= (uint8_t)(sv); + dst_blk.m_bytes[1] |= (uint8_t)(sv); + dst_blk.m_bytes[2] |= (uint8_t)(sv); + + if (flip) + { + uint32_t ofs = subblock * 2; + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t t = block_y[y + subblock * 2][x]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + + ofs = (int)ofs + 1 - 4 * 4; + } + } + else + { + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) + { + for (uint32_t y = 0; y < 4; y++) + { + uint32_t t = block_y[y][x + subblock * 2]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } + } + } + + continue; + } // if + + uint32_t best_err = UINT32_MAX; + uint8_t best_sels[8]; + uint32_t best_inten = 0; + + const int base = base_y[subblock]; + + const int low_limit = -base; + const int high_limit = 255 - base; + + assert(low_limit <= 0 && high_limit >= 0); + + uint32_t inten_table_mask = 0xFF; + const uint32_t er = enc_range[subblock]; + // Each one of these tables is expensive to evaluate, so let's only examine the ones we know may be useful. + if (er <= 51) + { + inten_table_mask = 0xF; + + if (er > 22) + inten_table_mask &= ~(1 << 0); + + if ((er < 4) || (er > 39)) + inten_table_mask &= ~(1 << 1); + + if (er < 9) + inten_table_mask &= ~(1 << 2); + + if (er < 12) + inten_table_mask &= ~(1 << 3); + } else - status = true; + { + inten_table_mask &= ~((1 << 0) | (1 << 1)); - if (status) + if (er > 60) + inten_table_mask &= ~(1 << 2); + + if (er > 89) + inten_table_mask &= ~(1 << 3); + + if (er > 120) + inten_table_mask &= ~(1 << 4); + + if (er > 136) + inten_table_mask &= ~(1 << 5); + + if (er > 174) + inten_table_mask &= ~(1 << 6); + } + + for (uint32_t inten = 0; inten < 8; inten++) { - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, basis_file_has_alpha_slices ? 
block_format::cRGBA4444_COLOR : block_format::cRGBA4444_COLOR_OPAQUE, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - if (!status) + if ((inten_table_mask & (1 << inten)) == 0) + continue; + + const int t0 = basisu::maximum(low_limit, g_etc1_inten_tables[inten][0]); + const int t1 = basisu::maximum(low_limit, g_etc1_inten_tables[inten][1]); + const int t2 = basisu::minimum(high_limit, g_etc1_inten_tables[inten][2]); + const int t3 = basisu::minimum(high_limit, g_etc1_inten_tables[inten][3]); + assert((t0 <= t1) && (t1 <= t2) && (t2 <= t3)); + + const int tv[4] = { t2, t3, t1, t0 }; + + const int thresh01 = t0 + t1; + const int thresh12 = t1 + t2; + const int thresh23 = t2 + t3; + + assert(thresh01 <= thresh12 && thresh12 <= thresh23); + + static const uint8_t s_table[4] = { 1, 0, 2, 3 }; + + uint32_t total_err = 0; + uint8_t sels[8]; + + if (flip) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to RGBA4444 RGB failed\n"); + if (((int)high[subblock] - base) * 2 < thresh01) + { + memset(sels, 3, 8); + + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int delta = (int)block_y[y + subblock * 2][x] - base; + + const uint32_t c = 3; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else if (((int)low[subblock] - base) * 2 >= thresh23) + { + memset(sels, 1, 8); + + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int delta = (int)block_y[y + subblock * 2][x] - base; + + const uint32_t c = 1; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else + { + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int delta = (int)block_y[y + subblock * 2][x] - base; + const int delta2 = delta * 2; + + uint32_t c = s_table[(delta2 < thresh01) + (delta2 < thresh12) + (delta2 < thresh23)]; + sels[y * 4 + x] = (uint8_t)c; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + } + else + { + if (((int)high[subblock] - base) * 2 < thresh01) + { + memset(sels, 3, 8); + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const int delta = (int)block_y[y][x + subblock * 2] - base; + + const uint32_t c = 3; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else if (((int)low[subblock] - base) * 2 >= thresh23) + { + memset(sels, 1, 8); + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const int delta = (int)block_y[y][x + subblock * 2] - base; + + const uint32_t c = 1; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const int delta = (int)block_y[y][x + subblock * 2] - base; + const int delta2 = delta * 2; + + uint32_t c = s_table[(delta2 < thresh01) + (delta2 < thresh12) + (delta2 < thresh23)]; + sels[y * 2 + x] = (uint8_t)c; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + } + + if (total_err < best_err) + { + best_err = total_err; + best_inten = inten; + memcpy(best_sels, sels, 8); + } + + } // inten + + //g_inten_hist[best_inten][enc_range[subblock]]++; + + 
dst_blk.m_bytes[3] |= (uint8_t)(best_inten << (subblock ? 2 : 5)); + + if (flip) + { + uint32_t ofs = subblock * 2; + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t t = best_sels[y * 4 + x]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + + ofs = (int)ofs + 1 - 4 * 4; } } else { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to RGBA4444 A failed\n"); + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) + { + for (uint32_t y = 0; y < 4; y++) + { + uint32_t t = best_sels[y * 2 + x]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } + } } - break; + } // subblock + + dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); + dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); + dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); + dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + + return true; + } + + const uint32_t ETC2_EAC_MIN_VALUE_SELECTOR = 3, ETC2_EAC_MAX_VALUE_SELECTOR = 7; + + void transcode_uastc_to_etc2_eac_a8(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst) + { + eac_block& dst = *static_cast<eac_block*>(pDst); + const color32* pSrc_pixels = &block_pixels[0][0]; + + if ((!g_uastc_mode_has_alpha[unpacked_src_blk.m_mode]) || (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR)) + { + const uint32_t a = (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) ? unpacked_src_blk.m_solid_color[3] : 255; + + dst.m_base = a; + dst.m_table = 13; + dst.m_multiplier = 1; + + memcpy(dst.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); + + return; } - case transcoder_texture_format::cTFFXT1_RGB: + + uint32_t min_a = 255, max_a = 0; + for (uint32_t i = 0; i < 16; i++) { -#if !BASISD_SUPPORT_FXT1 + min_a = basisu::minimum<uint32_t>(min_a, pSrc_pixels[i].a); + max_a = basisu::maximum<uint32_t>(max_a, pSrc_pixels[i].a); + } + + if (min_a == max_a) + { + dst.m_base = min_a; + dst.m_table = 13; + dst.m_multiplier = 1; + + memcpy(dst.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); + return; + } + + const uint32_t table = unpacked_src_blk.m_etc2_hints & 0xF; + const int multiplier = unpacked_src_blk.m_etc2_hints >> 4; + + assert(multiplier >= 1); + + dst.m_multiplier = multiplier; + dst.m_table = table; + + const float range = (float)(g_eac_modifier_table[dst.m_table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[dst.m_table][ETC2_EAC_MIN_VALUE_SELECTOR]); + const int center = (int)roundf(basisu::lerp((float)min_a, (float)max_a, (float)(0 - g_eac_modifier_table[dst.m_table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range)); + + dst.m_base = center; + + const int8_t* pTable = &g_eac_modifier_table[dst.m_table][0]; + + uint32_t vals[8]; + for (uint32_t j = 0; j < 8; j++) + vals[j] = clamp255(center + (pTable[j] * multiplier)); + + uint64_t sels = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t a = block_pixels[i & 3][i >> 2].a; + + const uint32_t err0 = (basisu::iabs(vals[0] - a) << 3) | 0; + const uint32_t err1 = (basisu::iabs(vals[1] - a) << 3) | 1; + const uint32_t err2 = (basisu::iabs(vals[2] - a) << 3) | 2; + const uint32_t err3 = (basisu::iabs(vals[3] - a) << 3) | 3; + const uint32_t err4 = (basisu::iabs(vals[4] - a) << 3) | 4; + const uint32_t err5 = (basisu::iabs(vals[5] - a) << 3) | 5; + const uint32_t err6 = (basisu::iabs(vals[6] - a) << 3) | 6; + const uint32_t err7 = (basisu::iabs(vals[7] - a) << 3) | 7; + + const uint32_t min_err = 
basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(err0, err1, err2), err3), err4), err5), err6), err7); + + const uint64_t best_index = min_err & 7; + sels |= (best_index << (45 - i * 3)); + } + + dst.set_selector_bits(sels); + } + + bool transcode_uastc_to_etc2_rgba(const uastc_block& src_blk, void* pDst) + { + eac_block& dst_etc2_eac_a8_blk = *static_cast<eac_block*>(pDst); + decoder_etc_block& dst_etc1_blk = static_cast<decoder_etc_block*>(pDst)[1]; + + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) return false; -#endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cFXT1_RGB, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + color32 block_pixels[4][4]; + if (unpacked_src_blk.m_mode != UASTC_MODE_INDEX_SOLID_COLOR) + { + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + } + + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &dst_etc2_eac_a8_blk); + + transcode_uastc_to_etc1(unpacked_src_blk, block_pixels, &dst_etc1_blk); + + return true; + } + + static const uint8_t s_uastc5_to_bc1[32] = { 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1 }; + static const uint8_t s_uastc4_to_bc1[16] = { 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1 }; + static const uint8_t s_uastc3_to_bc1[8] = { 0, 0, 2, 2, 3, 3, 1, 1 }; + static const uint8_t s_uastc2_to_bc1[4] = { 0, 2, 3, 1 }; + static const uint8_t s_uastc1_to_bc1[2] = { 0, 1 }; + const uint8_t* s_uastc_to_bc1_weights[6] = { nullptr, s_uastc1_to_bc1, s_uastc2_to_bc1, s_uastc3_to_bc1, s_uastc4_to_bc1, s_uastc5_to_bc1 }; + + void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride) + { + uint32_t min0_v, max0_v, min1_v, max1_v,min2_v, max2_v, min3_v, max3_v; + + { + min0_v = max0_v = pPixels[0 * stride]; + min1_v = max1_v = pPixels[1 * stride]; + min2_v = max2_v = pPixels[2 * stride]; + min3_v = max3_v = pPixels[3 * stride]; + } + + { + uint32_t v0 = pPixels[4 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); + uint32_t v1 = pPixels[5 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); + uint32_t v2 = pPixels[6 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); + uint32_t v3 = pPixels[7 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + } + + { + uint32_t v0 = pPixels[8 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); + uint32_t v1 = pPixels[9 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); + uint32_t v2 = pPixels[10 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); + uint32_t v3 = pPixels[11 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + } + + { + uint32_t v0 = pPixels[12 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); + uint32_t v1 = pPixels[13 * stride]; min1_v = basisu::minimum(min1_v, v1); 
max1_v = basisu::maximum(max1_v, v1); + uint32_t v2 = pPixels[14 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); + uint32_t v3 = pPixels[15 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + } + + const uint32_t min_v = basisu::minimum(min0_v, min1_v, min2_v, min3_v); + const uint32_t max_v = basisu::maximum(max0_v, max1_v, max2_v, max3_v); + + uint8_t* pDst_bytes = static_cast<uint8_t*>(pDst); + pDst_bytes[0] = (uint8_t)max_v; + pDst_bytes[1] = (uint8_t)min_v; + + if (max_v == min_v) + { + memset(pDst_bytes + 2, 0, 6); + return; + } + + const uint32_t delta = max_v - min_v; + + // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors. + const int t0 = delta * 13; + const int t1 = delta * 11; + const int t2 = delta * 9; + const int t3 = delta * 7; + const int t4 = delta * 5; + const int t5 = delta * 3; + const int t6 = delta * 1; + + // BC4 floors in its divisions, which we compensate for with the 4 bias. + // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one). + const int bias = 4 - min_v * 14; + + static const uint32_t s_tran0[8] = { 1U , 7U , 6U , 5U , 4U , 3U , 2U , 0U }; + static const uint32_t s_tran1[8] = { 1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U }; + static const uint32_t s_tran2[8] = { 1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U }; + static const uint32_t s_tran3[8] = { 1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U }; + + uint64_t a0, a1, a2, a3; + { + const int v0 = pPixels[0 * stride] * 14 + bias; + const int v1 = pPixels[1 * stride] * 14 + bias; + const int v2 = pPixels[2 * stride] * 14 + bias; + const int v3 = pPixels[3 * stride] * 14 + bias; + a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]; + a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]; + a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]; + a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]; + } + + { + const int v0 = pPixels[4 * stride] * 14 + bias; + const int v1 = pPixels[5 * stride] * 14 + bias; + const int v2 = pPixels[6 * stride] * 14 + bias; + const int v3 = pPixels[7 * stride] * 14 + bias; + a0 |= (s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U); + a1 |= (s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U); + a2 |= (s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U); + a3 |= (s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U); + } + + { + const int v0 = pPixels[8 * stride] * 14 + bias; + const int v1 = pPixels[9 * stride] * 14 + bias; + const int v2 = pPixels[10 * stride] * 14 + bias; + const int v3 = pPixels[11 * stride] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 
>= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U); + } + + { + const int v0 = pPixels[12 * stride] * 14 + bias; + const int v1 = pPixels[13 * stride] * 14 + bias; + const int v2 = pPixels[14 * stride] * 14 + bias; + const int v3 = pPixels[15 * stride] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U); + } + + const uint64_t f = a0 | a1 | a2 | a3; + + pDst_bytes[2] = (uint8_t)f; + pDst_bytes[3] = (uint8_t)(f >> 8U); + pDst_bytes[4] = (uint8_t)(f >> 16U); + pDst_bytes[5] = (uint8_t)(f >> 24U); + pDst_bytes[6] = (uint8_t)(f >> 32U); + pDst_bytes[7] = (uint8_t)(f >> 40U); + } + + static void bc1_find_sels(const color32 *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) + { + uint32_t block_r[4], block_g[4], block_b[4]; + + block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); + block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); + block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; + block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; + + int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; + + int dots[4]; + for (uint32_t i = 0; i < 4; i++) + dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; + + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; + + ar *= 2; ag *= 2; ab *= 2; + + for (uint32_t i = 0; i < 16; i++) + { + const int d = pSrc_pixels[i].r * ar + pSrc_pixels[i].g * ag + pSrc_pixels[i].b * ab; + static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; + + // Rounding matters here! + // d <= t0: <=, not <, to the later LS step "sees" a wider range of selectors. It matters for quality. 
+ sels[i] = s_sels[(d <= t0) + (d < t1) + (d < t2)]; + } + } + + static inline void bc1_find_sels_2(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) + { + uint32_t block_r[4], block_g[4], block_b[4]; + + block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); + block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); + block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; + block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; + + int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; + + int dots[4]; + for (uint32_t i = 0; i < 4; i++) + dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; + + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; + + ar *= 2; ag *= 2; ab *= 2; + + static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; + + for (uint32_t i = 0; i < 16; i += 4) + { + const int d0 = pSrc_pixels[i+0].r * ar + pSrc_pixels[i+0].g * ag + pSrc_pixels[i+0].b * ab; + const int d1 = pSrc_pixels[i+1].r * ar + pSrc_pixels[i+1].g * ag + pSrc_pixels[i+1].b * ab; + const int d2 = pSrc_pixels[i+2].r * ar + pSrc_pixels[i+2].g * ag + pSrc_pixels[i+2].b * ab; + const int d3 = pSrc_pixels[i+3].r * ar + pSrc_pixels[i+3].g * ag + pSrc_pixels[i+3].b * ab; + + sels[i+0] = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; + sels[i+1] = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; + sels[i+2] = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; + sels[i+3] = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; + } + } + + struct vec3F { float c[3]; }; + + static bool compute_least_squares_endpoints_rgb(const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh) + { + // Derived from bc7enc16's LS function. + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. + uint32_t uq00_r = 0, uq10_r = 0, ut_r = 0, uq00_g = 0, uq10_g = 0, ut_g = 0, uq00_b = 0, uq10_b = 0, ut_b = 0; + + // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w)) + // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier. 
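// Aside (minimal illustrative sketch, not part of the upstream patch): the packed
// table declared just below can be reproduced from the selector weights w = s/3,
// s in {0,1,2,3}. Scaling by 9 makes every product an exact small integer, which
// is why the comment above calls 9 the "perfect multiplier":
static unsigned pack_ls_weight(unsigned s) // w = s / 3
{
    // 9*w*w == s*s, 9*(1-w)*w == (3-s)*s, 9*(1-w)*(1-w) == (3-s)*(3-s)
    return ((s * s) << 16) | (((3 - s) * s) << 8) | ((3 - s) * (3 - s));
}
// pack_ls_weight(0) == 0x000009, pack_ls_weight(1) == 0x010204,
// pack_ls_weight(2) == 0x040201, pack_ls_weight(3) == 0x090000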
+ static const uint32_t s_weight_vals[4] = { 0x000009, 0x010204, 0x040201, 0x090000 }; + + uint32_t weight_accum = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; + const uint32_t sel = pSelectors[i]; + ut_r += r; + ut_g += g; + ut_b += b; + weight_accum += s_weight_vals[sel]; + uq00_r += sel * r; + uq00_g += sel * g; + uq00_b += sel * b; + } + + float q00_r = (float)uq00_r, q10_r = (float)uq10_r, t_r = (float)ut_r; + float q00_g = (float)uq00_g, q10_g = (float)uq10_g, t_g = (float)ut_g; + float q00_b = (float)uq00_b, q10_b = (float)uq10_b, t_b = (float)ut_b; + + q10_r = t_r * 3.0f - q00_r; + q10_g = t_g * 3.0f - q00_g; + q10_b = t_b * 3.0f - q00_b; + + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + return false; + + det = 3.0f / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + pXl->c[0] = iz00 * q00_r + iz01 * q10_r; pXh->c[0] = iz10 * q00_r + iz11 * q10_r; + pXl->c[1] = iz00 * q00_g + iz01 * q10_g; pXh->c[1] = iz10 * q00_g + iz11 * q10_g; + pXl->c[2] = iz00 * q00_b + iz01 * q10_b; pXh->c[2] = iz10 * q00_b + iz11 * q10_b; + + // Check and fix channel singularities - might not be needed, but is in UASTC's encoder. + for (uint32_t c = 0; c < 3; c++) + { + if ((pXl->c[c] < 0.0f) || (pXh->c[c] > 255.0f)) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to FXT1_RGB failed\n"); + uint32_t lo_v = UINT32_MAX, hi_v = 0; + for (uint32_t i = 0; i < 16; i++) + { + lo_v = basisu::minimumu(lo_v, pColors[i].c[c]); + hi_v = basisu::maximumu(hi_v, pColors[i].c[c]); + } + + if (lo_v == hi_v) + { + pXl->c[c] = (float)lo_v; + pXh->c[c] = (float)hi_v; + } } - break; } - case transcoder_texture_format::cTFETC2_EAC_R11: + + return true; + } + + void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb) + { + dxt1_block* pDst_block = static_cast<dxt1_block*>(pDst); + + uint32_t mask = 0xAA; + uint32_t max16 = (g_bc1_match5_equals_1[fr].m_hi << 11) | (g_bc1_match6_equals_1[fg].m_hi << 5) | g_bc1_match5_equals_1[fb].m_hi; + uint32_t min16 = (g_bc1_match5_equals_1[fr].m_lo << 11) | (g_bc1_match6_equals_1[fg].m_lo << 5) | g_bc1_match5_equals_1[fb].m_lo; + + if (min16 == max16) { -#if !BASISD_SUPPORT_ETC2_EAC_RG11 - return false; -#endif - uint32_t slice_index_to_decode = slice_index; - // If the caller wants us to transcode the mip level's alpha data, then use the next slice. - if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats)) - slice_index_to_decode++; + // Always forbid 3 color blocks + // This is to guarantee that BC3 blocks never use punchthrough alpha (3 color) mode, which isn't supported on some (all?) GPU's. 
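// Aside (annotation, not part of the upstream patch): a BC1 decoder chooses its
// mode from the endpoint ordering -- color0 > color1 selects the 4-color mode,
// while color0 <= color1 selects the 3-color mode whose last index decodes to
// transparent black. Forcing the written endpoints apart below (and adjusting the
// selector mask so the decoded color is unchanged) is what keeps every emitted
// block in 4-color mode.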
+ mask = 0; - status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, bytes_per_block, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + // Make l > h + if (min16 > 0) + min16--; + else { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC2_EAC_R11 failed\n"); + // l = h = 0 + assert(min16 == max16 && max16 == 0); + + max16 = 1; + min16 = 0; + mask = 0x55; } - break; + assert(max16 > min16); } - case transcoder_texture_format::cTFETC2_EAC_RG11: + + if (max16 < min16) { -#if !BASISD_SUPPORT_ETC2_EAC_RG11 - return false; + std::swap(max16, min16); + mask ^= 0x55; + } + + pDst_block->set_low_color(static_cast<uint16_t>(max16)); + pDst_block->set_high_color(static_cast<uint16_t>(min16)); + pDst_block->m_selectors[0] = static_cast<uint8_t>(mask); + pDst_block->m_selectors[1] = static_cast<uint8_t>(mask); + pDst_block->m_selectors[2] = static_cast<uint8_t>(mask); + pDst_block->m_selectors[3] = static_cast<uint8_t>(mask); + } + + static inline uint8_t to_5(uint32_t v) { v = v * 31 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + static inline uint8_t to_6(uint32_t v) { v = v * 63 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + + // Good references: squish library, stb_dxt. + void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags) + { + const color32* pSrc_pixels = (const color32*)pPixels; + dxt1_block* pDst_block = static_cast<dxt1_block*>(pDst); + + int avg_r = -1, avg_g = 0, avg_b = 0; + int lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; + uint8_t sels[16]; + + const bool use_sels = (flags & cEncodeBC1UseSelectors) != 0; + if (use_sels) + { + // Caller is jamming in their own selectors for us to try. + const uint32_t s = pDst_block->m_selectors[0] | (pDst_block->m_selectors[1] << 8) | (pDst_block->m_selectors[2] << 16) | (pDst_block->m_selectors[3] << 24); + + static const uint8_t s_sel_tran[4] = { 0, 3, 1, 2 }; + + for (uint32_t i = 0; i < 16; i++) + sels[i] = s_sel_tran[(s >> (i * 2)) & 3]; + } + else + { + const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; + + uint32_t j; + for (j = 1; j < 16; j++) + if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) + break; + + if (j == 16) + { + encode_bc1_solid_block(pDst, fr, fg, fb); + return; + } + + // Select 2 colors along the principle axis. (There must be a faster/simpler way.) 
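// Aside (annotation, not part of the upstream patch): the #else branch further
// below estimates the principal axis by power iteration -- the per-channel extents
// (max - min) seed the iterate, which is multiplied by the 3x3 covariance matrix
// four times. If the resulting vector is nearly zero (k < 2) the code keeps the
// fixed fallback axis (306, 601, 117), i.e. Rec. 601 luma weights scaled by 1024.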
+ int total_r = fr, total_g = fg, total_b = fb; + int max_r = fr, max_g = fg, max_b = fb; + int min_r = fr, min_g = fg, min_b = fb; + for (uint32_t i = 1; i < 16; i++) + { + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); + min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); + total_r += r; total_g += g; total_b += b; + } + + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; + + int icov[6] = { 0, 0, 0, 0, 0, 0 }; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r - avg_r; + int g = (int)pSrc_pixels[i].g - avg_g; + int b = (int)pSrc_pixels[i].b - avg_b; + icov[0] += r * r; + icov[1] += r * g; + icov[2] += r * b; + icov[3] += g * g; + icov[4] += g * b; + icov[5] += b * b; + } + + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = static_cast<float>(icov[i])* (1.0f / 255.0f); + +#if 0 + // Seems silly to use full PCA to choose 2 colors. The diff in avg. PSNR between using PCA vs. not is small (~.025 difference). + // TODO: Try 2 or 3 different normalized diagonal vectors, choose the one that results in the largest dot delta + int saxis_r = max_r - min_r; + int saxis_g = max_g - min_g; + int saxis_b = max_b - min_b; +#else + float xr = (float)(max_r - min_r); + float xg = (float)(max_g - min_g); + float xb = (float)(max_b - min_b); + //float xr = (float)(max_r - avg_r); // max-avg is nearly the same, and doesn't require computing min's + //float xg = (float)(max_g - avg_g); + //float xb = (float)(max_b - avg_b); + for (uint32_t power_iter = 0; power_iter < 4; power_iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + xr = r; xg = g; xb = b; + } + + float k = basisu::maximum(fabsf(xr), fabsf(xg), fabsf(xb)); + int saxis_r = 306, saxis_g = 601, saxis_b = 117; + if (k >= 2) + { + float m = 1024.0f / k; + saxis_r = (int)(xr * m); + saxis_g = (int)(xg * m); + saxis_b = (int)(xb * m); + } #endif - assert(bytes_per_block == 16); + + int low_dot = INT_MAX, high_dot = INT_MIN, low_c = 0, high_c = 0; + for (uint32_t i = 0; i < 16; i++) + { + int dot = pSrc_pixels[i].r * saxis_r + pSrc_pixels[i].g * saxis_g + pSrc_pixels[i].b * saxis_b; + if (dot < low_dot) + { + low_dot = dot; + low_c = i; + } + if (dot > high_dot) + { + high_dot = dot; + high_c = i; + } + } - if (basis_file_has_alpha_slices) + lr = to_5(pSrc_pixels[low_c].r); + lg = to_6(pSrc_pixels[low_c].g); + lb = to_5(pSrc_pixels[low_c].b); + + hr = to_5(pSrc_pixels[high_c].r); + hg = to_6(pSrc_pixels[high_c].g); + hb = to_5(pSrc_pixels[high_c].b); + + bc1_find_sels(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + } // if (use_sels) + + const uint32_t total_ls_passes = (flags & cEncodeBC1HigherQuality) ? 3 : (flags & cEncodeBC1HighQuality ? 2 : 1); + for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) + { + // This is where the real magic happens. We have an array of candidate selectors, so let's use least squares to compute the optimal low/high endpoint colors. 
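// Aside (minimal sketch, not the upstream implementation): per color channel the
// least-squares step solves the 2x2 normal equations for the model
//     c[i] ~= (1 - w[i]) * L + w[i] * H,  with w[i] in {0, 1/3, 2/3, 1}
// derived from pixel i's selector. The hypothetical float helper below shows the
// system that compute_least_squares_endpoints_rgb() evaluates in fixed point via
// its packed weight table:
static bool ls_endpoints_one_channel(const float c[16], const float w[16], float& L, float& H)
{
    float a00 = 0.f, a01 = 0.f, a11 = 0.f, b0 = 0.f, b1 = 0.f;
    for (int i = 0; i < 16; i++)
    {
        const float u = 1.0f - w[i];
        a00 += u * u;     a01 += u * w[i];   a11 += w[i] * w[i];
        b0  += u * c[i];  b1  += w[i] * c[i];
    }
    const float det = a00 * a11 - a01 * a01;
    if ((det < 1e-8f) && (det > -1e-8f))
        return false; // all selectors equal -- caller falls back to the block average
    L = (a11 * b0 - a01 * b1) / det;
    H = (a00 * b1 - a01 * b0) / det;
    return true;
}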
+ vec3F xl, xh; + if (!compute_least_squares_endpoints_rgb(pSrc_pixels, sels, &xl, &xh)) { - // First decode the alpha data to G - status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t *)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); + if (avg_r < 0) + { + int total_r = 0, total_g = 0, total_b = 0; + for (uint32_t i = 0; i < 16; i++) + { + total_r += pSrc_pixels[i].r; + total_g += pSrc_pixels[i].g; + total_b += pSrc_pixels[i].b; + } + + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; + } + + // All selectors equal - treat it as a solid block which should always be equal or better. + lr = g_bc1_match5_equals_1[avg_r].m_hi; + lg = g_bc1_match6_equals_1[avg_g].m_hi; + lb = g_bc1_match5_equals_1[avg_b].m_hi; + + hr = g_bc1_match5_equals_1[avg_r].m_lo; + hg = g_bc1_match6_equals_1[avg_g].m_lo; + hb = g_bc1_match5_equals_1[avg_b].m_lo; + + // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. } else { - write_opaque_alpha_blocks(pSlice_descs[slice_index].m_num_blocks_x, pSlice_descs[slice_index].m_num_blocks_y, (uint8_t *)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, output_row_pitch_in_blocks_or_pixels); - status = true; + lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); + + hr = basisu::clamp((int)((xh.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + hg = basisu::clamp((int)((xh.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + hb = basisu::clamp((int)((xh.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); } + + bc1_find_sels(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + } - if (status) + uint32_t lc16 = dxt1_block::pack_unscaled_color(lr, lg, lb); + uint32_t hc16 = dxt1_block::pack_unscaled_color(hr, hg, hb); + + // Always forbid 3 color blocks + if (lc16 == hc16) + { + uint8_t mask = 0; + + // Make l > h + if (hc16 > 0) + hc16--; + else { - // Now decode the color data to R - status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - if (!status) + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); + + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } + + assert(lc16 > hc16); + pDst_block->set_low_color(static_cast<uint16_t>(lc16)); + pDst_block->set_high_color(static_cast<uint16_t>(hc16)); + + pDst_block->m_selectors[0] = mask; + pDst_block->m_selectors[1] = mask; + pDst_block->m_selectors[2] = mask; + pDst_block->m_selectors[3] = mask; + } + else + { + uint8_t invert_mask = 0; + if (lc16 < hc16) + { + std::swap(lc16, hc16); + invert_mask = 0x55; + } + + assert(lc16 > hc16); + pDst_block->set_low_color((uint16_t)lc16); + pDst_block->set_high_color((uint16_t)hc16); + + uint32_t packed_sels = 0; + static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); + + pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; + pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; + pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; + pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ 
invert_mask; + } + } + + void encode_bc1_alt(void* pDst, const uint8_t* pPixels, uint32_t flags) + { + const color32* pSrc_pixels = (const color32*)pPixels; + dxt1_block* pDst_block = static_cast<dxt1_block*>(pDst); + + int avg_r = -1, avg_g = 0, avg_b = 0; + int lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; + uint8_t sels[16]; + + const bool use_sels = (flags & cEncodeBC1UseSelectors) != 0; + if (use_sels) + { + // Caller is jamming in their own selectors for us to try. + const uint32_t s = pDst_block->m_selectors[0] | (pDst_block->m_selectors[1] << 8) | (pDst_block->m_selectors[2] << 16) | (pDst_block->m_selectors[3] << 24); + + static const uint8_t s_sel_tran[4] = { 0, 3, 1, 2 }; + + for (uint32_t i = 0; i < 16; i++) + sels[i] = s_sel_tran[(s >> (i * 2)) & 3]; + } + else + { + const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; + + uint32_t j; + for (j = 1; j < 16; j++) + if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) + break; + + if (j == 16) + { + encode_bc1_solid_block(pDst, fr, fg, fb); + return; + } + + // Select 2 colors along the principle axis. (There must be a faster/simpler way.) + int total_r = fr, total_g = fg, total_b = fb; + int max_r = fr, max_g = fg, max_b = fb; + int min_r = fr, min_g = fg, min_b = fb; + uint32_t grayscale_flag = (fr == fg) && (fr == fb); + for (uint32_t i = 1; i < 16; i++) + { + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + grayscale_flag &= ((r == g) && (r == b)); + max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); + min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); + total_r += r; total_g += g; total_b += b; + } + + if (grayscale_flag) + { + // Grayscale blocks are a common enough case to specialize. + if ((max_r - min_r) < 2) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC2_EAC_R11 R failed\n"); + lr = lb = hr = hb = to_5(fr); + lg = hg = to_6(fr); + } + else + { + lr = lb = to_5(min_r); + lg = to_6(min_r); + + hr = hb = to_5(max_r); + hg = to_6(max_r); } } else { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC2_EAC_R11 G failed\n"); + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; + + // Find the shortest vector from a AABB corner to the block's average color. + // This is to help avoid outliers. 
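// Aside (annotation, not part of the upstream patch): in the corner search below,
// each squared per-channel distance is pre-shifted left by 3 so that OR-ing a
// 3-bit corner tag (bit 0 = use max_r, bit 1 = use max_g, bit 2 = use max_b) into
// the summed distances never perturbs the comparison between distinct sums.
// The minimum therefore yields both the AABB corner closest to the average color
// and, in (min_d & 7), its index, which later selects between (max_X - avg_X) and
// (avg_X - min_X) per channel.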
+ + uint32_t dist[3][2]; + dist[0][0] = basisu::square(min_r - avg_r) << 3; dist[0][1] = basisu::square(max_r - avg_r) << 3; + dist[1][0] = basisu::square(min_g - avg_g) << 3; dist[1][1] = basisu::square(max_g - avg_g) << 3; + dist[2][0] = basisu::square(min_b - avg_b) << 3; dist[2][1] = basisu::square(max_b - avg_b) << 3; + + uint32_t min_d0 = (dist[0][0] + dist[1][0] + dist[2][0]); + uint32_t d4 = (dist[0][0] + dist[1][0] + dist[2][1]) | 4; + min_d0 = basisu::minimum(min_d0, d4); + + uint32_t min_d1 = (dist[0][1] + dist[1][0] + dist[2][0]) | 1; + uint32_t d5 = (dist[0][1] + dist[1][0] + dist[2][1]) | 5; + min_d1 = basisu::minimum(min_d1, d5); + + uint32_t d2 = (dist[0][0] + dist[1][1] + dist[2][0]) | 2; + min_d0 = basisu::minimum(min_d0, d2); + + uint32_t d3 = (dist[0][1] + dist[1][1] + dist[2][0]) | 3; + min_d1 = basisu::minimum(min_d1, d3); + + uint32_t d6 = (dist[0][0] + dist[1][1] + dist[2][1]) | 6; + min_d0 = basisu::minimum(min_d0, d6); + + uint32_t d7 = (dist[0][1] + dist[1][1] + dist[2][1]) | 7; + min_d1 = basisu::minimum(min_d1, d7); + + uint32_t min_d = basisu::minimum(min_d0, min_d1); + uint32_t best_i = min_d & 7; + + int delta_r = (best_i & 1) ? (max_r - avg_r) : (avg_r - min_r); + int delta_g = (best_i & 2) ? (max_g - avg_g) : (avg_g - min_g); + int delta_b = (best_i & 4) ? (max_b - avg_b) : (avg_b - min_b); + + // Note: if delta_r/g/b==0, we actually want to choose a single color, so the block average color optimization kicks in. + uint32_t low_c = 0, high_c = 0; + if ((delta_r | delta_g | delta_b) != 0) + { + // Now we have a smaller AABB going from the block's average color to a cornerpoint of the larger AABB. + // Project all pixels colors along the 4 vectors going from a smaller AABB cornerpoint to the opposite cornerpoint, find largest projection. + // One of these vectors will be a decent approximation of the block's PCA. 
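// Aside (annotation, not part of the upstream patch): the four packed dot products
// below project along the directions (+dr,+dg,+db), (-dr,-dg,+db), (-dr,+dg,+db)
// and (+dr,-dg,+db) built from the per-channel deltas chosen above; whichever
// direction gives the widest projection range stands in for the principal axis.
// Each projection is shifted left by 4 with the pixel index packed into the low
// nibble, so one min/max pass tracks both the extreme value and the pixel that
// produced it; on the max side the index is stored XOR'ed with 15, hence the
// "~high_dot & 15" when it is read back.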
+ const int saxis0_r = delta_r, saxis0_g = delta_g, saxis0_b = delta_b; + + int low_dot0 = INT_MAX, high_dot0 = INT_MIN; + int low_dot1 = INT_MAX, high_dot1 = INT_MIN; + int low_dot2 = INT_MAX, high_dot2 = INT_MIN; + int low_dot3 = INT_MAX, high_dot3 = INT_MIN; + + //int low_c0, low_c1, low_c2, low_c3; + //int high_c0, high_c1, high_c2, high_c3; + + for (uint32_t i = 0; i < 16; i++) + { + const int dotx = pSrc_pixels[i].r * saxis0_r; + const int doty = pSrc_pixels[i].g * saxis0_g; + const int dotz = pSrc_pixels[i].b * saxis0_b; + + const int dot0 = ((dotz + dotx + doty) << 4) + i; + const int dot1 = ((dotz - dotx - doty) << 4) + i; + const int dot2 = ((dotz - dotx + doty) << 4) + i; + const int dot3 = ((dotz + dotx - doty) << 4) + i; + + if (dot0 < low_dot0) + { + low_dot0 = dot0; + //low_c0 = i; + } + if ((dot0 ^ 15) > high_dot0) + { + high_dot0 = dot0 ^ 15; + //high_c0 = i; + } + + if (dot1 < low_dot1) + { + low_dot1 = dot1; + //low_c1 = i; + } + if ((dot1 ^ 15) > high_dot1) + { + high_dot1 = dot1 ^ 15; + //high_c1 = i; + } + + if (dot2 < low_dot2) + { + low_dot2 = dot2; + //low_c2 = i; + } + if ((dot2 ^ 15) > high_dot2) + { + high_dot2 = dot2 ^ 15; + //high_c2 = i; + } + + if (dot3 < low_dot3) + { + low_dot3 = dot3; + //low_c3 = i; + } + if ((dot3 ^ 15) > high_dot3) + { + high_dot3 = dot3 ^ 15; + //high_c3 = i; + } + } + + low_c = low_dot0 & 15; + high_c = ~high_dot0 & 15; + uint32_t r = (high_dot0 & ~15) - (low_dot0 & ~15); + + uint32_t tr = (high_dot1 & ~15) - (low_dot1 & ~15); + if (tr > r) { + low_c = low_dot1 & 15; + high_c = ~high_dot1 & 15; + r = tr; + } + + tr = (high_dot2 & ~15) - (low_dot2 & ~15); + if (tr > r) { + low_c = low_dot2 & 15; + high_c = ~high_dot2 & 15; + r = tr; + } + + tr = (high_dot3 & ~15) - (low_dot3 & ~15); + if (tr > r) { + low_c = low_dot3 & 15; + high_c = ~high_dot3 & 15; + } + } + + lr = to_5(pSrc_pixels[low_c].r); + lg = to_6(pSrc_pixels[low_c].g); + lb = to_5(pSrc_pixels[low_c].b); + + hr = to_5(pSrc_pixels[high_c].r); + hg = to_6(pSrc_pixels[high_c].g); + hb = to_5(pSrc_pixels[high_c].b); } - break; + bc1_find_sels_2(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + } // if (use_sels) + + const uint32_t total_ls_passes = (flags & cEncodeBC1HigherQuality) ? 3 : (flags & cEncodeBC1HighQuality ? 2 : 1); + for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) + { + int prev_lr = lr, prev_lg = lg, prev_lb = lb, prev_hr = hr, prev_hg = hg, prev_hb = hb; + + // This is where the real magic happens. We have an array of candidate selectors, so let's use least squares to compute the optimal low/high endpoint colors. + vec3F xl, xh; + if (!compute_least_squares_endpoints_rgb(pSrc_pixels, sels, &xl, &xh)) + { + if (avg_r < 0) + { + int total_r = 0, total_g = 0, total_b = 0; + for (uint32_t i = 0; i < 16; i++) + { + total_r += pSrc_pixels[i].r; + total_g += pSrc_pixels[i].g; + total_b += pSrc_pixels[i].b; + } + + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; + } + + // All selectors equal - treat it as a solid block which should always be equal or better. + lr = g_bc1_match5_equals_1[avg_r].m_hi; + lg = g_bc1_match6_equals_1[avg_g].m_hi; + lb = g_bc1_match5_equals_1[avg_b].m_hi; + + hr = g_bc1_match5_equals_1[avg_r].m_lo; + hg = g_bc1_match6_equals_1[avg_g].m_lo; + hb = g_bc1_match5_equals_1[avg_b].m_lo; + + // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. 
+ } + else + { + lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); + + hr = basisu::clamp((int)((xh.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + hg = basisu::clamp((int)((xh.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + hb = basisu::clamp((int)((xh.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); + } + + if ((prev_lr == lr) && (prev_lg == lg) && (prev_lb == lb) && (prev_hr == hr) && (prev_hg == hg) && (prev_hb == hb)) + break; + + bc1_find_sels_2(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); } - default: + + uint32_t lc16 = dxt1_block::pack_unscaled_color(lr, lg, lb); + uint32_t hc16 = dxt1_block::pack_unscaled_color(hr, hg, hb); + + // Always forbid 3 color blocks + if (lc16 == hc16) { - assert(0); - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Invalid fmt\n"); - break; + uint8_t mask = 0; + + // Make l > h + if (hc16 > 0) + hc16--; + else + { + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); + + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } + + assert(lc16 > hc16); + pDst_block->set_low_color(static_cast<uint16_t>(lc16)); + pDst_block->set_high_color(static_cast<uint16_t>(hc16)); + + pDst_block->m_selectors[0] = mask; + pDst_block->m_selectors[1] = mask; + pDst_block->m_selectors[2] = mask; + pDst_block->m_selectors[3] = mask; } + else + { + uint8_t invert_mask = 0; + if (lc16 < hc16) + { + std::swap(lc16, hc16); + invert_mask = 0x55; + } + + assert(lc16 > hc16); + pDst_block->set_low_color((uint16_t)lc16); + pDst_block->set_high_color((uint16_t)hc16); + + uint32_t packed_sels = 0; + static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); + + pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; + pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; + pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; + pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask; } + } - return status; + // Scale the UASTC first subset endpoints and first plane's weight indices directly to BC1's - fastest. 
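// Aside (annotation, not part of the upstream patch): the "hint0" path never looks
// at decoded pixels. It unquantizes the first ASTC subset's endpoints via
// g_astc_unquant[] (replicated to grey for the 2-component modes), remaps the
// first plane's weight indices to 2-bit selectors through the
// s_uastc_to_bc1_weights[] tables above, and then applies the usual
// "color0 > color1" fix-up so the result can never land in BC1's 3-color mode.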
+ void transcode_uastc_to_bc1_hint0(const unpacked_uastc_block& unpacked_src_blk, void* pDst) + { + const uint32_t mode = unpacked_src_blk.m_mode; + const astc_block_desc& astc_blk = unpacked_src_blk.m_astc; + + dxt1_block& b = *static_cast<dxt1_block*>(pDst); + + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + + const uint32_t total_comps = g_uastc_mode_comps[mode]; + + if (total_comps == 2) + { + const uint32_t l = g_astc_unquant[endpoint_range][astc_blk.m_endpoints[0]].m_unquant; + const uint32_t h = g_astc_unquant[endpoint_range][astc_blk.m_endpoints[1]].m_unquant; + + b.set_low_color(dxt1_block::pack_color(color32(l, l, l, 255), true, 127)); + b.set_high_color(dxt1_block::pack_color(color32(h, h, h, 255), true, 127)); + } + else + { + b.set_low_color(dxt1_block::pack_color( + color32(g_astc_unquant[endpoint_range][astc_blk.m_endpoints[0]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[2]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[4]].m_unquant, + 255), true, 127) + ); + + b.set_high_color(dxt1_block::pack_color( + color32(g_astc_unquant[endpoint_range][astc_blk.m_endpoints[1]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[3]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[5]].m_unquant, + 255), true, 127) + ); + } + + if (b.get_low_color() == b.get_high_color()) + { + // Always forbid 3 color blocks + uint16_t lc16 = (uint16_t)b.get_low_color(); + uint16_t hc16 = (uint16_t)b.get_high_color(); + + uint8_t mask = 0; + + // Make l > h + if (hc16 > 0) + hc16--; + else + { + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); + + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } + + assert(lc16 > hc16); + b.set_low_color(static_cast<uint16_t>(lc16)); + b.set_high_color(static_cast<uint16_t>(hc16)); + + b.m_selectors[0] = mask; + b.m_selectors[1] = mask; + b.m_selectors[2] = mask; + b.m_selectors[3] = mask; + } + else + { + bool invert = false; + if (b.get_low_color() < b.get_high_color()) + { + std::swap(b.m_low_color[0], b.m_high_color[0]); + std::swap(b.m_low_color[1], b.m_high_color[1]); + invert = true; + } + + const uint8_t* pTran = s_uastc_to_bc1_weights[g_uastc_mode_weight_bits[mode]]; + + const uint32_t plane_shift = g_uastc_mode_planes[mode] - 1; + + uint32_t sels = 0; + for (int i = 15; i >= 0; --i) + { + uint32_t s = pTran[astc_blk.m_weights[i << plane_shift]]; + + if (invert) + s ^= 1; + + sels = (sels << 2) | s; + } + b.m_selectors[0] = sels & 0xFF; + b.m_selectors[1] = (sels >> 8) & 0xFF; + b.m_selectors[2] = (sels >> 16) & 0xFF; + b.m_selectors[3] = (sels >> 24) & 0xFF; + } } - uint32_t basis_get_bytes_per_block(transcoder_texture_format fmt) + // Scale the UASTC first plane's weight indices to BC1, use 1 or 2 least squares passes to compute endpoints - no PCA needed. 
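// Aside (annotation, not part of the upstream patch): the "hint1" path seeds the
// destination block's selectors from the UASTC weight indices (again via
// s_uastc_to_bc1_weights[]) and then calls encode_bc1() with
// cEncodeBC1UseSelectors set, so the encoder skips its axis/PCA search entirely
// and goes straight to the least-squares endpoint passes described above.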
+ void transcode_uastc_to_bc1_hint1(const unpacked_uastc_block& unpacked_src_blk, const color32 block_pixels[4][4], void* pDst, bool high_quality) { - switch (fmt) + const uint32_t mode = unpacked_src_blk.m_mode; + + const astc_block_desc& astc_blk = unpacked_src_blk.m_astc; + + dxt1_block& b = *static_cast<dxt1_block*>(pDst); + + b.set_low_color(1); + b.set_high_color(0); + + const uint8_t* pTran = s_uastc_to_bc1_weights[g_uastc_mode_weight_bits[mode]]; + + const uint32_t plane_shift = g_uastc_mode_planes[mode] - 1; + + uint32_t sels = 0; + for (int i = 15; i >= 0; --i) { - case transcoder_texture_format::cTFETC1_RGB: - case transcoder_texture_format::cTFBC1_RGB: - case transcoder_texture_format::cTFBC4_R: - case transcoder_texture_format::cTFPVRTC1_4_RGB: - case transcoder_texture_format::cTFPVRTC1_4_RGBA: - case transcoder_texture_format::cTFATC_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - case transcoder_texture_format::cTFETC2_EAC_R11: - return 8; - case transcoder_texture_format::cTFBC7_M6_RGB: - case transcoder_texture_format::cTFBC7_M5_RGBA: - case transcoder_texture_format::cTFETC2_RGBA: - case transcoder_texture_format::cTFBC3_RGBA: - case transcoder_texture_format::cTFBC5_RG: - case transcoder_texture_format::cTFASTC_4x4_RGBA: - case transcoder_texture_format::cTFATC_RGBA: - case transcoder_texture_format::cTFFXT1_RGB: - case transcoder_texture_format::cTFETC2_EAC_RG11: - return 16; - case transcoder_texture_format::cTFRGBA32: - return sizeof(uint32_t) * 16; - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - return sizeof(uint16_t) * 16; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + sels <<= 2; + sels |= pTran[astc_blk.m_weights[i << plane_shift]]; } - return 0; + + b.m_selectors[0] = sels & 0xFF; + b.m_selectors[1] = (sels >> 8) & 0xFF; + b.m_selectors[2] = (sels >> 16) & 0xFF; + b.m_selectors[3] = (sels >> 24) & 0xFF; + + encode_bc1(&b, (const uint8_t*)&block_pixels[0][0].c[0], (high_quality ? 
cEncodeBC1HighQuality : 0) | cEncodeBC1UseSelectors); } - const char* basis_get_format_name(transcoder_texture_format fmt) + bool transcode_uastc_to_bc1(const uastc_block& src_blk, void* pDst, bool high_quality) { - switch (fmt) + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + const uint32_t mode = unpacked_src_blk.m_mode; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) { - case transcoder_texture_format::cTFETC1_RGB: return "ETC1_RGB"; - case transcoder_texture_format::cTFBC1_RGB: return "BC1_RGB"; - case transcoder_texture_format::cTFBC4_R: return "BC4_R"; - case transcoder_texture_format::cTFPVRTC1_4_RGB: return "PVRTC1_4_RGB"; - case transcoder_texture_format::cTFPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; - case transcoder_texture_format::cTFBC7_M6_RGB: return "BC7_M6_RGB"; - case transcoder_texture_format::cTFBC7_M5_RGBA: return "BC7_M5_RGBA"; - case transcoder_texture_format::cTFETC2_RGBA: return "ETC2_RGBA"; - case transcoder_texture_format::cTFBC3_RGBA: return "BC3_RGBA"; - case transcoder_texture_format::cTFBC5_RG: return "BC5_RG"; - case transcoder_texture_format::cTFASTC_4x4_RGBA: return "ASTC_RGBA"; - case transcoder_texture_format::cTFATC_RGB: return "ATC_RGB"; - case transcoder_texture_format::cTFATC_RGBA: return "ATC_RGBA"; - case transcoder_texture_format::cTFRGBA32: return "RGBA32"; - case transcoder_texture_format::cTFRGB565: return "RGB565"; - case transcoder_texture_format::cTFBGR565: return "BGR565"; - case transcoder_texture_format::cTFRGBA4444: return "RGBA4444"; - case transcoder_texture_format::cTFFXT1_RGB: return "FXT1_RGB"; - case transcoder_texture_format::cTFPVRTC2_4_RGB: return "PVRTC2_4_RGB"; - case transcoder_texture_format::cTFPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; - case transcoder_texture_format::cTFETC2_EAC_R11: return "ETC2_EAC_R11"; - case transcoder_texture_format::cTFETC2_EAC_RG11: return "ETC2_EAC_RG11"; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + encode_bc1_solid_block(pDst, unpacked_src_blk.m_solid_color.r, unpacked_src_blk.m_solid_color.g, unpacked_src_blk.m_solid_color.b); + return true; } - return ""; + + if ((!high_quality) && (unpacked_src_blk.m_bc1_hint0)) + transcode_uastc_to_bc1_hint0(unpacked_src_blk, pDst); + else + { + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + if (unpacked_src_blk.m_bc1_hint1) + transcode_uastc_to_bc1_hint1(unpacked_src_blk, block_pixels, pDst, high_quality); + else + encode_bc1(pDst, &block_pixels[0][0].r, high_quality ? 
cEncodeBC1HighQuality : 0); + } + + return true; } - const char* basis_get_texture_type_name(basis_texture_type tex_type) + static void write_bc4_solid_block(uint8_t* pDst, uint32_t a) { - switch (tex_type) + pDst[0] = (uint8_t)a; + pDst[1] = (uint8_t)a; + memset(pDst + 2, 0, 6); + } + + bool transcode_uastc_to_bc3(const uastc_block& src_blk, void* pDst, bool high_quality) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + const uint32_t mode = unpacked_src_blk.m_mode; + + void* pBC4_block = pDst; + dxt1_block* pBC1_block = &static_cast<dxt1_block*>(pDst)[1]; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) { - case cBASISTexType2D: return "2D"; - case cBASISTexType2DArray: return "2D array"; - case cBASISTexTypeCubemapArray: return "cubemap array"; - case cBASISTexTypeVideoFrames: return "video"; - case cBASISTexTypeVolume: return "3D"; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_texture_type_name: Invalid tex_type\n"); - break; + write_bc4_solid_block(static_cast<uint8_t*>(pBC4_block), unpacked_src_blk.m_solid_color.a); + encode_bc1_solid_block(pBC1_block, unpacked_src_blk.m_solid_color.r, unpacked_src_blk.m_solid_color.g, unpacked_src_blk.m_solid_color.b); + return true; } - return ""; + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + basist::encode_bc4(pBC4_block, &block_pixels[0][0].a, sizeof(color32)); + + if ((!high_quality) && (unpacked_src_blk.m_bc1_hint0)) + transcode_uastc_to_bc1_hint0(unpacked_src_blk, pBC1_block); + else + { + if (unpacked_src_blk.m_bc1_hint1) + transcode_uastc_to_bc1_hint1(unpacked_src_blk, block_pixels, pBC1_block, high_quality); + else + encode_bc1(pBC1_block, &block_pixels[0][0].r, high_quality ? 
cEncodeBC1HighQuality : 0); + } + + return true; } - bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt) + bool transcode_uastc_to_bc4(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0) { - switch (fmt) + BASISU_NOTE_UNUSED(high_quality); + + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + const uint32_t mode = unpacked_src_blk.m_mode; + + void* pBC4_block = pDst; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) { - case transcoder_texture_format::cTFETC2_RGBA: - case transcoder_texture_format::cTFBC3_RGBA: - case transcoder_texture_format::cTFASTC_4x4_RGBA: - case transcoder_texture_format::cTFBC7_M5_RGBA: - case transcoder_texture_format::cTFPVRTC1_4_RGBA: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - case transcoder_texture_format::cTFATC_RGBA: - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGBA4444: + write_bc4_solid_block(static_cast<uint8_t*>(pBC4_block), unpacked_src_blk.m_solid_color.c[chan0]); return true; - default: - break; } - return false; + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + basist::encode_bc4(pBC4_block, &block_pixels[0][0].c[chan0], sizeof(color32)); + + return true; } - basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt) + bool transcode_uastc_to_bc5(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1) { - switch (fmt) + BASISU_NOTE_UNUSED(high_quality); + + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + const uint32_t mode = unpacked_src_blk.m_mode; + + void* pBC4_block0 = pDst; + void* pBC4_block1 = (uint8_t*)pDst + 8; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) { - case transcoder_texture_format::cTFETC1_RGB: return basisu::texture_format::cETC1; - case transcoder_texture_format::cTFBC1_RGB: return basisu::texture_format::cBC1; - case transcoder_texture_format::cTFBC4_R: return basisu::texture_format::cBC4; - case transcoder_texture_format::cTFPVRTC1_4_RGB: return basisu::texture_format::cPVRTC1_4_RGB; - case transcoder_texture_format::cTFPVRTC1_4_RGBA: return basisu::texture_format::cPVRTC1_4_RGBA; - case transcoder_texture_format::cTFBC7_M6_RGB: return basisu::texture_format::cBC7; - case transcoder_texture_format::cTFBC7_M5_RGBA: return basisu::texture_format::cBC7; - case transcoder_texture_format::cTFETC2_RGBA: return basisu::texture_format::cETC2_RGBA; - case transcoder_texture_format::cTFBC3_RGBA: return basisu::texture_format::cBC3; - case transcoder_texture_format::cTFBC5_RG: return basisu::texture_format::cBC5; - case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC4x4; - case transcoder_texture_format::cTFATC_RGB: return basisu::texture_format::cATC_RGB; - case transcoder_texture_format::cTFATC_RGBA: return basisu::texture_format::cATC_RGBA_INTERPOLATED_ALPHA; - case transcoder_texture_format::cTFRGBA32: return basisu::texture_format::cRGBA32; - case transcoder_texture_format::cTFRGB565: return basisu::texture_format::cRGB565; - case transcoder_texture_format::cTFBGR565: return basisu::texture_format::cBGR565; - case transcoder_texture_format::cTFRGBA4444: return basisu::texture_format::cRGBA4444; - case transcoder_texture_format::cTFFXT1_RGB: return basisu::texture_format::cFXT1_RGB; - case 
transcoder_texture_format::cTFPVRTC2_4_RGB: return basisu::texture_format::cPVRTC2_4_RGBA; - case transcoder_texture_format::cTFPVRTC2_4_RGBA: return basisu::texture_format::cPVRTC2_4_RGBA; - case transcoder_texture_format::cTFETC2_EAC_R11: return basisu::texture_format::cETC2_R11_EAC; - case transcoder_texture_format::cTFETC2_EAC_RG11: return basisu::texture_format::cETC2_RG11_EAC; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + write_bc4_solid_block(static_cast<uint8_t*>(pBC4_block0), unpacked_src_blk.m_solid_color.c[chan0]); + write_bc4_solid_block(static_cast<uint8_t*>(pBC4_block1), unpacked_src_blk.m_solid_color.c[chan1]); + return true; } - return basisu::texture_format::cInvalidTextureFormat; + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + basist::encode_bc4(pBC4_block0, &block_pixels[0][0].c[chan0], sizeof(color32)); + basist::encode_bc4(pBC4_block1, &block_pixels[0][0].c[chan1], sizeof(color32)); + + return true; } - bool basis_transcoder_format_is_uncompressed(transcoder_texture_format tex_type) + static const uint8_t s_etc2_eac_bit_ofs[16] = { 45, 33, 21, 9, 42, 30, 18, 6, 39, 27, 15, 3, 36, 24, 12, 0 }; + + static void pack_eac_solid_block(eac_block& blk, uint32_t a) { - switch (tex_type) + blk.m_base = static_cast<uint8_t>(a); + blk.m_table = 13; + blk.m_multiplier = 0; + + memcpy(blk.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); + + return; + } + + // Only checks 4 tables. + static void pack_eac(eac_block& blk, const uint8_t* pPixels, uint32_t stride) + { + uint32_t min_alpha = 255, max_alpha = 0; + for (uint32_t i = 0; i < 16; i++) { - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: + const uint32_t a = pPixels[i * stride]; + if (a < min_alpha) min_alpha = a; + if (a > max_alpha) max_alpha = a; + } + + if (min_alpha == max_alpha) + { + pack_eac_solid_block(blk, min_alpha); + return; + } + + const uint32_t alpha_range = max_alpha - min_alpha; + + const uint32_t SINGLE_TABLE_THRESH = 5; + if (alpha_range <= SINGLE_TABLE_THRESH) + { + // If alpha_range <= 5 table 13 is lossless + int base = clamp255((int)max_alpha - 2); + + blk.m_base = base; + blk.m_multiplier = 1; + blk.m_table = 13; + + base -= 3; + + uint64_t packed_sels = 0; + for (uint32_t i = 0; i < 16; i++) + { + const int a = pPixels[i * stride]; + + static const uint8_t s_sels[6] = { 2, 1, 0, 4, 5, 6 }; + + int sel = a - base; + assert(sel >= 0 && sel <= 5); + + packed_sels |= (static_cast<uint64_t>(s_sels[sel]) << s_etc2_eac_bit_ofs[i]); + } + + blk.set_selector_bits(packed_sels); + + return; + } + + const uint32_t T0 = 2, T1 = 8, T2 = 11, T3 = 13; + static const uint8_t s_tables[4] = { T0, T1, T2, T3 }; + + int base[4], mul[4]; + uint32_t mul_or = 0; + for (uint32_t i = 0; i < 4; i++) + { + const uint32_t table = s_tables[i]; + + const float range = (float)(g_eac_modifier_table[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]); + + base[i] = clamp255((int)roundf(basisu::lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range))); + mul[i] = clampi((int)roundf(alpha_range / range), 1, 15); + mul_or |= mul[i]; + } + + uint32_t total_err[4] = { 0, 0, 0, 0 }; + uint8_t sels[4][16]; + + for (uint32_t i 
= 0; i < 16; i++) + { + const int a = pPixels[i * stride]; + + uint32_t l0 = UINT32_MAX, l1 = UINT32_MAX, l2 = UINT32_MAX, l3 = UINT32_MAX; + + if ((a < 7) || (a > (255 - 7))) + { + for (uint32_t s = 0; s < 8; s++) + { + const int v0 = clamp255(mul[0] * g_eac_modifier_table[T0][s] + base[0]); + const int v1 = clamp255(mul[1] * g_eac_modifier_table[T1][s] + base[1]); + const int v2 = clamp255(mul[2] * g_eac_modifier_table[T2][s] + base[2]); + const int v3 = clamp255(mul[3] * g_eac_modifier_table[T3][s] + base[3]); + + l0 = basisu::minimum(l0, (basisu::iabs(v0 - a) << 3) | s); + l1 = basisu::minimum(l1, (basisu::iabs(v1 - a) << 3) | s); + l2 = basisu::minimum(l2, (basisu::iabs(v2 - a) << 3) | s); + l3 = basisu::minimum(l3, (basisu::iabs(v3 - a) << 3) | s); + } + } + else if (mul_or == 1) + { + const int a0 = base[0] - a, a1 = base[1] - a, a2 = base[2] - a, a3 = base[3] - a; + + for (uint32_t s = 0; s < 8; s++) + { + const int v0 = g_eac_modifier_table[T0][s] + a0; + const int v1 = g_eac_modifier_table[T1][s] + a1; + const int v2 = g_eac_modifier_table[T2][s] + a2; + const int v3 = g_eac_modifier_table[T3][s] + a3; + + l0 = basisu::minimum(l0, (basisu::iabs(v0) << 3) | s); + l1 = basisu::minimum(l1, (basisu::iabs(v1) << 3) | s); + l2 = basisu::minimum(l2, (basisu::iabs(v2) << 3) | s); + l3 = basisu::minimum(l3, (basisu::iabs(v3) << 3) | s); + } + } + else + { + const int a0 = base[0] - a, a1 = base[1] - a, a2 = base[2] - a, a3 = base[3] - a; + + for (uint32_t s = 0; s < 8; s++) + { + const int v0 = mul[0] * g_eac_modifier_table[T0][s] + a0; + const int v1 = mul[1] * g_eac_modifier_table[T1][s] + a1; + const int v2 = mul[2] * g_eac_modifier_table[T2][s] + a2; + const int v3 = mul[3] * g_eac_modifier_table[T3][s] + a3; + + l0 = basisu::minimum(l0, (basisu::iabs(v0) << 3) | s); + l1 = basisu::minimum(l1, (basisu::iabs(v1) << 3) | s); + l2 = basisu::minimum(l2, (basisu::iabs(v2) << 3) | s); + l3 = basisu::minimum(l3, (basisu::iabs(v3) << 3) | s); + } + } + + sels[0][i] = l0 & 7; + sels[1][i] = l1 & 7; + sels[2][i] = l2 & 7; + sels[3][i] = l3 & 7; + + total_err[0] += basisu::square<uint32_t>(l0 >> 3); + total_err[1] += basisu::square<uint32_t>(l1 >> 3); + total_err[2] += basisu::square<uint32_t>(l2 >> 3); + total_err[3] += basisu::square<uint32_t>(l3 >> 3); + } + + uint32_t min_err = total_err[0], min_index = 0; + for (uint32_t i = 1; i < 4; i++) + { + if (total_err[i] < min_err) + { + min_err = total_err[i]; + min_index = i; + } + } + + blk.m_base = base[min_index]; + blk.m_multiplier = mul[min_index]; + blk.m_table = s_tables[min_index]; + + uint64_t packed_sels = 0; + const uint8_t* pSels = &sels[min_index][0]; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= (static_cast<uint64_t>(pSels[i]) << s_etc2_eac_bit_ofs[i]); + + blk.set_selector_bits(packed_sels); + } + + // Checks all 16 tables. Around ~2 dB better vs. pack_eac(), ~1.2 dB less than near-optimal. 
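// Aside (annotation, not part of the upstream patch): an EAC decoder reconstructs
//     v[s] = clamp(base + multiplier * g_eac_modifier_table[table][s], 0, 255)
// for the eight selectors s. Both pack_eac() above and the exhaustive variant
// below seed (base, multiplier) per candidate table so that the table's most
// negative and most positive entries land near the block's min/max alpha (that is
// what the lerp over ETC2_EAC_MIN/MAX_VALUE_SELECTOR computes), then score each
// table by summed squared selector error and keep the best one.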
+ static void pack_eac_high_quality(eac_block& blk, const uint8_t* pPixels, uint32_t stride) + { + uint32_t min_alpha = 255, max_alpha = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t a = pPixels[i * stride]; + if (a < min_alpha) min_alpha = a; + if (a > max_alpha) max_alpha = a; + } + + if (min_alpha == max_alpha) + { + pack_eac_solid_block(blk, min_alpha); + return; + } + + const uint32_t alpha_range = max_alpha - min_alpha; + + const uint32_t SINGLE_TABLE_THRESH = 5; + if (alpha_range <= SINGLE_TABLE_THRESH) + { + // If alpha_range <= 5 table 13 is lossless + int base = clamp255((int)max_alpha - 2); + + blk.m_base = base; + blk.m_multiplier = 1; + blk.m_table = 13; + + base -= 3; + + uint64_t packed_sels = 0; + for (uint32_t i = 0; i < 16; i++) + { + const int a = pPixels[i * stride]; + + static const uint8_t s_sels[6] = { 2, 1, 0, 4, 5, 6 }; + + int sel = a - base; + assert(sel >= 0 && sel <= 5); + + packed_sels |= (static_cast<uint64_t>(s_sels[sel]) << s_etc2_eac_bit_ofs[i]); + } + + blk.set_selector_bits(packed_sels); + + return; + } + + int base[16], mul[16]; + for (uint32_t table = 0; table < 16; table++) + { + const float range = (float)(g_eac_modifier_table[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]); + + base[table] = clamp255((int)roundf(basisu::lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range))); + mul[table] = clampi((int)roundf(alpha_range / range), 1, 15); + } + + uint32_t total_err[16]; + memset(total_err, 0, sizeof(total_err)); + + uint8_t sels[16][16]; + + for (uint32_t table = 0; table < 16; table++) + { + const int8_t* pTable = &g_eac_modifier_table[table][0]; + const int m = mul[table], b = base[table]; + + uint32_t prev_l = 0, prev_a = UINT32_MAX; + + for (uint32_t i = 0; i < 16; i++) + { + const int a = pPixels[i * stride]; + + if ((uint32_t)a == prev_a) + { + sels[table][i] = prev_l & 7; + total_err[table] += basisu::square<uint32_t>(prev_l >> 3); + } + else + { + uint32_t l = basisu::iabs(clamp255(m * pTable[0] + b) - a) << 3; + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[1] + b) - a) << 3) | 1); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[2] + b) - a) << 3) | 2); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[3] + b) - a) << 3) | 3); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[4] + b) - a) << 3) | 4); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[5] + b) - a) << 3) | 5); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[6] + b) - a) << 3) | 6); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[7] + b) - a) << 3) | 7); + + sels[table][i] = l & 7; + total_err[table] += basisu::square<uint32_t>(l >> 3); + + prev_l = l; + prev_a = a; + } + } + } + + uint32_t min_err = total_err[0], min_index = 0; + for (uint32_t i = 1; i < 16; i++) + { + if (total_err[i] < min_err) + { + min_err = total_err[i]; + min_index = i; + } + } + + blk.m_base = base[min_index]; + blk.m_multiplier = mul[min_index]; + blk.m_table = min_index; + + uint64_t packed_sels = 0; + const uint8_t* pSels = &sels[min_index][0]; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= (static_cast<uint64_t>(pSels[i]) << s_etc2_eac_bit_ofs[i]); + + blk.set_selector_bits(packed_sels); + } + + bool transcode_uastc_to_etc2_eac_r11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, 
false)) + return false; + + const uint32_t mode = unpacked_src_blk.m_mode; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + pack_eac_solid_block(*static_cast<eac_block*>(pDst), unpacked_src_blk.m_solid_color.c[chan0]); return true; - default: - break; } - return false; + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + if (chan0 == 3) + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, pDst); + else + (high_quality ? pack_eac_high_quality : pack_eac)(*static_cast<eac_block*>(pDst), &block_pixels[0][0].c[chan0], sizeof(color32)); + + return true; } - bool basis_block_format_is_uncompressed(block_format tex_type) + bool transcode_uastc_to_etc2_eac_rg11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1) { - switch (tex_type) + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + const uint32_t mode = unpacked_src_blk.m_mode; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) { - case block_format::cRGB32: - case block_format::cRGBA32: - case block_format::cA32: - case block_format::cRGB565: - case block_format::cBGR565: - case block_format::cRGBA4444_COLOR: - case block_format::cRGBA4444_ALPHA: - case block_format::cRGBA4444_COLOR_OPAQUE: + pack_eac_solid_block(static_cast<eac_block*>(pDst)[0], unpacked_src_blk.m_solid_color.c[chan0]); + pack_eac_solid_block(static_cast<eac_block*>(pDst)[1], unpacked_src_blk.m_solid_color.c[chan1]); return true; - default: - break; } - return false; + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + if (chan0 == 3) + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &static_cast<eac_block*>(pDst)[0]); + else + (high_quality ? pack_eac_high_quality : pack_eac)(static_cast<eac_block*>(pDst)[0], &block_pixels[0][0].c[chan0], sizeof(color32)); + + if (chan1 == 3) + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &static_cast<eac_block*>(pDst)[1]); + else + (high_quality ? 
pack_eac_high_quality : pack_eac)(static_cast<eac_block*>(pDst)[1], &block_pixels[0][0].c[chan1], sizeof(color32)); + return true; } - - uint32_t basis_get_uncompressed_bytes_per_pixel(transcoder_texture_format fmt) + + // PVRTC1 + static void fixup_pvrtc1_4_modulation_rgb( + const uastc_block* pSrc_blocks, + const uint32_t* pPVRTC_endpoints, + void* pDst_blocks, + uint32_t num_blocks_x, uint32_t num_blocks_y, bool from_alpha) { - switch (fmt) + const uint32_t x_mask = num_blocks_x - 1; + const uint32_t y_mask = num_blocks_y - 1; + const uint32_t x_bits = basisu::total_bits(x_mask); + const uint32_t y_bits = basisu::total_bits(y_mask); + const uint32_t min_bits = basisu::minimum(x_bits, y_bits); + //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); + const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; + + uint32_t block_index = 0; + + // really 3x3 + int e0[4][4], e1[4][4]; + + for (int y = 0; y < static_cast<int>(num_blocks_y); y++) { - case transcoder_texture_format::cTFRGBA32: - return sizeof(uint32_t); - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - return sizeof(uint16_t); - default: - break; + const uint32_t* pE_rows[3]; + + for (int ey = 0; ey < 3; ey++) + { + int by = y + ey - 1; + + const uint32_t* pE = &pPVRTC_endpoints[(by & y_mask) * num_blocks_x]; + + pE_rows[ey] = pE; + + for (int ex = 0; ex < 3; ex++) + { + int bx = 0 + ex - 1; + + const uint32_t e = pE[bx & x_mask]; + + e0[ex][ey] = (get_opaque_endpoint_l0(e) * 255) / 31; + e1[ex][ey] = (get_opaque_endpoint_l1(e) * 255) / 31; + } + } + + const uint32_t y_swizzle = (g_pvrtc_swizzle_table[y >> 8] << 16) | g_pvrtc_swizzle_table[y & 0xFF]; + + for (int x = 0; x < static_cast<int>(num_blocks_x); x++, block_index++) + { + const uastc_block& src_block = pSrc_blocks[block_index]; + + color32 block_pixels[4][4]; + unpack_uastc(src_block, &block_pixels[0][0], false); + if (from_alpha) + { + // Just set RGB to alpha to avoid adding complexity below. 
+ for (uint32_t i = 0; i < 16; i++) + { + const uint8_t a = ((color32*)block_pixels)[i].a; + ((color32*)block_pixels)[i].set(a, a, a, 255); + } + } + + const uint32_t x_swizzle = (g_pvrtc_swizzle_table[x >> 8] << 17) | (g_pvrtc_swizzle_table[x & 0xFF] << 1); + + uint32_t swizzled = x_swizzle | y_swizzle; + if (num_blocks_x != num_blocks_y) + { + swizzled &= swizzle_mask; + + if (num_blocks_x > num_blocks_y) + swizzled |= ((x >> min_bits) << (min_bits * 2)); + else + swizzled |= ((y >> min_bits) << (min_bits * 2)); + } + + pvrtc4_block* pDst_block = static_cast<pvrtc4_block*>(pDst_blocks) + swizzled; + pDst_block->m_endpoints = pPVRTC_endpoints[block_index]; + + { + const uint32_t ex = 2; + int bx = x + ex - 1; + bx &= x_mask; + +#define DO_ROW(ey) \ + { \ + const uint32_t e = pE_rows[ey][bx]; \ + e0[ex][ey] = (get_opaque_endpoint_l0(e) * 255) / 31; \ + e1[ex][ey] = (get_opaque_endpoint_l1(e) * 255) / 31; \ + } + + DO_ROW(0); + DO_ROW(1); + DO_ROW(2); +#undef DO_ROW + } + + uint32_t mod = 0; + +#define DO_PIX(lx, ly, w0, w1, w2, w3) \ + { \ + int ca_l = a0 * w0 + a1 * w1 + a2 * w2 + a3 * w3; \ + int cb_l = b0 * w0 + b1 * w1 + b2 * w2 + b3 * w3; \ + int cl = (block_pixels[ly][lx].r + block_pixels[ly][lx].g + block_pixels[ly][lx].b) * 16; \ + int dl = cb_l - ca_l; \ + int vl = cl - ca_l; \ + int p = vl * 16; \ + if (ca_l > cb_l) { p = -p; dl = -dl; } \ + uint32_t m = 0; \ + if (p > 3 * dl) m = (uint32_t)(1 << ((ly) * 8 + (lx) * 2)); \ + if (p > 8 * dl) m = (uint32_t)(2 << ((ly) * 8 + (lx) * 2)); \ + if (p > 13 * dl) m = (uint32_t)(3 << ((ly) * 8 + (lx) * 2)); \ + mod |= m; \ + } + + { + const uint32_t ex = 0, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 0, 4, 4, 4, 4); + DO_PIX(1, 0, 2, 6, 2, 6); + DO_PIX(0, 1, 2, 2, 6, 6); + DO_PIX(1, 1, 1, 3, 3, 9); + } + + { + const uint32_t ex = 1, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 0, 8, 0, 8, 0); + DO_PIX(3, 0, 6, 2, 6, 2); + DO_PIX(2, 1, 4, 0, 12, 0); + DO_PIX(3, 1, 3, 1, 9, 3); + } + + { + const uint32_t ex = 0, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 2, 8, 8, 0, 0); + DO_PIX(1, 2, 4, 12, 0, 0); + DO_PIX(0, 3, 6, 6, 2, 2); + DO_PIX(1, 3, 3, 9, 1, 3); + } + + { + const uint32_t ex = 1, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 2, 16, 0, 0, 0); + DO_PIX(3, 2, 12, 4, 0, 0); + DO_PIX(2, 3, 12, 0, 4, 0); + DO_PIX(3, 3, 9, 3, 3, 1); + } +#undef DO_PIX + + pDst_block->m_modulation = mod; + + e0[0][0] = e0[1][0]; e0[1][0] = e0[2][0]; + e0[0][1] = e0[1][1]; e0[1][1] = e0[2][1]; + e0[0][2] = e0[1][2]; e0[1][2] = e0[2][2]; + + e1[0][0] = e1[1][0]; e1[1][0] = e1[2][0]; + e1[0][1] = e1[1][1]; e1[1][1] = e1[2][1]; + e1[0][2] = e1[1][2]; e1[1][2] = e1[2][2]; + + } // x + } // y + } + + static void fixup_pvrtc1_4_modulation_rgba( + const uastc_block* pSrc_blocks, + const uint32_t* pPVRTC_endpoints, + void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y) + { + const uint32_t x_mask = 
num_blocks_x - 1; + const uint32_t y_mask = num_blocks_y - 1; + const uint32_t x_bits = basisu::total_bits(x_mask); + const uint32_t y_bits = basisu::total_bits(y_mask); + const uint32_t min_bits = basisu::minimum(x_bits, y_bits); + //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); + const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; + + uint32_t block_index = 0; + + // really 3x3 + int e0[4][4], e1[4][4]; + + for (int y = 0; y < static_cast<int>(num_blocks_y); y++) + { + const uint32_t* pE_rows[3]; + + for (int ey = 0; ey < 3; ey++) + { + int by = y + ey - 1; + + const uint32_t* pE = &pPVRTC_endpoints[(by & y_mask) * num_blocks_x]; + + pE_rows[ey] = pE; + + for (int ex = 0; ex < 3; ex++) + { + int bx = 0 + ex - 1; + + const uint32_t e = pE[bx & x_mask]; + + e0[ex][ey] = get_endpoint_l8(e, 0); + e1[ex][ey] = get_endpoint_l8(e, 1); + } + } + + const uint32_t y_swizzle = (g_pvrtc_swizzle_table[y >> 8] << 16) | g_pvrtc_swizzle_table[y & 0xFF]; + + for (int x = 0; x < static_cast<int>(num_blocks_x); x++, block_index++) + { + const uastc_block& src_block = pSrc_blocks[block_index]; + + color32 block_pixels[4][4]; + unpack_uastc(src_block, &block_pixels[0][0], false); + + const uint32_t x_swizzle = (g_pvrtc_swizzle_table[x >> 8] << 17) | (g_pvrtc_swizzle_table[x & 0xFF] << 1); + + uint32_t swizzled = x_swizzle | y_swizzle; + if (num_blocks_x != num_blocks_y) + { + swizzled &= swizzle_mask; + + if (num_blocks_x > num_blocks_y) + swizzled |= ((x >> min_bits) << (min_bits * 2)); + else + swizzled |= ((y >> min_bits) << (min_bits * 2)); + } + + pvrtc4_block* pDst_block = static_cast<pvrtc4_block*>(pDst_blocks) + swizzled; + pDst_block->m_endpoints = pPVRTC_endpoints[block_index]; + + { + const uint32_t ex = 2; + int bx = x + ex - 1; + bx &= x_mask; + +#define DO_ROW(ey) \ + { \ + const uint32_t e = pE_rows[ey][bx]; \ + e0[ex][ey] = get_endpoint_l8(e, 0); \ + e1[ex][ey] = get_endpoint_l8(e, 1); \ + } + + DO_ROW(0); + DO_ROW(1); + DO_ROW(2); +#undef DO_ROW + } + + uint32_t mod = 0; + +#define DO_PIX(lx, ly, w0, w1, w2, w3) \ + { \ + int ca_l = a0 * w0 + a1 * w1 + a2 * w2 + a3 * w3; \ + int cb_l = b0 * w0 + b1 * w1 + b2 * w2 + b3 * w3; \ + int cl = 16 * (block_pixels[ly][lx].r + block_pixels[ly][lx].g + block_pixels[ly][lx].b + block_pixels[ly][lx].a); \ + int dl = cb_l - ca_l; \ + int vl = cl - ca_l; \ + int p = vl * 16; \ + if (ca_l > cb_l) { p = -p; dl = -dl; } \ + uint32_t m = 0; \ + if (p > 3 * dl) m = (uint32_t)(1 << ((ly) * 8 + (lx) * 2)); \ + if (p > 8 * dl) m = (uint32_t)(2 << ((ly) * 8 + (lx) * 2)); \ + if (p > 13 * dl) m = (uint32_t)(3 << ((ly) * 8 + (lx) * 2)); \ + mod |= m; \ + } + + { + const uint32_t ex = 0, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 0, 4, 4, 4, 4); + DO_PIX(1, 0, 2, 6, 2, 6); + DO_PIX(0, 1, 2, 2, 6, 6); + DO_PIX(1, 1, 1, 3, 3, 9); + } + + { + const uint32_t ex = 1, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 0, 8, 0, 8, 0); + DO_PIX(3, 0, 6, 2, 6, 2); + DO_PIX(2, 1, 4, 0, 12, 0); + DO_PIX(3, 1, 3, 1, 9, 3); + } + + { + const uint32_t ex = 0, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = 
e1[ex + 1][ey + 1]; + DO_PIX(0, 2, 8, 8, 0, 0); + DO_PIX(1, 2, 4, 12, 0, 0); + DO_PIX(0, 3, 6, 6, 2, 2); + DO_PIX(1, 3, 3, 9, 1, 3); + } + + { + const uint32_t ex = 1, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 2, 16, 0, 0, 0); + DO_PIX(3, 2, 12, 4, 0, 0); + DO_PIX(2, 3, 12, 0, 4, 0); + DO_PIX(3, 3, 9, 3, 3, 1); + } +#undef DO_PIX + + pDst_block->m_modulation = mod; + + e0[0][0] = e0[1][0]; e0[1][0] = e0[2][0]; + e0[0][1] = e0[1][1]; e0[1][1] = e0[2][1]; + e0[0][2] = e0[1][2]; e0[1][2] = e0[2][2]; + + e1[0][0] = e1[1][0]; e1[1][0] = e1[2][0]; + e1[0][1] = e1[1][1]; e1[1][1] = e1[2][1]; + e1[0][2] = e1[1][2]; e1[1][2] = e1[2][2]; + + } // x + } // y + } + + bool transcode_uastc_to_pvrtc1_4_rgb(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality, bool from_alpha) + { + BASISU_NOTE_UNUSED(high_quality); + + if ((!num_blocks_x) || (!num_blocks_y)) + return false; + + const uint32_t width = num_blocks_x * 4; + const uint32_t height = num_blocks_y * 4; + if (!basisu::is_pow2(width) || !basisu::is_pow2(height)) + return false; + + basisu::vector<uint32_t> temp_endpoints(num_blocks_x * num_blocks_y); + + for (uint32_t y = 0; y < num_blocks_y; y++) + { + for (uint32_t x = 0; x < num_blocks_x; x++) + { + color32 block_pixels[16]; + if (!unpack_uastc(pSrc_blocks[x + y * num_blocks_x], block_pixels, false)) + return false; + + // Get block's RGB bounding box + color32 low_color(255, 255, 255, 255), high_color(0, 0, 0, 0); + + if (from_alpha) + { + uint32_t low_a = 255, high_a = 0; + for (uint32_t i = 0; i < 16; i++) + { + low_a = basisu::minimum<uint32_t>(low_a, block_pixels[i].a); + high_a = basisu::maximum<uint32_t>(high_a, block_pixels[i].a); + } + low_color.set(low_a, low_a, low_a, 255); + high_color.set(high_a, high_a, high_a, 255); + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + low_color = color32::comp_min(low_color, block_pixels[i]); + high_color = color32::comp_max(high_color, block_pixels[i]); + } + } + + // Set PVRTC1 endpoints to floor/ceil of bounding box's coordinates. 
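The two fixup routines above write each finished block to a swizzled address rather than in raster order: PVRTC1 stores its blocks in Morton (Z-order) layout, and the `g_pvrtc_swizzle_table` lookups are a table-driven bit interleave of the block coordinates. A plain, slower equivalent for illustration only (not part of the upstream sources):

```cpp
#include <cstdint>

// Illustrative only: interleave the block coordinates bit by bit, matching the
// layout produced by the swizzle code above (y bits land in the even positions,
// x bits in the odd ones). For non-square atlases the leftover high bits of the
// longer axis are appended linearly, which is what the swizzle_mask logic does.
static uint32_t pvrtc1_block_address(uint32_t x, uint32_t y)
{
    uint32_t addr = 0;
    for (uint32_t b = 0; b < 16; b++)
    {
        addr |= ((y >> b) & 1u) << (2u * b);      // even bits: y
        addr |= ((x >> b) & 1u) << (2u * b + 1u); // odd bits:  x
    }
    return addr;
}
```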
+ pvrtc4_block temp; + temp.set_opaque_endpoint_floor(0, low_color); + temp.set_opaque_endpoint_ceil(1, high_color); + + temp_endpoints[x + y * num_blocks_x] = temp.m_endpoints; + } } - return 0; + + fixup_pvrtc1_4_modulation_rgb(pSrc_blocks, &temp_endpoints[0], pDst_blocks, num_blocks_x, num_blocks_y, from_alpha); + + return true; } - - uint32_t basis_get_block_width(transcoder_texture_format tex_type) + + bool transcode_uastc_to_pvrtc1_4_rgba(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality) { - switch (tex_type) + BASISU_NOTE_UNUSED(high_quality); + + if ((!num_blocks_x) || (!num_blocks_y)) + return false; + + const uint32_t width = num_blocks_x * 4; + const uint32_t height = num_blocks_y * 4; + if (!basisu::is_pow2(width) || !basisu::is_pow2(height)) + return false; + + basisu::vector<uint32_t> temp_endpoints(num_blocks_x * num_blocks_y); + + for (uint32_t y = 0; y < num_blocks_y; y++) { - case transcoder_texture_format::cTFFXT1_RGB: - return 8; - default: + for (uint32_t x = 0; x < num_blocks_x; x++) + { + color32 block_pixels[16]; + if (!unpack_uastc(pSrc_blocks[x + y * num_blocks_x], block_pixels, false)) + return false; + + // Get block's RGBA bounding box + color32 low_color(255, 255, 255, 255), high_color(0, 0, 0, 0); + + for (uint32_t i = 0; i < 16; i++) + { + low_color = color32::comp_min(low_color, block_pixels[i]); + high_color = color32::comp_max(high_color, block_pixels[i]); + } + + // Set PVRTC1 endpoints to floor/ceil of bounding box's coordinates. + pvrtc4_block temp; + temp.set_endpoint_floor(0, low_color); + temp.set_endpoint_ceil(1, high_color); + + temp_endpoints[x + y * num_blocks_x] = temp.m_endpoints; + } + } + + fixup_pvrtc1_4_modulation_rgba(pSrc_blocks, &temp_endpoints[0], pDst_blocks, num_blocks_x, num_blocks_y); + + return true; + } + + void uastc_init() + { + for (uint32_t range = 0; range < BC7ENC_TOTAL_ASTC_RANGES; range++) + { + if (!astc_is_valid_endpoint_range(range)) + continue; + + const uint32_t levels = astc_get_levels(range); + + uint32_t vals[256]; + for (uint32_t i = 0; i < levels; i++) + vals[i] = (unquant_astc_endpoint_val(i, range) << 8) | i; + + std::sort(vals, vals + levels); + + for (uint32_t i = 0; i < levels; i++) + { + const uint32_t order = vals[i] & 0xFF; + const uint32_t unq = vals[i] >> 8; + + g_astc_unquant[range][order].m_unquant = (uint8_t)unq; + g_astc_unquant[range][order].m_index = (uint8_t)i; + + } // i + } + + // TODO: Precompute? 
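The tables initialized next ("BC7 777.1" and "BC7 777") brute-force, for every 8-bit component value, the 7-bit endpoint pair whose BC7 interpolation lands closest to it. The interpolation being searched is the standard BC7 weighted blend with 6-bit weights; a minimal helper spelling out the formula those loops evaluate (the name is illustrative, not upstream code):

```cpp
#include <cstdint>

// Illustrative only: BC7 blends two expanded endpoint values with a weight in
// [0, 64] and rounds. The searches below plug in the fixed "optimal index"
// weights from g_bc7_weights2 / g_bc7_weights4 and keep the (lo, hi) pair with
// the smallest squared distance to the target component value.
static inline int bc7_interpolate(int lo, int hi, int weight)
{
    return (lo * (64 - weight) + hi * weight + 32) >> 6;
}
```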
+ // BC7 777.1 + for (int c = 0; c < 256; c++) + { + for (uint32_t lp = 0; lp < 2; lp++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + + for (uint32_t l = 0; l < 128; l++) + { + const uint32_t low = (l << 1) | lp; + + for (uint32_t h = 0; h < 128; h++) + { + const uint32_t high = (h << 1) | lp; + + const int k = (low * (64 - g_bc7_weights4[BC7ENC_MODE_6_OPTIMAL_INDEX]) + high * g_bc7_weights4[BC7ENC_MODE_6_OPTIMAL_INDEX] + 32) >> 6; + + const int err = (k - c) * (k - c); + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_bc7_mode_6_optimal_endpoints[c][lp] = best; + } // lp + + } // c + + // BC7 777 + for (int c = 0; c < 256; c++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; + + for (uint32_t l = 0; l < 128; l++) + { + const uint32_t low = (l << 1) | (l >> 6); + + for (uint32_t h = 0; h < 128; h++) + { + const uint32_t high = (h << 1) | (h >> 6); + + const int k = (low * (64 - g_bc7_weights2[BC7ENC_MODE_5_OPTIMAL_INDEX]) + high * g_bc7_weights2[BC7ENC_MODE_5_OPTIMAL_INDEX] + 32) >> 6; + + const int err = (k - c) * (k - c); + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l + + g_bc7_mode_5_optimal_endpoints[c] = best; + + } // c + } + +#endif // #if BASISD_SUPPORT_UASTC + +// ------------------------------------------------------------------------------------------------------ +// KTX2 +// ------------------------------------------------------------------------------------------------------ + +#if BASISD_SUPPORT_KTX2 + const uint8_t g_ktx2_file_identifier[12] = { 0xAB, 0x4B, 0x54, 0x58, 0x20, 0x32, 0x30, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A }; + + ktx2_transcoder::ktx2_transcoder(basist::etc1_global_selector_codebook* pGlobal_sel_codebook) : + m_etc1s_transcoder(pGlobal_sel_codebook) + { + clear(); + } + + void ktx2_transcoder::clear() + { + m_pData = nullptr; + m_data_size = 0; + + memset(&m_header, 0, sizeof(m_header)); + m_levels.clear(); + m_dfd.clear(); + m_key_values.clear(); + memset(&m_etc1s_header, 0, sizeof(m_etc1s_header)); + m_etc1s_image_descs.clear(); + + m_format = basist::basis_tex_format::cETC1S; + + m_dfd_color_model = 0; + m_dfd_color_prims = KTX2_DF_PRIMARIES_UNSPECIFIED; + m_dfd_transfer_func = 0; + m_dfd_flags = 0; + m_dfd_samples = 0; + m_dfd_chan0 = KTX2_DF_CHANNEL_UASTC_RGB; + m_dfd_chan1 = KTX2_DF_CHANNEL_UASTC_RGB; + + m_etc1s_transcoder.clear(); + + m_def_transcoder_state.clear(); + + m_has_alpha = false; + m_is_video = false; + } + + bool ktx2_transcoder::init(const void* pData, uint32_t data_size) + { + clear(); + + if (!pData) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: pData is nullptr\n"); + assert(0); + return false; + } + + if (data_size <= sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: File is impossibly too small to be a valid KTX2 file\n"); + return false; + } + + if (memcmp(pData, g_ktx2_file_identifier, sizeof(g_ktx2_file_identifier)) != 0) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file identifier is not present\n"); + return false; + } + + m_pData = static_cast<const uint8_t *>(pData); + m_data_size = data_size; + + memcpy(&m_header, pData, sizeof(m_header)); + + // We only support UASTC and ETC1S + if (m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC format\n"); + return false; + } + + // 3.3: "When format is 
VK_FORMAT_UNDEFINED, typeSize must equal 1." + if (m_header.m_type_size != 1) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid type_size\n"); + return false; + } + + // We only currently support 2D textures (plain, cubemapped, or texture array), which is by far the most common use case. + // The BasisU library does not support 1D or 3D textures at all. + if ((m_header.m_pixel_width < 1) || (m_header.m_pixel_height < 1) || (m_header.m_pixel_depth > 0)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Only 2D or cubemap textures are supported\n"); + return false; + } + + // Face count must be 1 or 6 + if ((m_header.m_face_count != 1) && (m_header.m_face_count != 6)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid face count, file is corrupted or invalid\n"); + return false; + } + + if (m_header.m_face_count > 1) + { + // 3.4: Make sure cubemaps are square. + if (m_header.m_pixel_width != m_header.m_pixel_height) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Cubemap is not square\n"); + return false; + } + } + + // 3.7 levelCount: "levelCount=0 is allowed, except for block-compressed formats" + if (m_header.m_level_count < 1) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level count\n"); + return false; + } + + // Sanity check the level count. + if (m_header.m_level_count > KTX2_MAX_SUPPORTED_LEVEL_COUNT) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Too many levels or file is corrupted or invalid\n"); + return false; + } + + if (m_header.m_supercompression_scheme > KTX2_SS_ZSTANDARD) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid/unsupported supercompression or file is corrupted or invalid\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + { + if (m_header.m_sgd_byte_length <= sizeof(ktx2_etc1s_global_data_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data is too small\n"); + return false; + } + + if (m_header.m_sgd_byte_offset < sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data offset is too low\n"); + return false; + } + + if (m_header.m_sgd_byte_offset + m_header.m_sgd_byte_length > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data offset and/or length is too high\n"); + return false; + } + } + + if (!m_levels.try_resize(m_header.m_level_count)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Out of memory\n"); + return false; + } + + const uint32_t level_index_size_in_bytes = basisu::maximum(1U, (uint32_t)m_header.m_level_count) * sizeof(ktx2_level_index); + + if ((sizeof(ktx2_header) + level_index_size_in_bytes) > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: File is too small (can't read level index array)\n"); + return false; + } + + memcpy(&m_levels[0], m_pData + sizeof(ktx2_header), level_index_size_in_bytes); + + // Sanity check the level offsets and byte sizes + for (uint32_t i = 0; i < m_levels.size(); i++) + { + if (m_levels[i].m_byte_offset < sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset (too low)\n"); + return false; + } + + if (!m_levels[i].m_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level byte length\n"); + } + + if ((m_levels[i].m_byte_offset + m_levels[i].m_byte_length) > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset and/or length\n"); + return false; + } + + const uint64_t MAX_SANE_LEVEL_UNCOMP_SIZE = 2048ULL * 1024ULL * 1024ULL; + + if 
(m_levels[i].m_uncompressed_byte_length >= MAX_SANE_LEVEL_UNCOMP_SIZE) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset (too large)\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + { + if (m_levels[i].m_uncompressed_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid uncompressed length (0)\n"); + return false; + } + } + else if (m_header.m_supercompression_scheme >= KTX2_SS_ZSTANDARD) + { + if (!m_levels[i].m_uncompressed_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid uncompressed length (1)\n"); + return false; + } + } + } + + const uint32_t DFD_MINIMUM_SIZE = 44, DFD_MAXIMUM_SIZE = 60; + if ((m_header.m_dfd_byte_length != DFD_MINIMUM_SIZE) && (m_header.m_dfd_byte_length != DFD_MAXIMUM_SIZE)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Unsupported DFD size\n"); + return false; + } + + if (((m_header.m_dfd_byte_offset + m_header.m_dfd_byte_length) > m_data_size) || (m_header.m_dfd_byte_offset < sizeof(ktx2_header))) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid DFD offset and/or length\n"); + return false; + } + + const uint8_t* pDFD = m_pData + m_header.m_dfd_byte_offset; + + if (!m_dfd.try_resize(m_header.m_dfd_byte_length)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Out of memory\n"); + return false; + } + + memcpy(m_dfd.data(), pDFD, m_header.m_dfd_byte_length); + + // This is all hard coded for only ETC1S and UASTC. + uint32_t dfd_total_size = basisu::read_le_dword(pDFD); + + // 3.10.3: Sanity check + if (dfd_total_size != m_header.m_dfd_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: DFD size validation failed (1)\n"); + return false; + } + + // 3.10.3: More sanity checking + if (m_header.m_kvd_byte_length) + { + if (dfd_total_size != m_header.m_kvd_byte_offset - m_header.m_dfd_byte_offset) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: DFD size validation failed (2)\n"); + return false; + } + } + + const uint32_t dfd_bits = basisu::read_le_dword(pDFD + 3 * sizeof(uint32_t)); + const uint32_t sample_channel0 = basisu::read_le_dword(pDFD + 7 * sizeof(uint32_t)); + + m_dfd_color_model = dfd_bits & 255; + m_dfd_color_prims = (ktx2_df_color_primaries)((dfd_bits >> 8) & 255); + m_dfd_transfer_func = (dfd_bits >> 16) & 255; + m_dfd_flags = (dfd_bits >> 24) & 255; + + // See 3.10.1.Restrictions + if ((m_dfd_transfer_func != KTX2_KHR_DF_TRANSFER_LINEAR) && (m_dfd_transfer_func != KTX2_KHR_DF_TRANSFER_SRGB)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid DFD transfer function\n"); + return false; + } + + if (m_dfd_color_model == KTX2_KDF_DF_MODEL_ETC1S) + { + m_format = basist::basis_tex_format::cETC1S; + + // 3.10.2: "Whether the image has 1 or 2 slices can be determined from the DFD’s sample count." + // If m_has_alpha is true it may be 2-channel RRRG or 4-channel RGBA, but we let the caller deal with that. + m_has_alpha = (m_header.m_dfd_byte_length == 60); + + m_dfd_samples = m_has_alpha ? 2 : 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + + if (m_has_alpha) + { + const uint32_t sample_channel1 = basisu::read_le_dword(pDFD + 11 * sizeof(uint32_t)); + m_dfd_chan1 = (ktx2_df_channel_id)((sample_channel1 >> 24) & 15); + } + } + else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC) + { + m_format = basist::basis_tex_format::cUASTC4x4; + + m_dfd_samples = 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + + // We're assuming "DATA" means RGBA so it has alpha. 
+ m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); + } + else + { + // Unsupported DFD color model. + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Unsupported DFD color model\n"); + return false; + } + + if (!read_key_values()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: read_key_values() failed\n"); + return false; + } + + // Check for a KTXanimData key + for (uint32_t i = 0; i < m_key_values.size(); i++) + { + if (strcmp(reinterpret_cast<const char*>(m_key_values[i].m_key.data()), "KTXanimData") == 0) + { + m_is_video = true; break; + } } - return 4; + + return true; } - uint32_t basis_get_block_height(transcoder_texture_format tex_type) + uint32_t ktx2_transcoder::get_etc1s_image_descs_image_flags(uint32_t level_index, uint32_t layer_index, uint32_t face_index) const { - (void)tex_type; - return 4; + const uint32_t etc1s_image_index = + (level_index * basisu::maximum<uint32_t>(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * m_header.m_face_count + + face_index; + + if (etc1s_image_index >= get_etc1s_image_descs().size()) + { + assert(0); + return 0; + } + + return get_etc1s_image_descs()[etc1s_image_index].m_image_flags; + } + + const basisu::uint8_vec* ktx2_transcoder::find_key(const std::string& key_name) const + { + for (uint32_t i = 0; i < m_key_values.size(); i++) + if (strcmp((const char *)m_key_values[i].m_key.data(), key_name.c_str()) == 0) + return &m_key_values[i].m_value; + + return nullptr; } - bool basis_is_format_supported(transcoder_texture_format tex_type) + bool ktx2_transcoder::start_transcoding() { - switch (tex_type) + if (!m_pData) { - // ETC1 and uncompressed are always supported. - case transcoder_texture_format::cTFETC1_RGB: - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - return true; -#if BASISD_SUPPORT_DXT1 - case transcoder_texture_format::cTFBC1_RGB: - return true; -#endif -#if BASISD_SUPPORT_DXT5A - case transcoder_texture_format::cTFBC4_R: - case transcoder_texture_format::cTFBC5_RG: - return true; -#endif -#if BASISD_SUPPORT_DXT1 && BASISD_SUPPORT_DXT5A - case transcoder_texture_format::cTFBC3_RGBA: - return true; -#endif -#if BASISD_SUPPORT_PVRTC1 - case transcoder_texture_format::cTFPVRTC1_4_RGB: - case transcoder_texture_format::cTFPVRTC1_4_RGBA: - return true; -#endif -#if BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY - case transcoder_texture_format::cTFBC7_M6_RGB: - return true; -#endif -#if BASISD_SUPPORT_BC7_MODE5 - case transcoder_texture_format::cTFBC7_M5_RGBA: - return true; -#endif -#if BASISD_SUPPORT_ETC2_EAC_A8 - case transcoder_texture_format::cTFETC2_RGBA: - return true; -#endif -#if BASISD_SUPPORT_ASTC - case transcoder_texture_format::cTFASTC_4x4_RGBA: - return true; -#endif -#if BASISD_SUPPORT_ATC - case transcoder_texture_format::cTFATC_RGB: - case transcoder_texture_format::cTFATC_RGBA: - return true; -#endif -#if BASISD_SUPPORT_FXT1 - case transcoder_texture_format::cTFFXT1_RGB: - return true; + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: Must call init() first\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + { + // Check if we've already decompressed the ETC1S global data. If so don't unpack it again. 
+ if (!m_etc1s_transcoder.get_endpoints().empty()) + return true; + + if (!decompress_etc1s_global_data()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: decompress_etc1s_global_data() failed\n"); + return false; + } + + if (!m_is_video) + { + // See if there are any P-frames. If so it must be a video, even if there wasn't a KTXanimData key. + // Video cannot be a cubemap, and it must be a texture array. + if ((m_header.m_face_count == 1) && (m_header.m_layer_count > 1)) + { + for (uint32_t i = 0; i < m_etc1s_image_descs.size(); i++) + { + if (m_etc1s_image_descs[i].m_image_flags & KTX2_IMAGE_IS_P_FRAME) + { + m_is_video = true; + break; + } + } + } + } + } + else if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + { +#if !BASISD_SUPPORT_KTX2_ZSTD + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: File uses zstd supercompression, but zstd support was not enabled at compilation time (BASISD_SUPPORT_KTX2_ZSTD == 0)\n"); + return false; #endif -#if BASISD_SUPPORT_PVRTC2 - case transcoder_texture_format::cTFPVRTC2_4_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - return true; + } + + return true; + } + + bool ktx2_transcoder::get_image_level_info(ktx2_image_level_info& level_info, uint32_t level_index, uint32_t layer_index, uint32_t face_index) const + { + if (level_index >= m_levels.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: level_index >= m_levels.size()\n"); + return false; + } + + if (m_header.m_face_count > 1) + { + if (face_index >= 6) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: face_index >= 6\n"); + return false; + } + } + else if (face_index != 0) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: face_index != 0\n"); + return false; + } + + if (layer_index >= basisu::maximum<uint32_t>(m_header.m_layer_count, 1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: layer_index >= maximum<uint32_t>(m_header.m_layer_count, 1)\n"); + return false; + } + + const uint32_t level_width = basisu::maximum<uint32_t>(m_header.m_pixel_width >> level_index, 1); + const uint32_t level_height = basisu::maximum<uint32_t>(m_header.m_pixel_height >> level_index, 1); + const uint32_t num_blocks_x = (level_width + 3) >> 2; + const uint32_t num_blocks_y = (level_height + 3) >> 2; + + level_info.m_face_index = face_index; + level_info.m_layer_index = layer_index; + level_info.m_level_index = level_index; + level_info.m_orig_width = level_width; + level_info.m_orig_height = level_height; + level_info.m_width = num_blocks_x * 4; + level_info.m_height = num_blocks_y * 4; + level_info.m_num_blocks_x = num_blocks_x; + level_info.m_num_blocks_y = num_blocks_y; + level_info.m_total_blocks = num_blocks_x * num_blocks_y; + level_info.m_alpha_flag = m_has_alpha; + level_info.m_iframe_flag = false; + if (m_etc1s_image_descs.size()) + { + const uint32_t etc1s_image_index = + (level_index * basisu::maximum<uint32_t>(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * m_header.m_face_count + + face_index; + + level_info.m_iframe_flag = (m_etc1s_image_descs[etc1s_image_index].m_image_flags & KTX2_IMAGE_IS_P_FRAME) == 0; + } + + return true; + } + + bool ktx2_transcoder::transcode_image_level( + uint32_t level_index, uint32_t layer_index, uint32_t face_index, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + basist::transcoder_texture_format fmt, + uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, uint32_t output_rows_in_pixels, int 
channel0, int channel1, + ktx2_transcoder_state* pState) + { + if (!m_pData) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Must call init() first\n"); + return false; + } + + if (!pState) + pState = &m_def_transcoder_state; + + if (level_index >= m_levels.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: level_index >= m_levels.size()\n"); + return false; + } + + if (m_header.m_face_count > 1) + { + if (face_index >= 6) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: face_index >= 6\n"); + return false; + } + } + else if (face_index != 0) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: face_index != 0\n"); + return false; + } + + if (layer_index >= basisu::maximum<uint32_t>(m_header.m_layer_count, 1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: layer_index >= maximum<uint32_t>(m_header.m_layer_count, 1)\n"); + return false; + } + + const uint8_t* pComp_level_data = m_pData + m_levels[level_index].m_byte_offset; + uint64_t comp_level_data_size = m_levels[level_index].m_byte_length; + + const uint8_t* pUncomp_level_data = pComp_level_data; + uint64_t uncomp_level_data_size = comp_level_data_size; + + if (uncomp_level_data_size > UINT32_MAX) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_level_data_size > UINT32_MAX\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + { + // Check if we've already decompressed this level's supercompressed data. + if ((int)level_index != pState->m_uncomp_data_level_index) + { + // Uncompress the entire level's supercompressed data. + if (!decompress_level_data(level_index, pState->m_level_uncomp_data)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: decompress_level_data() failed\n"); + return false; + } + pState->m_uncomp_data_level_index = level_index; + } + + pUncomp_level_data = pState->m_level_uncomp_data.data(); + uncomp_level_data_size = pState->m_level_uncomp_data.size(); + } + + const uint32_t level_width = basisu::maximum<uint32_t>(m_header.m_pixel_width >> level_index, 1); + const uint32_t level_height = basisu::maximum<uint32_t>(m_header.m_pixel_height >> level_index, 1); + const uint32_t num_blocks_x = (level_width + 3) >> 2; + const uint32_t num_blocks_y = (level_height + 3) >> 2; + + if (m_format == basist::basis_tex_format::cETC1S) + { + // Ensure start_transcoding() was called. 
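The check that follows enforces the expected calling order on the ETC1S path: `start_transcoding()` must have decoded the global codebooks before any image can be transcoded. For reference, a minimal caller-side sketch of the whole sequence; this is an illustration rather than upstream sample code, `data`/`data_size` are assumed to hold an already-loaded .ktx2 file, and error handling and real buffer management are elided:

```cpp
#include <vector>
#include "basisu_transcoder.h"

bool transcode_first_image_to_bc7(const void* data, uint32_t data_size, std::vector<uint8_t>& out)
{
    basist::basisu_transcoder_init(); // one-time global table initialization

    // Global selector codebook, set up the same way the upstream samples do it.
    basist::etc1_global_selector_codebook sel_codebook(basist::g_global_selector_cb_size, basist::g_global_selector_cb);
    basist::ktx2_transcoder xcoder(&sel_codebook);

    if (!xcoder.init(data, data_size) || !xcoder.start_transcoding())
        return false;

    basist::ktx2_image_level_info info;
    if (!xcoder.get_image_level_info(info, 0 /*level*/, 0 /*layer*/, 0 /*face*/))
        return false;

    out.resize(info.m_total_blocks * 16); // BC7: 16 bytes per 4x4 block

    return xcoder.transcode_image_level(0, 0, 0, out.data(), info.m_total_blocks,
        basist::transcoder_texture_format::cTFBC7_RGBA,
        0 /*decode_flags*/, 0 /*row pitch*/, 0 /*rows in pixels*/,
        -1, -1 /*channel0, channel1*/, nullptr /*transcoder state*/);
}
```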
+ if (m_etc1s_transcoder.get_endpoints().empty()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: must call start_transcoding() first\n"); + return false; + } + + const uint32_t etc1s_image_index = + (level_index * basisu::maximum<uint32_t>(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * m_header.m_face_count + + face_index; + + // Sanity check + if (etc1s_image_index >= m_etc1s_image_descs.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: etc1s_image_index >= m_etc1s_image_descs.size()\n"); + assert(0); + return false; + } + + if (static_cast<uint32_t>(m_data_size) != m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: File is too large\n"); + return false; + } + + const ktx2_etc1s_image_desc& image_desc = m_etc1s_image_descs[etc1s_image_index]; + + if (!m_etc1s_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, m_pData, static_cast<uint32_t>(m_data_size), + num_blocks_x, num_blocks_y, level_width, level_height, + level_index, + m_levels[level_index].m_byte_offset + image_desc.m_rgb_slice_byte_offset, image_desc.m_rgb_slice_byte_length, + image_desc.m_alpha_slice_byte_length ? (m_levels[level_index].m_byte_offset + image_desc.m_alpha_slice_byte_offset) : 0, image_desc.m_alpha_slice_byte_length, + decode_flags, m_has_alpha, + m_is_video, output_row_pitch_in_blocks_or_pixels, &pState->m_transcoder_state, output_rows_in_pixels)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: ETC1S transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } + } + else if (m_format == basist::basis_tex_format::cUASTC4x4) + { + // Compute length and offset to uncompressed 2D UASTC texture data, given the face/layer indices. + assert(uncomp_level_data_size == m_levels[level_index].m_uncompressed_byte_length); + const uint32_t total_2D_image_size = num_blocks_x * num_blocks_y * KTX2_UASTC_BLOCK_SIZE; + + const uint32_t uncomp_ofs = (layer_index * m_header.m_face_count + face_index) * total_2D_image_size; + + // Sanity checks + if (uncomp_ofs >= uncomp_level_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_ofs >= total_2D_image_size\n"); + return false; + } + + if ((uncomp_level_data_size - uncomp_ofs) < total_2D_image_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: (uncomp_level_data_size - uncomp_ofs) < total_2D_image_size\n"); + return false; + } + + if (!m_uastc_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index, + 0, (uint32_t)total_2D_image_size, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } + } + else + { + // Shouldn't get here. 
+ BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Internal error\n"); + assert(0); + return false; + } + + return true; + } + + bool ktx2_transcoder::decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data) + { + const uint8_t* pComp_data = m_levels[level_index].m_byte_offset + m_pData; + const uint64_t comp_size = m_levels[level_index].m_byte_length; + + const uint64_t uncomp_size = m_levels[level_index].m_uncompressed_byte_length; + + if (((size_t)comp_size) != comp_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Compressed data too large\n"); + return false; + } + if (((size_t)uncomp_size) != uncomp_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Uncompressed data too large\n"); + return false; + } + + if (!uncomp_data.try_resize(uncomp_size)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Out of memory\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + { +#if BASISD_SUPPORT_KTX2_ZSTD + size_t actualUncompSize = ZSTD_decompress(uncomp_data.data(), (size_t)uncomp_size, pComp_data, (size_t)comp_size); + if (ZSTD_isError(actualUncompSize)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Zstd decompression failed, file is invalid or corrupted\n"); + return false; + } + if (actualUncompSize != uncomp_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Zstd decompression returned too few bytes, file is invalid or corrupted\n"); + return false; + } +#else + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: File uses Zstd supercompression, but Zstd support was not enabled at compile time (BASISD_SUPPORT_KTX2_ZSTD is 0)\n"); + return false; #endif -#if BASISD_SUPPORT_ETC2_EAC_RG11 - case transcoder_texture_format::cTFETC2_EAC_R11: - case transcoder_texture_format::cTFETC2_EAC_RG11: + } + + return true; + } + + bool ktx2_transcoder::decompress_etc1s_global_data() + { + // Note: we don't actually support 3D textures in here yet + //uint32_t layer_pixel_depth = basisu::maximum<uint32_t>(m_header.m_pixel_depth, 1); + //for (uint32_t i = 1; i < m_header.m_level_count; i++) + // layer_pixel_depth += basisu::maximum<uint32_t>(m_header.m_pixel_depth >> i, 1); + + const uint32_t image_count = basisu::maximum<uint32_t>(m_header.m_layer_count, 1) * m_header.m_face_count * m_header.m_level_count; + assert(image_count); + + const uint8_t* pSrc = m_pData + m_header.m_sgd_byte_offset; + + memcpy(&m_etc1s_header, pSrc, sizeof(ktx2_etc1s_global_data_header)); + pSrc += sizeof(ktx2_etc1s_global_data_header); + + if ((!m_etc1s_header.m_endpoints_byte_length) || (!m_etc1s_header.m_selectors_byte_length) || (!m_etc1s_header.m_tables_byte_length)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: Invalid ETC1S global data\n"); + return false; + } + + if ((!m_etc1s_header.m_endpoint_count) || (!m_etc1s_header.m_selector_count)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: endpoint and/or selector count is 0, file is invalid or corrupted\n"); + return false; + } + + // Sanity check the ETC1S header. 
+ if ((sizeof(ktx2_etc1s_global_data_header) + + sizeof(ktx2_etc1s_image_desc) * image_count + + m_etc1s_header.m_endpoints_byte_length + + m_etc1s_header.m_selectors_byte_length + + m_etc1s_header.m_tables_byte_length + + m_etc1s_header.m_extended_byte_length) > m_header.m_sgd_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: SGD byte length is too small, file is invalid or corrupted\n"); + return false; + } + + if (!m_etc1s_image_descs.try_resize(image_count)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: Out of memory\n"); + return false; + } + + memcpy(m_etc1s_image_descs.data(), pSrc, sizeof(ktx2_etc1s_image_desc) * image_count); + pSrc += sizeof(ktx2_etc1s_image_desc) * image_count; + + // Sanity check the ETC1S image descs + for (uint32_t i = 0; i < image_count; i++) + { + // m_etc1s_transcoder.transcode_image() will validate the slice offsets/lengths before transcoding. + + if (!m_etc1s_image_descs[i].m_rgb_slice_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: ETC1S image descs sanity check failed (1)\n"); + return false; + } + + if (m_has_alpha) + { + if (!m_etc1s_image_descs[i].m_alpha_slice_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: ETC1S image descs sanity check failed (2)\n"); + return false; + } + } + } + + const uint8_t* pEndpoint_data = pSrc; + const uint8_t* pSelector_data = pSrc + m_etc1s_header.m_endpoints_byte_length; + const uint8_t* pTables_data = pSrc + m_etc1s_header.m_endpoints_byte_length + m_etc1s_header.m_selectors_byte_length; + + if (!m_etc1s_transcoder.decode_tables(pTables_data, m_etc1s_header.m_tables_byte_length)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: decode_tables() failed, file is invalid or corrupted\n"); + return false; + } + + if (!m_etc1s_transcoder.decode_palettes( + m_etc1s_header.m_endpoint_count, pEndpoint_data, m_etc1s_header.m_endpoints_byte_length, + m_etc1s_header.m_selector_count, pSelector_data, m_etc1s_header.m_selectors_byte_length)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: decode_palettes() failed, file is likely corrupted\n"); + return false; + } + + return true; + } + + bool ktx2_transcoder::read_key_values() + { + if (!m_header.m_kvd_byte_length) + { + if (m_header.m_kvd_byte_offset) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset (it should be zero when the length is zero)\n"); + return false; + } + return true; -#endif - default: - break; } + if (m_header.m_kvd_byte_offset < sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset\n"); + return false; + } + + if ((m_header.m_kvd_byte_offset + m_header.m_kvd_byte_length) > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset and/or length\n"); + return false; + } + + const uint8_t* pSrc = m_pData + m_header.m_kvd_byte_offset; + uint32_t src_left = m_header.m_kvd_byte_length; + + if (!m_key_values.try_reserve(8)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } + + while (src_left > sizeof(uint32_t)) + { + uint32_t l = basisu::read_le_dword(pSrc); + + pSrc += sizeof(uint32_t); + src_left -= sizeof(uint32_t); + + if (l < 2) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (0)\n"); + return false; + } + + if (src_left < l) + { + 
BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (1)\n"); + return false; + } + + if (!m_key_values.try_resize(m_key_values.size() + 1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } + + basisu::uint8_vec& key_data = m_key_values.back().m_key; + basisu::uint8_vec& value_data = m_key_values.back().m_value; + + do + { + if (!l) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (2)\n"); + return false; + } + + if (!key_data.try_push_back(*pSrc++)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } + + src_left--; + l--; + + } while (key_data.back()); + + if (!value_data.try_resize(l)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } + + if (l) + { + memcpy(value_data.data(), pSrc, l); + pSrc += l; + src_left -= l; + } + + uint32_t ofs = (uint32_t)(pSrc - m_pData) & 3; + uint32_t alignment_bytes = (4 - ofs) & 3; + + if (src_left < alignment_bytes) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (3)\n"); + return false; + } + + pSrc += alignment_bytes; + src_left -= alignment_bytes; + } + + return true; + } + +#endif // BASISD_SUPPORT_KTX2 + + bool basisu_transcoder_supports_ktx2() + { +#if BASISD_SUPPORT_KTX2 + return true; +#else return false; +#endif } -} // namespace basist + bool basisu_transcoder_supports_ktx2_zstd() + { +#if BASISD_SUPPORT_KTX2_ZSTD + return true; +#else + return false; +#endif + } +} // namespace basist diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.h b/thirdparty/basis_universal/transcoder/basisu_transcoder.h index 770c64122d..bf3aed3dc3 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.h +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.h @@ -1,5 +1,5 @@ // basisu_transcoder.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,10 +15,25 @@ // limitations under the License. #pragma once -// Set BASISU_DEVEL_MESSAGES to 1 to enable debug printf()'s whenever an error occurs, for easier debugging during development. -//#define BASISU_DEVEL_MESSAGES 1 +// By default KTX2 support is enabled to simplify compilation. This implies the need for the Zstandard library (which we distribute as a single source file in the "zstd" directory) by default. +// Set BASISD_SUPPORT_KTX2 to 0 to completely disable KTX2 support as well as Zstd/miniz usage which is only required for UASTC supercompression in KTX2 files. +// Also see BASISD_SUPPORT_KTX2_ZSTD in basisu_transcoder.cpp, which individually disables Zstd usage. +#ifndef BASISD_SUPPORT_KTX2 + #define BASISD_SUPPORT_KTX2 1 +#endif + +// Set BASISD_SUPPORT_KTX2_ZSTD to 0 to disable Zstd usage and KTX2 UASTC Zstd supercompression support +#ifndef BASISD_SUPPORT_KTX2_ZSTD + #define BASISD_SUPPORT_KTX2_ZSTD 1 +#endif + +// Set BASISU_FORCE_DEVEL_MESSAGES to 1 to enable debug printf()'s whenever an error occurs, for easier debugging during development. 
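Because each of these knobs is wrapped in an `#ifndef` guard (including `BASISU_FORCE_DEVEL_MESSAGES` just below), they can be configured from the build system without editing the sources. A hypothetical configuration, illustrative only; whichever values are chosen must be applied both to the code that includes this header and to the compilation of `basisu_transcoder.cpp` itself, e.g. via `-D` flags:

```cpp
// Illustrative project-wide configuration (hypothetical): drop KTX2 container
// support (and with it the bundled Zstd) from a build that only reads .basis
// files, and turn on verbose developer error messages while debugging.
#define BASISD_SUPPORT_KTX2 0
#define BASISD_SUPPORT_KTX2_ZSTD 0
#define BASISU_FORCE_DEVEL_MESSAGES 1

#include "basisu_transcoder.h"
```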
+#ifndef BASISU_FORCE_DEVEL_MESSAGES + #define BASISU_FORCE_DEVEL_MESSAGES 0 +#endif #include "basisu_transcoder_internal.h" +#include "basisu_transcoder_uastc.h" #include "basisu_global_selector_palette.h" #include "basisu_file_headers.h" @@ -45,12 +60,11 @@ namespace basist cTFBC3_RGBA = 3, // Opaque+alpha, BC4 followed by a BC1 block, alpha channel will be opaque for opaque .basis files cTFBC4_R = 4, // Red only, alpha slice is transcoded to output if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified cTFBC5_RG = 5, // XY: Two BC4 blocks, X=R and Y=Alpha, .basis file should have alpha data (if not Y will be all 255's) - cTFBC7_M6_RGB = 6, // Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats. - cTFBC7_M5_RGBA = 7, // Opaque+alpha, alpha channel will be opaque for opaque .basis files + cTFBC7_RGBA = 6, // RGB or RGBA, mode 5 for ETC1S, modes (1,2,3,5,6,7) for UASTC // PVRTC1 4bpp (mobile, PowerVR devices) cTFPVRTC1_4_RGB = 8, // Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified, nearly lowest quality of any texture format. - cTFPVRTC1_4_RGBA = 9, // Opaque+alpha, most useful for simple opacity maps. If .basis file doens't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format. + cTFPVRTC1_4_RGBA = 9, // Opaque+alpha, most useful for simple opacity maps. If .basis file doesn't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format. // ASTC (mobile, Intel devices, hopefully all desktop GPU's one day) cTFASTC_4x4_RGBA = 10, // Opaque+alpha, ASTC 4x4, alpha channel will be opaque for opaque .basis files. Transcoder uses RGB/RGBA/L/LA modes, void extent, and up to two ([0,47] and [0,255]) endpoint precisions. @@ -69,10 +83,10 @@ namespace basist cTFETC2_EAC_R11 = 20, // R only (ETC2 EAC R11 unsigned) cTFETC2_EAC_RG11 = 21, // RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps - + // Uncompressed (raw pixel) formats cTFRGBA32 = 13, // 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte. - cTFRGB565 = 14, // 166pp RGB image stored in raster (not block) order in memory, R at bit position 11 + cTFRGB565 = 14, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11 cTFBGR565 = 15, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0 cTFRGBA4444 = 16, // 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0 @@ -85,27 +99,62 @@ namespace basist cTFBC3 = cTFBC3_RGBA, cTFBC4 = cTFBC4_R, cTFBC5 = cTFBC5_RG, - cTFBC7_M6_OPAQUE_ONLY = cTFBC7_M6_RGB, - cTFBC7_M5 = cTFBC7_M5_RGBA, + + // Previously, the caller had some control over which BC7 mode the transcoder output. We've simplified this due to UASTC, which supports numerous modes. + cTFBC7_M6_RGB = cTFBC7_RGBA, // Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats. 
+ cTFBC7_M5_RGBA = cTFBC7_RGBA, // Opaque+alpha, alpha channel will be opaque for opaque .basis files + cTFBC7_M6_OPAQUE_ONLY = cTFBC7_RGBA, + cTFBC7_M5 = cTFBC7_RGBA, + cTFBC7_ALT = 7, + cTFASTC_4x4 = cTFASTC_4x4_RGBA, + cTFATC_RGBA_INTERPOLATED_ALPHA = cTFATC_RGBA, }; - uint32_t basis_get_bytes_per_block(transcoder_texture_format fmt); + // For compressed texture formats, this returns the # of bytes per block. For uncompressed, it returns the # of bytes per pixel. + // NOTE: Previously, this function was called basis_get_bytes_per_block(), and it always returned 16*bytes_per_pixel for uncompressed formats which was confusing. + uint32_t basis_get_bytes_per_block_or_pixel(transcoder_texture_format fmt); + + // Returns format's name in ASCII const char* basis_get_format_name(transcoder_texture_format fmt); + + // Returns block format name in ASCII + const char* basis_get_block_format_name(block_format fmt); + + // Returns true if the format supports an alpha channel. bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt); + + // Returns the basisu::texture_format corresponding to the specified transcoder_texture_format. basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt); + + // Returns the texture type's name in ASCII. const char* basis_get_texture_type_name(basis_texture_type tex_type); - + + // Returns true if the transcoder texture type is an uncompressed (raw pixel) format. bool basis_transcoder_format_is_uncompressed(transcoder_texture_format tex_type); + + // Returns the # of bytes per pixel for uncompressed formats, or 0 for block texture formats. uint32_t basis_get_uncompressed_bytes_per_pixel(transcoder_texture_format fmt); - + + // Returns the block width for the specified texture format, which is currently either 4 or 8 for FXT1. uint32_t basis_get_block_width(transcoder_texture_format tex_type); + + // Returns the block height for the specified texture format, which is currently always 4. uint32_t basis_get_block_height(transcoder_texture_format tex_type); // Returns true if the specified format was enabled at compile time. - bool basis_is_format_supported(transcoder_texture_format tex_type); - + bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt = basis_tex_format::cETC1S); + + // Validates that the output buffer is large enough to hold the entire transcoded texture. + // For uncompressed texture formats, most input parameters are in pixels, not blocks. Blocks are 4x4 pixels. + bool basis_validate_output_buffer_size(transcoder_texture_format target_format, + uint32_t output_blocks_buf_size_in_blocks_or_pixels, + uint32_t orig_width, uint32_t orig_height, + uint32_t output_row_pitch_in_blocks_or_pixels, + uint32_t output_rows_in_pixels, + uint32_t total_slice_blocks); + class basisu_transcoder; // This struct holds all state used during transcoding. For video, it needs to persist between image transcodes (it holds the previous frame). 
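A short sketch of how a caller can use the query helpers declared above to size an output buffer before transcoding; illustrative only, with `fmt` and the image dimensions assumed to come from the caller (for example from the image or level info structs):

```cpp
#include <cstdint>
#include "basisu_transcoder.h"

// Illustrative only: bytes required to hold one transcoded image in `fmt`.
uint32_t bytes_needed(basist::transcoder_texture_format fmt, uint32_t orig_width, uint32_t orig_height)
{
    if (basist::basis_transcoder_format_is_uncompressed(fmt))
        return orig_width * orig_height * basist::basis_get_uncompressed_bytes_per_pixel(fmt);

    // Block-compressed target: round the dimensions up to whole blocks.
    const uint32_t bw = basist::basis_get_block_width(fmt);
    const uint32_t bh = basist::basis_get_block_height(fmt);
    const uint32_t blocks_x = (orig_width + bw - 1) / bw;
    const uint32_t blocks_y = (orig_height + bh - 1) / bh;
    return blocks_x * blocks_y * basist::basis_get_bytes_per_block_or_pixel(fmt);
}
```

The header also exposes `basis_validate_output_buffer_size()`, which performs this kind of check given the original dimensions, pitch, and total slice blocks.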
@@ -118,46 +167,161 @@ namespace basist uint8_t m_pred_bits; }; - std::vector<block_preds> m_block_endpoint_preds[2]; - + basisu::vector<block_preds> m_block_endpoint_preds[2]; + enum { cMaxPrevFrameLevels = 16 }; - std::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] + basisu::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] + + void clear() + { + for (uint32_t i = 0; i < 2; i++) + { + m_block_endpoint_preds[i].clear(); + + for (uint32_t j = 0; j < cMaxPrevFrameLevels; j++) + m_prev_frame_indices[i][j].clear(); + } + } }; - + // Low-level helper class that does the actual transcoding. - class basisu_lowlevel_transcoder + class basisu_lowlevel_etc1s_transcoder { friend class basisu_transcoder; - + public: - basisu_lowlevel_transcoder(const basist::etc1_global_selector_codebook *pGlobal_sel_codebook); + basisu_lowlevel_etc1s_transcoder(const basist::etc1_global_selector_codebook* pGlobal_sel_codebook); + + void set_global_codebooks(const basisu_lowlevel_etc1s_transcoder* pGlobal_codebook) { m_pGlobal_codebook = pGlobal_codebook; } + const basisu_lowlevel_etc1s_transcoder* get_global_codebooks() const { return m_pGlobal_codebook; } bool decode_palettes( - uint32_t num_endpoints, const uint8_t *pEndpoints_data, uint32_t endpoints_data_size, - uint32_t num_selectors, const uint8_t *pSelectors_data, uint32_t selectors_data_size); + uint32_t num_endpoints, const uint8_t* pEndpoints_data, uint32_t endpoints_data_size, + uint32_t num_selectors, const uint8_t* pSelectors_data, uint32_t selectors_data_size); + + bool decode_tables(const uint8_t* pTable_data, uint32_t table_data_size); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0) + { + return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, + header.m_tex_type == cBASISTexTypeVideoFrames, (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0, slice_desc.m_level_index, + slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, pState, + astc_transcode_alpha, + pAlpha_blocks, + output_rows_in_pixels); + } + + // Container independent transcoding + bool transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, 
uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t rgb_offset, uint32_t rgb_length, uint32_t alpha_offset, uint32_t alpha_length, + uint32_t decode_flags = 0, + bool basis_file_has_alpha_slices = false, + bool is_video = false, + uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, + uint32_t output_rows_in_pixels = 0); + + void clear() + { + m_local_endpoints.clear(); + m_local_selectors.clear(); + m_endpoint_pred_model.clear(); + m_delta_endpoint_model.clear(); + m_selector_model.clear(); + m_selector_history_buf_rle_model.clear(); + m_selector_history_buf_size = 0; + } - bool decode_tables(const uint8_t *pTable_data, uint32_t table_data_size); + // Low-level methods + typedef basisu::vector<endpoint> endpoint_vec; + const endpoint_vec& get_endpoints() const { return m_local_endpoints; } - bool transcode_slice(void *pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t *pImage_data, uint32_t image_data_size, block_format fmt, - uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header &header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, - basisu_transcoder_state *pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0); + typedef basisu::vector<selector> selector_vec; + const selector_vec& get_selectors() const { return m_local_selectors; } + + const etc1_global_selector_codebook* get_global_sel_codebook() const { return m_pGlobal_sel_codebook; } private: - typedef std::vector<endpoint> endpoint_vec; - endpoint_vec m_endpoints; + const basisu_lowlevel_etc1s_transcoder* m_pGlobal_codebook; - typedef std::vector<selector> selector_vec; - selector_vec m_selectors; + endpoint_vec m_local_endpoints; + selector_vec m_local_selectors; - const etc1_global_selector_codebook *m_pGlobal_sel_codebook; + const etc1_global_selector_codebook* m_pGlobal_sel_codebook; huffman_decoding_table m_endpoint_pred_model, m_delta_endpoint_model, m_selector_model, m_selector_history_buf_rle_model; uint32_t m_selector_history_buf_size; - + basisu_transcoder_state m_def_state; }; + enum basisu_decode_flags + { + // PVRTC1: decode non-pow2 ETC1S texture level to the next larger power of 2 (not implemented yet, but we're going to support it). Ignored if the slice's dimensions are already a power of 2. + cDecodeFlagsPVRTCDecodeToNextPow2 = 2, + + // When decoding to an opaque texture format, if the basis file has alpha, decode the alpha slice instead of the color slice to the output texture format. + // This is primarily to allow decoding of textures with alpha to multiple ETC1 textures (one for color, another for alpha). + cDecodeFlagsTranscodeAlphaDataToOpaqueFormats = 4, + + // Forbid usage of BC1 3 color blocks (we don't support BC1 punchthrough alpha yet). + // This flag is used internally when decoding to BC3. + cDecodeFlagsBC1ForbidThreeColorBlocks = 8, + + // The output buffer contains alpha endpoint/selector indices. + // Used internally when decoding formats like ASTC that require both color and alpha data to be available when transcoding to the output format. 
+ cDecodeFlagsOutputHasAlphaIndices = 16, + + cDecodeFlagsHighQuality = 32 + }; + + class basisu_lowlevel_uastc_transcoder + { + friend class basisu_transcoder; + + public: + basisu_lowlevel_uastc_transcoder(); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0) + { + return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, + output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, + pState, output_rows_in_pixels, channel0, channel1, decode_flags); + } + + // Container independent transcoding + bool transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags = 0, + bool has_alpha = false, + bool is_video = false, + uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, + uint32_t output_rows_in_pixels = 0, + int channel0 = -1, int channel1 = -1); + }; + struct basisu_slice_info { uint32_t m_orig_width; @@ -175,19 +339,19 @@ namespace basist uint32_t m_slice_index; // the slice index in the .basis file uint32_t m_image_index; // the source image index originally provided to the encoder uint32_t m_level_index; // the mipmap level within this image - + uint32_t m_unpacked_slice_crc16; - + bool m_alpha_flag; // true if the slice has alpha data bool m_iframe_flag; // true if the slice is an I-Frame }; - typedef std::vector<basisu_slice_info> basisu_slice_info_vec; + typedef basisu::vector<basisu_slice_info> basisu_slice_info_vec; struct basisu_image_info { uint32_t m_image_index; - uint32_t m_total_levels; + uint32_t m_total_levels; uint32_t m_orig_width; uint32_t m_orig_height; @@ -199,8 +363,8 @@ namespace basist uint32_t m_num_blocks_y; uint32_t m_total_blocks; - uint32_t m_first_slice_index; - + uint32_t m_first_slice_index; + bool m_alpha_flag; // true if the image has alpha data bool m_iframe_flag; // true if the image is an I-Frame }; @@ -220,8 +384,13 @@ namespace basist uint32_t m_num_blocks_y; uint32_t m_total_blocks; - uint32_t m_first_slice_index; - + uint32_t m_first_slice_index; + + uint32_t m_rgb_file_ofs; + uint32_t m_rgb_file_len; + uint32_t m_alpha_file_ofs; + 
uint32_t m_alpha_file_len; + bool m_alpha_flag; // true if the image has alpha data bool m_iframe_flag; // true if the image is an I-Frame }; @@ -232,13 +401,19 @@ namespace basist uint32_t m_total_header_size; uint32_t m_total_selectors; + // will be 0 for UASTC or if the file uses global codebooks + uint32_t m_selector_codebook_ofs; uint32_t m_selector_codebook_size; uint32_t m_total_endpoints; + // will be 0 for UASTC or if the file uses global codebooks + uint32_t m_endpoint_codebook_ofs; uint32_t m_endpoint_codebook_size; + uint32_t m_tables_ofs; uint32_t m_tables_size; - uint32_t m_slices_size; + + uint32_t m_slices_size; basis_texture_type m_tex_type; uint32_t m_us_per_frame; @@ -247,14 +422,16 @@ namespace basist basisu_slice_info_vec m_slice_info; uint32_t m_total_images; // total # of images - std::vector<uint32_t> m_image_mipmap_levels; // the # of mipmap levels for each image + basisu::vector<uint32_t> m_image_mipmap_levels; // the # of mipmap levels for each image uint32_t m_userdata0; uint32_t m_userdata1; - - bool m_etc1s; // always true for basis universal + + basis_tex_format m_tex_format; // ETC1S, UASTC, etc. + bool m_y_flipped; // true if the image was Y flipped - bool m_has_alpha_slices; // true if the texture has alpha slices (even slices RGB, odd slices alpha) + bool m_etc1s; // true if the file is ETC1S + bool m_has_alpha_slices; // true if the texture has alpha slices (for ETC1S: even slices RGB, odd slices alpha) }; // High-level transcoder class which accepts .basis file data and allows the caller to query information about the file and transcode image levels to various texture formats. @@ -265,81 +442,67 @@ namespace basist basisu_transcoder& operator= (const basisu_transcoder&); public: - basisu_transcoder(const etc1_global_selector_codebook *pGlobal_sel_codebook); + basisu_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook); // Validates the .basis file. This computes a crc16 over the entire file, so it's slow. - bool validate_file_checksums(const void *pData, uint32_t data_size, bool full_validation) const; + bool validate_file_checksums(const void* pData, uint32_t data_size, bool full_validation) const; // Quick header validation - no crc16 checks. - bool validate_header(const void *pData, uint32_t data_size) const; + bool validate_header(const void* pData, uint32_t data_size) const; + + basis_texture_type get_texture_type(const void* pData, uint32_t data_size) const; + bool get_userdata(const void* pData, uint32_t data_size, uint32_t& userdata0, uint32_t& userdata1) const; - basis_texture_type get_texture_type(const void *pData, uint32_t data_size) const; - bool get_userdata(const void *pData, uint32_t data_size, uint32_t &userdata0, uint32_t &userdata1) const; - // Returns the total number of images in the basis file (always 1 or more). // Note that the number of mipmap levels for each image may differ, and that images may have different resolutions. - uint32_t get_total_images(const void *pData, uint32_t data_size) const; + uint32_t get_total_images(const void* pData, uint32_t data_size) const; + + basis_tex_format get_tex_format(const void* pData, uint32_t data_size) const; // Returns the number of mipmap levels in an image. - uint32_t get_total_image_levels(const void *pData, uint32_t data_size, uint32_t image_index) const; - + uint32_t get_total_image_levels(const void* pData, uint32_t data_size, uint32_t image_index) const; + // Returns basic information about an image. Note that orig_width/orig_height may not be a multiple of 4. 
- bool get_image_level_desc(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t &orig_width, uint32_t &orig_height, uint32_t &total_blocks) const; + bool get_image_level_desc(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t& orig_width, uint32_t& orig_height, uint32_t& total_blocks) const; // Returns information about the specified image. - bool get_image_info(const void *pData, uint32_t data_size, basisu_image_info &image_info, uint32_t image_index) const; + bool get_image_info(const void* pData, uint32_t data_size, basisu_image_info& image_info, uint32_t image_index) const; // Returns information about the specified image's mipmap level. - bool get_image_level_info(const void *pData, uint32_t data_size, basisu_image_level_info &level_info, uint32_t image_index, uint32_t level_index) const; - + bool get_image_level_info(const void* pData, uint32_t data_size, basisu_image_level_info& level_info, uint32_t image_index, uint32_t level_index) const; + // Get a description of the basis file and low-level information about each slice. - bool get_file_info(const void *pData, uint32_t data_size, basisu_file_info &file_info) const; - + bool get_file_info(const void* pData, uint32_t data_size, basisu_file_info& file_info) const; + // start_transcoding() must be called before calling transcode_slice() or transcode_image_level(). - // This decompresses the selector/endpoint codebooks, so ideally you would only call this once per .basis file (not each image/mipmap level). - bool start_transcoding(const void *pData, uint32_t data_size) const; - + // For ETC1S files, this call decompresses the selector/endpoint codebooks, so ideally you would only call this once per .basis file (not each image/mipmap level). + bool start_transcoding(const void* pData, uint32_t data_size); + + bool stop_transcoding(); + // Returns true if start_transcoding() has been called. - bool get_ready_to_transcode() const { return m_lowlevel_decoder.m_endpoints.size() > 0; } + bool get_ready_to_transcode() const { return m_ready_to_transcode; } - enum - { - // PVRTC1: decode non-pow2 ETC1S texture level to the next larger power of 2 (not implemented yet, but we're going to support it). Ignored if the slice's dimensions are already a power of 2. - cDecodeFlagsPVRTCDecodeToNextPow2 = 2, - - // When decoding to an opaque texture format, if the basis file has alpha, decode the alpha slice instead of the color slice to the output texture format. - // This is primarily to allow decoding of textures with alpha to multiple ETC1 textures (one for color, another for alpha). - cDecodeFlagsTranscodeAlphaDataToOpaqueFormats = 4, - - // Forbid usage of BC1 3 color blocks (we don't support BC1 punchthrough alpha yet). - // This flag is used internally when decoding to BC3. - cDecodeFlagsBC1ForbidThreeColorBlocks = 8, - - // The output buffer contains alpha endpoint/selector indices. - // Used internally when decoding formats like ASTC that require both color and alpha data to be available when transcoding to the output format. - cDecodeFlagsOutputHasAlphaIndices = 16 - }; - // transcode_image_level() decodes a single mipmap level from the .basis file to any of the supported output texture formats. // It'll first find the slice(s) to transcode, then call transcode_slice() one or two times to decode both the color and alpha texture data (or RG texture data from two slices for BC5). 
// If the .basis file doesn't have alpha slices, the output alpha blocks will be set to fully opaque (all 255's). // Currently, to decode to PVRTC1 the basis texture's dimensions in pixels must be a power of 2, due to PVRTC1 format requirements. // output_blocks_buf_size_in_blocks_or_pixels should be at least the image level's total_blocks (num_blocks_x * num_blocks_y), or the total number of output pixels if fmt==cTFRGBA32. // output_row_pitch_in_blocks_or_pixels: Number of blocks or pixels per row. If 0, the transcoder uses the slice's num_blocks_x or orig_width (NOT num_blocks_x * 4). Ignored for PVRTC1 (due to texture swizzling). - // output_rows_in_pixels: Ignored unless fmt is cRGBA32. The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4). + // output_rows_in_pixels: Ignored unless fmt is uncompressed (cRGBA32, etc.). The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4). // Notes: // - basisu_transcoder_init() must have been called first to initialize the transcoder lookup tables before calling this function. // - This method assumes the output texture buffer is readable. In some cases to handle alpha, the transcoder will write temporary data to the output texture in // a first pass, which will be read in a second pass. bool transcode_image_level( - const void *pData, uint32_t data_size, - uint32_t image_index, uint32_t level_index, - void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const void* pData, uint32_t data_size, + uint32_t image_index, uint32_t level_index, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, transcoder_texture_format fmt, - uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state *pState = nullptr, uint32_t output_rows_in_pixels = 0) const; + uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0) const; // Finds the basis slice corresponding to the specified image/level/alpha params, or -1 if the slice can't be found. - int find_slice(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const; + int find_slice(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const; // transcode_slice() decodes a single slice from the .basis file. It's a low-level API - most likely you want to use transcode_image_level(). // This is a low-level API, and will be needed to be called multiple times to decode some texture formats (like BC3, BC5, or ETC2). @@ -350,21 +513,39 @@ namespace basist // output_rows_in_pixels: Ignored unless fmt is cRGBA32. The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4). // Notes: // - basisu_transcoder_init() must have been called first to initialize the transcoder lookup tables before calling this function. 
- bool transcode_slice(const void *pData, uint32_t data_size, uint32_t slice_index, - void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, - block_format fmt, uint32_t output_block_stride_in_bytes, uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state * pState = nullptr, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0) const; + bool transcode_slice(const void* pData, uint32_t data_size, uint32_t slice_index, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + block_format fmt, uint32_t output_block_stride_in_bytes, uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state* pState = nullptr, void* pAlpha_blocks = nullptr, + uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1) const; + + static void write_opaque_alpha_blocks( + uint32_t num_blocks_x, uint32_t num_blocks_y, + void* pOutput_blocks, block_format fmt, + uint32_t block_stride_in_bytes, uint32_t output_row_pitch_in_blocks_or_pixels); + + void set_global_codebooks(const basisu_lowlevel_etc1s_transcoder* pGlobal_codebook) { m_lowlevel_etc1s_decoder.set_global_codebooks(pGlobal_codebook); } + const basisu_lowlevel_etc1s_transcoder* get_global_codebooks() const { return m_lowlevel_etc1s_decoder.get_global_codebooks(); } + + const basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() const { return m_lowlevel_etc1s_decoder; } + basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() { return m_lowlevel_etc1s_decoder; } + + const basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() const { return m_lowlevel_uastc_decoder; } + basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() { return m_lowlevel_uastc_decoder; } private: - mutable basisu_lowlevel_transcoder m_lowlevel_decoder; + mutable basisu_lowlevel_etc1s_transcoder m_lowlevel_etc1s_decoder; + mutable basisu_lowlevel_uastc_transcoder m_lowlevel_uastc_decoder; + + bool m_ready_to_transcode; int find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const; - + bool validate_header_quick(const void* pData, uint32_t data_size) const; }; - // basisu_transcoder_init() must be called before a .basis file can be transcoded. + // basisu_transcoder_init() MUST be called before a .basis file can be transcoded. void basisu_transcoder_init(); - + enum debug_flags_t { cDebugFlagVisCRs = 1, @@ -374,4 +555,387 @@ namespace basist uint32_t get_debug_flags(); void set_debug_flags(uint32_t f); + // ------------------------------------------------------------------------------------------------------ + // Optional .KTX2 file format support + // KTX2 reading optionally requires miniz or Zstd decompressors for supercompressed UASTC files. 
+ // ------------------------------------------------------------------------------------------------------ +#if BASISD_SUPPORT_KTX2 +#pragma pack(push) +#pragma pack(1) + struct ktx2_header + { + uint8_t m_identifier[12]; + basisu::packed_uint<4> m_vk_format; + basisu::packed_uint<4> m_type_size; + basisu::packed_uint<4> m_pixel_width; + basisu::packed_uint<4> m_pixel_height; + basisu::packed_uint<4> m_pixel_depth; + basisu::packed_uint<4> m_layer_count; + basisu::packed_uint<4> m_face_count; + basisu::packed_uint<4> m_level_count; + basisu::packed_uint<4> m_supercompression_scheme; + basisu::packed_uint<4> m_dfd_byte_offset; + basisu::packed_uint<4> m_dfd_byte_length; + basisu::packed_uint<4> m_kvd_byte_offset; + basisu::packed_uint<4> m_kvd_byte_length; + basisu::packed_uint<8> m_sgd_byte_offset; + basisu::packed_uint<8> m_sgd_byte_length; + }; + + struct ktx2_level_index + { + basisu::packed_uint<8> m_byte_offset; + basisu::packed_uint<8> m_byte_length; + basisu::packed_uint<8> m_uncompressed_byte_length; + }; + + struct ktx2_etc1s_global_data_header + { + basisu::packed_uint<2> m_endpoint_count; + basisu::packed_uint<2> m_selector_count; + basisu::packed_uint<4> m_endpoints_byte_length; + basisu::packed_uint<4> m_selectors_byte_length; + basisu::packed_uint<4> m_tables_byte_length; + basisu::packed_uint<4> m_extended_byte_length; + }; + + struct ktx2_etc1s_image_desc + { + basisu::packed_uint<4> m_image_flags; + basisu::packed_uint<4> m_rgb_slice_byte_offset; + basisu::packed_uint<4> m_rgb_slice_byte_length; + basisu::packed_uint<4> m_alpha_slice_byte_offset; + basisu::packed_uint<4> m_alpha_slice_byte_length; + }; + + struct ktx2_animdata + { + basisu::packed_uint<4> m_duration; + basisu::packed_uint<4> m_timescale; + basisu::packed_uint<4> m_loopcount; + }; +#pragma pack(pop) + + const uint32_t KTX2_VK_FORMAT_UNDEFINED = 0; + const uint32_t KTX2_KDF_DF_MODEL_UASTC = 166; + const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163; + const uint32_t KTX2_IMAGE_IS_P_FRAME = 2; + const uint32_t KTX2_UASTC_BLOCK_SIZE = 16; + const uint32_t KTX2_MAX_SUPPORTED_LEVEL_COUNT = 16; // this is an implementation specific constraint and can be increased + + // The KTX2 transfer functions supported by KTX2 + const uint32_t KTX2_KHR_DF_TRANSFER_LINEAR = 1; + const uint32_t KTX2_KHR_DF_TRANSFER_SRGB = 2; + + enum ktx2_supercompression + { + KTX2_SS_NONE = 0, + KTX2_SS_BASISLZ = 1, + KTX2_SS_ZSTANDARD = 2 + }; + + extern const uint8_t g_ktx2_file_identifier[12]; + + enum ktx2_df_channel_id + { + KTX2_DF_CHANNEL_ETC1S_RGB = 0U, + KTX2_DF_CHANNEL_ETC1S_RRR = 3U, + KTX2_DF_CHANNEL_ETC1S_GGG = 4U, + KTX2_DF_CHANNEL_ETC1S_AAA = 15U, + + KTX2_DF_CHANNEL_UASTC_DATA = 0U, + KTX2_DF_CHANNEL_UASTC_RGB = 0U, + KTX2_DF_CHANNEL_UASTC_RGBA = 3U, + KTX2_DF_CHANNEL_UASTC_RRR = 4U, + KTX2_DF_CHANNEL_UASTC_RRRG = 5U, + KTX2_DF_CHANNEL_UASTC_RG = 6U, + }; + + inline const char* ktx2_get_etc1s_df_channel_id_str(ktx2_df_channel_id id) + { + switch (id) + { + case KTX2_DF_CHANNEL_ETC1S_RGB: return "RGB"; + case KTX2_DF_CHANNEL_ETC1S_RRR: return "RRR"; + case KTX2_DF_CHANNEL_ETC1S_GGG: return "GGG"; + case KTX2_DF_CHANNEL_ETC1S_AAA: return "AAA"; + default: break; + } + return "?"; + } + + inline const char* ktx2_get_uastc_df_channel_id_str(ktx2_df_channel_id id) + { + switch (id) + { + case KTX2_DF_CHANNEL_UASTC_RGB: return "RGB"; + case KTX2_DF_CHANNEL_UASTC_RGBA: return "RGBA"; + case KTX2_DF_CHANNEL_UASTC_RRR: return "RRR"; + case KTX2_DF_CHANNEL_UASTC_RRRG: return "RRRG"; + case KTX2_DF_CHANNEL_UASTC_RG: return "RG"; + default: 
break; + } + return "?"; + } + + enum ktx2_df_color_primaries + { + KTX2_DF_PRIMARIES_UNSPECIFIED = 0, + KTX2_DF_PRIMARIES_BT709 = 1, + KTX2_DF_PRIMARIES_SRGB = 1, + KTX2_DF_PRIMARIES_BT601_EBU = 2, + KTX2_DF_PRIMARIES_BT601_SMPTE = 3, + KTX2_DF_PRIMARIES_BT2020 = 4, + KTX2_DF_PRIMARIES_CIEXYZ = 5, + KTX2_DF_PRIMARIES_ACES = 6, + KTX2_DF_PRIMARIES_ACESCC = 7, + KTX2_DF_PRIMARIES_NTSC1953 = 8, + KTX2_DF_PRIMARIES_PAL525 = 9, + KTX2_DF_PRIMARIES_DISPLAYP3 = 10, + KTX2_DF_PRIMARIES_ADOBERGB = 11 + }; + + inline const char* ktx2_get_df_color_primaries_str(ktx2_df_color_primaries p) + { + switch (p) + { + case KTX2_DF_PRIMARIES_UNSPECIFIED: return "UNSPECIFIED"; + case KTX2_DF_PRIMARIES_BT709: return "BT709"; + case KTX2_DF_PRIMARIES_BT601_EBU: return "EBU"; + case KTX2_DF_PRIMARIES_BT601_SMPTE: return "SMPTE"; + case KTX2_DF_PRIMARIES_BT2020: return "BT2020"; + case KTX2_DF_PRIMARIES_CIEXYZ: return "CIEXYZ"; + case KTX2_DF_PRIMARIES_ACES: return "ACES"; + case KTX2_DF_PRIMARIES_ACESCC: return "ACESCC"; + case KTX2_DF_PRIMARIES_NTSC1953: return "NTSC1953"; + case KTX2_DF_PRIMARIES_PAL525: return "PAL525"; + case KTX2_DF_PRIMARIES_DISPLAYP3: return "DISPLAYP3"; + case KTX2_DF_PRIMARIES_ADOBERGB: return "ADOBERGB"; + default: break; + } + return "?"; + } + + // Information about a single 2D texture "image" in a KTX2 file. + struct ktx2_image_level_info + { + // The mipmap level index (0=largest), texture array layer index, and cubemap face index of the image. + uint32_t m_level_index; + uint32_t m_layer_index; + uint32_t m_face_index; + + // The image's actual (or the original source image's) width/height in pixels, which may not be divisible by 4 pixels. + uint32_t m_orig_width; + uint32_t m_orig_height; + + // The image's physical width/height, which will always be divisible by 4 pixels. + uint32_t m_width; + uint32_t m_height; + + // The texture's dimensions in 4x4 texel blocks. + uint32_t m_num_blocks_x; + uint32_t m_num_blocks_y; + + // The total number of blocks + uint32_t m_total_blocks; + + // true if the image has alpha data + bool m_alpha_flag; + + // true if the image is an I-Frame. Currently, for ETC1S textures, the first frame will always be an I-Frame, and subsequent frames will always be P-Frames. + bool m_iframe_flag; + }; + + // Thread-specific ETC1S/supercompressed UASTC transcoder state. (If you're not doing multithreading transcoding you can ignore this.) + struct ktx2_transcoder_state + { + basist::basisu_transcoder_state m_transcoder_state; + basisu::uint8_vec m_level_uncomp_data; + int m_uncomp_data_level_index; + + void clear() + { + m_transcoder_state.clear(); + m_level_uncomp_data.clear(); + m_uncomp_data_level_index = -1; + } + }; + + // This class is quite similar to basisu_transcoder. It treats KTX2 files as a simple container for ETC1S/UASTC texture data. + // It does not support 1D or 3D textures. + // It only supports 2D and cubemap textures, with or without mipmaps, texture arrays of 2D/cubemap textures, and texture video files. + // It only supports raw non-supercompressed UASTC, ETC1S, UASTC+Zstd, or UASTC+zlib compressed files. + // DFD (Data Format Descriptor) parsing is purposely as simple as possible. + // If you need to know how to interpret the texture channels you'll need to parse the DFD yourself after calling get_dfd(). + class ktx2_transcoder + { + public: + ktx2_transcoder(basist::etc1_global_selector_codebook* pGlobal_sel_codebook); + + // Frees all allocations, resets object. 
+ void clear(); + + // init() parses the KTX2 header, level index array, DFD, and key values, but nothing else. + // Importantly, it does not parse or decompress the ETC1S global supercompressed data, so some things (like which frames are I/P-Frames) won't be available until start_transcoding() is called. + // This method holds a pointer to the file data until clear() is called. + bool init(const void* pData, uint32_t data_size); + + // Returns the data/size passed to init(). + const uint8_t* get_data() const { return m_pData; } + uint32_t get_data_size() const { return m_data_size; } + + // Returns the KTX2 header. Valid after init(). + const ktx2_header& get_header() const { return m_header; } + + // Returns the KTX2 level index array. There will be one entry for each mipmap level. Valid after init(). + const basisu::vector<ktx2_level_index>& get_level_index() const { return m_levels; } + + // Returns the texture's width in texels. Always non-zero, might not be divisible by 4. Valid after init(). + uint32_t get_width() const { return m_header.m_pixel_width; } + + // Returns the texture's height in texels. Always non-zero, might not be divisible by 4. Valid after init(). + uint32_t get_height() const { return m_header.m_pixel_height; } + + // Returns the texture's number of mipmap levels. Always returns 1 or higher. Valid after init(). + uint32_t get_levels() const { return m_header.m_level_count; } + + // Returns the number of faces. Returns 1 for 2D textures and or 6 for cubemaps. Valid after init(). + uint32_t get_faces() const { return m_header.m_face_count; } + + // Returns 0 or the number of layers in the texture array or texture video. Valid after init(). + uint32_t get_layers() const { return m_header.m_layer_count; } + + // Returns cETC1S or cUASTC4x4. Valid after init(). + basist::basis_tex_format get_format() const { return m_format; } + + bool is_etc1s() const { return get_format() == basist::basis_tex_format::cETC1S; } + + bool is_uastc() const { return get_format() == basist::basis_tex_format::cUASTC4x4; } + + // Returns true if the ETC1S file has two planes (typically RGBA, or RRRG), or true if the UASTC file has alpha data. Valid after init(). + uint32_t get_has_alpha() const { return m_has_alpha; } + + // Returns the entire Data Format Descriptor (DFD) from the KTX2 file. Valid after init(). + // See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#_the_khronos_data_format_descriptor_overview + const basisu::uint8_vec& get_dfd() const { return m_dfd; } + + // Some basic DFD accessors. Valid after init(). + uint32_t get_dfd_color_model() const { return m_dfd_color_model; } + + // Returns the DFD color primary. + // We do not validate the color primaries, so the returned value may not be in the ktx2_df_color_primaries enum. + ktx2_df_color_primaries get_dfd_color_primaries() const { return m_dfd_color_prims; } + + // Returns KTX2_KHR_DF_TRANSFER_LINEAR or KTX2_KHR_DF_TRANSFER_SRGB. + uint32_t get_dfd_transfer_func() const { return m_dfd_transfer_func; } + + uint32_t get_dfd_flags() const { return m_dfd_flags; } + + // Returns 1 (ETC1S/UASTC) or 2 (ETC1S with an internal alpha channel). + uint32_t get_dfd_total_samples() const { return m_dfd_samples; } + + // Returns the channel mapping for each DFD "sample". UASTC always has 1 sample, ETC1S can have one or two. + // Note the returned value SHOULD be one of the ktx2_df_channel_id enums, but we don't validate that. + // It's up to the caller to decide what to do if the value isn't in the enum. 
+ ktx2_df_channel_id get_dfd_channel_id0() const { return m_dfd_chan0; } + ktx2_df_channel_id get_dfd_channel_id1() const { return m_dfd_chan1; } + + // Key value field data. + struct key_value + { + // The key field is UTF8 and always zero terminated. + basisu::uint8_vec m_key; + + // The value may be empty. It consists of raw bytes which may or may not be zero terminated. + basisu::uint8_vec m_value; + + bool operator< (const key_value& rhs) const { return strcmp((const char*)m_key.data(), (const char *)rhs.m_key.data()) < 0; } + }; + typedef basisu::vector<key_value> key_value_vec; + + // Returns the array of key-value entries. This may be empty. Valid after init(). + // The order of key values fields in this array exactly matches the order they were stored in the file. The keys are supposed to be sorted by their Unicode code points. + const key_value_vec& get_key_values() const { return m_key_values; } + + const basisu::uint8_vec *find_key(const std::string& key_name) const; + + // Low-level ETC1S specific accessors + + // Returns the ETC1S global supercompression data header, which is only valid after start_transcoding() is called. + const ktx2_etc1s_global_data_header& get_etc1s_header() const { return m_etc1s_header; } + + // Returns the array of ETC1S image descriptors, which is only valid after get_etc1s_image_descs() is called. + const basisu::vector<ktx2_etc1s_image_desc>& get_etc1s_image_descs() const { return m_etc1s_image_descs; } + + // Must have called startTranscoding() first + uint32_t get_etc1s_image_descs_image_flags(uint32_t level_index, uint32_t layer_index, uint32_t face_index) const; + + // is_video() is only valid after start_transcoding() is called. + // For ETC1S data, if this returns true you must currently transcode the file from first to last frame, in order, without skipping any frames. + bool is_video() const { return m_is_video; } + + // start_transcoding() MUST be called before calling transcode_image(). + // This method decompresses the ETC1S global endpoint/selector codebooks, which is not free, so try to avoid calling it excessively. + bool start_transcoding(); + + // get_image_level_info() be called after init(), but the m_iframe_flag's won't be valid until start_transcoding() is called. + // You can call this method before calling transcode_image_level() to retrieve basic information about the mipmap level's dimensions, etc. + bool get_image_level_info(ktx2_image_level_info& level_info, uint32_t level_index, uint32_t layer_index, uint32_t face_index) const; + + // transcode_image_level() transcodes a single 2D texture or cubemap face from the KTX2 file. + // Internally it uses the same low-level transcode API's as basisu_transcoder::transcode_image_level(). + // If the file is UASTC and is supercompressed with Zstandard, and the file is a texture array or cubemap, it's highly recommended that each mipmap level is + // completely transcoded before switching to another level. Every time the mipmap level is changed all supercompressed level data must be decompressed using Zstandard as a single unit. + // Currently ETC1S videos must always be transcoded from first to last frame (or KTX2 "layer"), in order, with no skipping of frames. + // By default this method is not thread safe unless you specify a pointer to a user allocated thread-specific transcoder_state struct. 
+ bool transcode_image_level( + uint32_t level_index, uint32_t layer_index, uint32_t face_index, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + basist::transcoder_texture_format fmt, + uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, + ktx2_transcoder_state *pState = nullptr); + + private: + const uint8_t* m_pData; + uint32_t m_data_size; + + ktx2_header m_header; + basisu::vector<ktx2_level_index> m_levels; + basisu::uint8_vec m_dfd; + key_value_vec m_key_values; + + ktx2_etc1s_global_data_header m_etc1s_header; + basisu::vector<ktx2_etc1s_image_desc> m_etc1s_image_descs; + + basist::basis_tex_format m_format; + + uint32_t m_dfd_color_model; + ktx2_df_color_primaries m_dfd_color_prims; + uint32_t m_dfd_transfer_func; + uint32_t m_dfd_flags; + uint32_t m_dfd_samples; + ktx2_df_channel_id m_dfd_chan0, m_dfd_chan1; + + basist::basisu_lowlevel_etc1s_transcoder m_etc1s_transcoder; + basist::basisu_lowlevel_uastc_transcoder m_uastc_transcoder; + + ktx2_transcoder_state m_def_transcoder_state; + + bool m_has_alpha; + bool m_is_video; + + bool decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data); + bool decompress_etc1s_global_data(); + bool read_key_values(); + }; + +#endif // BASISD_SUPPORT_KTX2 + + // Returns true if the transcoder was compiled with KTX2 support. + bool basisu_transcoder_supports_ktx2(); + + // Returns true if the transcoder was compiled with Zstandard support. + bool basisu_transcoder_supports_ktx2_zstd(); + } // namespace basisu + diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h index a9c6823d92..2422d788a9 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h @@ -1,5 +1,5 @@ // basisu_transcoder_internal.h - Universal texture format transcoder library. -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. // // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing // @@ -20,8 +20,8 @@ #pragma warning (disable: 4127) // conditional expression is constant #endif -#define BASISD_LIB_VERSION 107 -#define BASISD_VERSION_STRING "01.11" +#define BASISD_LIB_VERSION 115 +#define BASISD_VERSION_STRING "01.15" #ifdef _DEBUG #define BASISD_BUILD_DEBUG @@ -45,38 +45,44 @@ namespace basist enum class block_format { cETC1, // ETC1S RGB + cETC2_RGBA, // full ETC2 EAC RGBA8 block cBC1, // DXT1 RGB + cBC3, // BC4 block followed by a four color BC1 block cBC4, // DXT5A (alpha block only) + cBC5, // two BC4 blocks cPVRTC1_4_RGB, // opaque-only PVRTC1 4bpp cPVRTC1_4_RGBA, // PVRTC1 4bpp RGBA - cBC7_M6_OPAQUE_ONLY, // RGB BC7 mode 6 + cBC7, // Full BC7 block, any mode cBC7_M5_COLOR, // RGB BC7 mode 5 color (writes an opaque mode 5 block) cBC7_M5_ALPHA, // alpha portion of BC7 mode 5 (cBC7_M5_COLOR output data must have been written to the output buffer first to set the mode/rot fields etc.) cETC2_EAC_A8, // alpha block of ETC2 EAC (first 8 bytes of the 16-bit ETC2 EAC RGBA format) cASTC_4x4, // ASTC 4x4 (either color-only or color+alpha). Note that the transcoder always currently assumes sRGB is not enabled when outputting ASTC // data. 
If you use a sRGB ASTC format you'll get ~1 LSB of additional error, because of the different way ASTC decoders scale 8-bit endpoints to 16-bits during unpacking. + cATC_RGB, cATC_RGBA_INTERPOLATED_ALPHA, cFXT1_RGB, // Opaque-only, has oddball 8x4 pixel block size + + cPVRTC2_4_RGB, + cPVRTC2_4_RGBA, + + cETC2_EAC_R11, + cETC2_EAC_RG11, cIndices, // Used internally: Write 16-bit endpoint and selector indices directly to output (output block must be at least 32-bits) cRGB32, // Writes RGB components to 32bpp output pixels cRGBA32, // Writes RGB255 components to 32bpp output pixels cA32, // Writes alpha component to 32bpp output pixels - + cRGB565, cBGR565, cRGBA4444_COLOR, cRGBA4444_ALPHA, cRGBA4444_COLOR_OPAQUE, - - cPVRTC2_4_RGB, - cPVRTC2_4_RGBA, - - cETC2_EAC_R11, - + cRGBA4444, + cTotalBlockFormats }; @@ -116,7 +122,7 @@ namespace basist basisu::clear_vector(m_tree); } - bool init(uint32_t total_syms, const uint8_t *pCode_sizes) + bool init(uint32_t total_syms, const uint8_t *pCode_sizes, uint32_t fast_lookup_bits = basisu::cHuffmanFastLookupBits) { if (!total_syms) { @@ -127,8 +133,10 @@ namespace basist m_code_sizes.resize(total_syms); memcpy(&m_code_sizes[0], pCode_sizes, total_syms); + const uint32_t huffman_fast_lookup_size = 1 << fast_lookup_bits; + m_lookup.resize(0); - m_lookup.resize(basisu::cHuffmanFastLookupSize); + m_lookup.resize(huffman_fast_lookup_size); m_tree.resize(0); m_tree.resize(total_syms * 2); @@ -166,10 +174,10 @@ namespace basist for (l = code_size; l > 0; l--, cur_code >>= 1) rev_code = (rev_code << 1) | (cur_code & 1); - if (code_size <= basisu::cHuffmanFastLookupBits) + if (code_size <= fast_lookup_bits) { uint32_t k = (code_size << 16) | sym_index; - while (rev_code < basisu::cHuffmanFastLookupSize) + while (rev_code < huffman_fast_lookup_size) { if (m_lookup[rev_code] != 0) { @@ -184,9 +192,9 @@ namespace basist } int tree_cur; - if (0 == (tree_cur = m_lookup[rev_code & (basisu::cHuffmanFastLookupSize - 1)])) + if (0 == (tree_cur = m_lookup[rev_code & (huffman_fast_lookup_size - 1)])) { - const uint32_t idx = rev_code & (basisu::cHuffmanFastLookupSize - 1); + const uint32_t idx = rev_code & (huffman_fast_lookup_size - 1); if (m_lookup[idx] != 0) { // Supplied codesizes can't create a valid prefix code. 
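The hunks above make the Huffman fast-lookup size configurable via `fast_lookup_bits`. Below is a minimal, self-contained sketch of that fast-lookup technique, under my own assumptions (canonical code assignment, an LSB-first bit buffer, symbol indices that fit in 16 bits, and all code lengths short enough for the fast table); it is not the upstream implementation, which additionally falls back to a packed binary tree for longer codes:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Conceptual sketch of a Huffman "fast lookup" decoder front end. Every code
// of length <= kFastBits is replicated across all fast-table slots that share
// its (bit-reversed) prefix, so decoding is a single masked table probe.
struct huff_fast_lookup_sketch
{
    static constexpr uint32_t kFastBits = 10; // analogous to fast_lookup_bits above
    std::vector<int32_t> lookup;              // (code_len << 16) | symbol, or -1

    // code_sizes[i] = canonical code length of symbol i (0 = unused).
    // This sketch requires every length to be <= kFastBits.
    void init(const std::vector<uint8_t>& code_sizes)
    {
        lookup.assign(1u << kFastBits, -1);

        // Canonical code assignment (RFC 1951 style): count codes per length,
        // then derive the first code of each length.
        uint32_t count[17] = { 0 }, next_code[17] = { 0 };
        for (uint8_t s : code_sizes)
        {
            assert(s <= 16);
            count[s]++;
        }
        count[0] = 0;
        uint32_t code = 0;
        for (uint32_t len = 1; len <= 16; len++)
        {
            code = (code + count[len - 1]) << 1;
            next_code[len] = code;
        }

        for (uint32_t sym = 0; sym < code_sizes.size(); sym++)
        {
            const uint32_t len = code_sizes[sym];
            if (!len)
                continue;
            assert(len <= kFastBits);
            const uint32_t c = next_code[len]++;

            // The bit stream is consumed LSB-first, so index by the reversed
            // code and fill every slot whose low 'len' bits match it.
            uint32_t rev = 0;
            for (uint32_t i = 0; i < len; i++)
                rev |= ((c >> i) & 1u) << (len - 1 - i);
            for (uint32_t hi = 0; hi < (1u << (kFastBits - len)); hi++)
                lookup[rev | (hi << len)] = int32_t((len << 16) | sym);
        }
    }

    // bit_buf holds at least kFastBits not-yet-consumed bits in its low bits.
    // Returns the decoded symbol and reports how many bits it used.
    uint32_t decode(uint32_t bit_buf, uint32_t& bits_consumed) const
    {
        const int32_t e = lookup[bit_buf & ((1u << kFastBits) - 1)];
        assert(e >= 0); // a valid stream always lands on a filled slot here
        bits_consumed = uint32_t(e) >> 16;
        return uint32_t(e) & 0xFFFFu;
    }
};
```

For example, with `code_sizes = {1, 2, 2}`, symbol 0 receives the single 1-bit code, so `decode()` reports one consumed bit for it and two for the other symbols; larger `fast_lookup_bits` values trade table memory for fewer slow-path fallbacks, which is what the parameterized `init()`/`decode_huffman()` above expose.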
@@ -204,9 +212,9 @@ namespace basist return false; } - rev_code >>= (basisu::cHuffmanFastLookupBits - 1); + rev_code >>= (fast_lookup_bits - 1); - for (int j = code_size; j > (basisu::cHuffmanFastLookupBits + 1); j--) + for (int j = code_size; j > ((int)fast_lookup_bits + 1); j--) { tree_cur -= ((rev_code >>= 1) & 1); @@ -254,6 +262,8 @@ namespace basist } const basisu::uint8_vec &get_code_sizes() const { return m_code_sizes; } + const basisu::int_vec get_lookup() const { return m_lookup; } + const basisu::int16_vec get_tree() const { return m_tree; } bool is_valid() const { return m_code_sizes.size() > 0; } @@ -430,9 +440,11 @@ namespace basist return v; } - inline uint32_t decode_huffman(const huffman_decoding_table &ct) + inline uint32_t decode_huffman(const huffman_decoding_table &ct, int fast_lookup_bits = basisu::cHuffmanFastLookupBits) { assert(ct.m_code_sizes.size()); + + const uint32_t huffman_fast_lookup_size = 1 << fast_lookup_bits; while (m_bit_buf_size < 16) { @@ -448,14 +460,14 @@ namespace basist int code_len; int sym; - if ((sym = ct.m_lookup[m_bit_buf & (basisu::cHuffmanFastLookupSize - 1)]) >= 0) + if ((sym = ct.m_lookup[m_bit_buf & (huffman_fast_lookup_size - 1)]) >= 0) { code_len = sym >> 16; sym &= 0xFFFF; } else { - code_len = basisu::cHuffmanFastLookupBits; + code_len = fast_lookup_bits; do { sym = ct.m_tree[~sym + ((m_bit_buf >> code_len++) & 1)]; // ~sym = -sym - 1 @@ -635,6 +647,11 @@ namespace basist return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); } + enum eNoClamp + { + cNoClamp = 0 + }; + struct color32 { union @@ -655,21 +672,33 @@ namespace basist color32() { } color32(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); } + color32(eNoClamp unused, uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { (void)unused; set_noclamp_rgba(vr, vg, vb, va); } void set(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { c[0] = static_cast<uint8_t>(vr); c[1] = static_cast<uint8_t>(vg); c[2] = static_cast<uint8_t>(vb); c[3] = static_cast<uint8_t>(va); } + void set_noclamp_rgb(uint32_t vr, uint32_t vg, uint32_t vb) { c[0] = static_cast<uint8_t>(vr); c[1] = static_cast<uint8_t>(vg); c[2] = static_cast<uint8_t>(vb); } + void set_noclamp_rgba(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); } + void set_clamped(int vr, int vg, int vb, int va) { c[0] = clamp255(vr); c[1] = clamp255(vg); c[2] = clamp255(vb); c[3] = clamp255(va); } uint8_t operator[] (uint32_t idx) const { assert(idx < 4); return c[idx]; } uint8_t &operator[] (uint32_t idx) { assert(idx < 4); return c[idx]; } bool operator== (const color32&rhs) const { return m == rhs.m; } + + static color32 comp_min(const color32& a, const color32& b) { return color32(cNoClamp, basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); } + static color32 comp_max(const color32& a, const color32& b) { return color32(cNoClamp, basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); } }; struct endpoint { color32 m_color5; uint8_t m_inten5; + bool operator== (const endpoint& rhs) const + { + return (m_color5.r == rhs.m_color5.r) && (m_color5.g == rhs.m_color5.g) && (m_color5.b == rhs.m_color5.b) && (m_inten5 == rhs.m_inten5); + } + bool operator!= (const endpoint& rhs) const { return !(*this == rhs); } }; struct selector @@ -682,6 +711,17 @@ namespace basist uint8_t m_lo_selector, m_hi_selector; uint8_t m_num_unique_selectors; + bool operator== (const 
selector& rhs) const + { + return (m_selectors[0] == rhs.m_selectors[0]) && + (m_selectors[1] == rhs.m_selectors[1]) && + (m_selectors[2] == rhs.m_selectors[2]) && + (m_selectors[3] == rhs.m_selectors[3]); + } + bool operator!= (const selector& rhs) const + { + return !(*this == rhs); + } void init_flags() { diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc.inc index 7f38f4a863..cd634c0df5 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc.inc @@ -478,4 +478,4 @@ {31,1,10801},{47,1,12162},{14,1,6117},{14,1,6117},{8,1,50},{20,1,7322},{0,1,1241},{21,1,914},{21,1,914},{21,1,914},{7,1,274},{35,5,1513},{9,1,585},{9,1,585},{26,1,0},{27,1,1513},{26,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{0,1,0},{1,1,0},{0,1,0},{47,0,9250},{47,0,9250},{47,0,9250},{47,0,9250},{12,1,3690}, {12,1,3690},{12,1,3690},{8,1,50},{0,1,1241},{0,1,1241},{45,1,65535},{14,1,33274},{42,1,19608},{42,1,13375},{47,1,62627},{42,1,22211},{10,1,6045},{24,1,138},{36,1,39015},{0,1,1732},{35,1,1048},{5,1,766},{5,1,666},{37,1,212},{3,3,1473},{7,1,675},{23,1,410},{14,1,1},{3,3,1473},{14,1,1},{13,1,14121},{13,1,14121},{13,1,14121},{45,1,10571},{45,1,11434},{30,1,6081},{30,1,6081}, {40,1,137},{36,1,6926},{2,1,1445},{5,1,666},{5,1,666},{5,1,666},{37,1,212},{35,3,1105},{23,1,410},{23,1,410},{14,1,1},{25,1,1105},{14,1,1},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{0,1,0},{1,1,0},{0,1,0},{15,0,9256},{15,0,9256},{15,0,9256},{15,0,9256},{14,1,3985},{14,1,3985},{14,1,3985},{40,1,137},{2,1,1445}, -{2,1,1445},
\ No newline at end of file +{2,1,1445}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc_0_255.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc_0_255.inc index 5e7a75396d..da4e7fee98 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc_0_255.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc_0_255.inc @@ -478,4 +478,4 @@ {137,255,10742},{135,255,12066},{107,255,6089},{107,255,6089},{67,255,45},{37,255,7233},{1,255,1184},{218,255,900},{218,255,900},{218,255,900},{204,255,272},{255,167,1513},{189,255,562},{189,255,562},{86,255,0},{253,213,1513},{86,255,0},{255,252,0},{255,254,0},{254,255,0},{252,255,0},{255,252,0},{255,254,0},{252,255,0},{0,255,0},{255,254,0},{0,255,0},{132,0,9248},{132,0,9248},{132,0,9248},{132,0,9248},{98,255,3656}, {98,255,3656},{98,255,3656},{67,255,45},{1,255,1184},{1,255,1184},{138,255,65535},{107,255,33448},{95,255,19729},{89,255,13446},{135,255,62717},{95,255,22307},{79,255,6021},{73,255,105},{40,255,38959},{0,254,1627},{230,255,996},{224,255,756},{221,255,653},{213,255,194},{255,204,1473},{207,255,675},{198,255,405},{110,255,0},{255,230,1473},{110,255,0},{162,255,14060},{162,255,14060},{162,255,14060},{146,255,10545},{141,255,11378},{116,255,6077},{116,255,6077}, {76,255,137},{40,255,6873},{7,255,1412},{221,255,653},{221,255,653},{221,255,653},{213,255,194},{255,180,1105},{198,255,405},{198,255,405},{110,255,0},{255,218,1105},{110,255,0},{255,252,0},{255,254,0},{254,255,0},{252,255,0},{255,252,0},{255,254,0},{252,255,0},{0,255,0},{255,254,0},{0,255,0},{140,0,9248},{140,0,9248},{140,0,9248},{140,0,9248},{107,255,3929},{107,255,3929},{107,255,3929},{76,255,137},{7,255,1412}, -{7,255,1412},
\ No newline at end of file +{7,255,1412}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_55.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_55.inc index 61f7476efc..7acedd6a6f 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_55.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_55.inc @@ -478,4 +478,4 @@ {17,31,11312},{16,31,11037},{13,31,6429},{13,31,6429},{8,31,260},{6,31,10457},{0,31,2642},{26,31,872},{26,31,872},{26,31,872},{25,31,397},{31,22,1513},{23,31,794},{23,31,794},{13,31,1},{29,27,1513},{13,31,1},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{16,0,9248},{16,0,9248},{16,0,9248},{16,0,9248},{12,31,3074}, {12,31,3074},{12,31,3074},{8,31,260},{0,31,2642},{0,31,2642},{17,31,58848},{15,31,39619},{13,31,24975},{12,31,19007},{16,31,54474},{13,31,27057},{10,31,8569},{9,31,461},{8,31,51302},{0,31,5046},{28,31,979},{27,31,806},{27,31,637},{26,31,292},{31,26,1473},{26,31,953},{24,31,605},{16,31,0},{29,29,1473},{16,31,0},{19,31,13604},{19,31,13604},{19,31,13604},{18,31,11057},{16,31,10429},{14,31,6339},{14,31,6339}, {10,31,424},{8,31,9713},{1,31,2900},{27,31,637},{27,31,637},{27,31,637},{26,31,292},{30,25,1105},{24,31,605},{24,31,605},{16,31,0},{30,27,1105},{16,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{17,0,9248},{17,0,9248},{17,0,9248},{17,0,9248},{12,31,3330},{12,31,3330},{12,31,3330},{10,31,424},{1,31,2900}, -{1,31,2900},
\ No newline at end of file +{1,31,2900}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_56.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_56.inc index f57a232a85..2b56c0944c 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_56.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_56.inc @@ -478,4 +478,4 @@ {17,63,11312},{16,63,11037},{13,63,6429},{13,63,6429},{8,63,260},{6,63,10457},{0,63,2642},{26,63,872},{26,63,872},{26,63,872},{25,63,397},{31,45,1513},{23,63,794},{23,63,794},{13,63,1},{31,52,1513},{13,63,1},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{0,63,0},{31,63,0},{0,63,0},{16,0,9248},{16,0,9248},{16,0,9248},{16,0,9248},{12,63,3074}, {12,63,3074},{12,63,3074},{8,63,260},{0,63,2642},{0,63,2642},{17,63,58848},{15,63,39619},{13,63,24975},{12,63,19007},{16,63,54474},{13,63,27057},{10,63,8569},{9,63,461},{8,63,51302},{0,63,5046},{28,63,979},{27,63,806},{27,63,637},{26,63,292},{30,56,1473},{26,63,953},{24,63,605},{16,63,0},{30,58,1473},{16,63,0},{19,63,13604},{19,63,13604},{19,63,13604},{18,63,11057},{16,63,10429},{14,63,6339},{14,63,6339}, {10,63,424},{8,63,9713},{1,63,2900},{27,63,637},{27,63,637},{27,63,637},{26,63,292},{31,48,1105},{24,63,605},{24,63,605},{16,63,0},{31,54,1105},{16,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{0,63,0},{31,63,0},{0,63,0},{17,0,9248},{17,0,9248},{17,0,9248},{17,0,9248},{12,63,3330},{12,63,3330},{12,63,3330},{10,63,424},{1,63,2900}, -{1,63,2900},
\ No newline at end of file +{1,63,2900}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_alpha.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_alpha.inc index 433b126a71..6669852923 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_alpha.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_alpha.inc @@ -46,4 +46,4 @@ {76,0,3},{255,1,27},{255,7,24},{255,1,27},{179,39,8},{255,22,16},{85,0,3},{255,2,27},{255,22,24},{255,7,27},{187,47,8},{255,47,16},{93,0,3},{255,4,27},{251,100,28},{182,0,7},{195,55,8},{255,71,16},{101,0,3},{255,4,27},{253,108,28},{191,0,7},{203,63,8},{255,95,16},{109,0,3},{255,7,27},{255,118,28},{200,0,7},{212,72,8},{255,123,16},{118,0,3},{246,0,7}, {255,129,28},{209,0,7},{220,80,8},{255,147,16},{126,0,3},{246,0,7},{255,138,28},{218,0,7},{228,88,8},{255,172,16},{134,0,3},{249,3,7},{245,91,8},{228,3,7},{236,96,8},{255,196,16},{142,6,3},{251,14,7},{250,102,8},{237,12,7},{245,105,8},{255,223,16},{151,15,3},{253,22,7},{254,112,8},{245,20,7},{253,113,8},{255,248,16},{159,23,3},{253,31,7},{255,124,8},{249,28,7}, {255,124,8},{255,0,0},{167,31,3},{254,39,7},{255,10,4},{252,37,7},{255,10,4},{255,0,0},{175,39,3},{255,48,7},{255,38,4},{254,48,7},{255,38,4},{255,0,0},{184,48,3},{255,56,7},{255,62,4},{255,56,7},{255,62,4},{255,0,0},{192,56,3},{255,65,7},{255,86,4},{255,65,7},{255,86,4},{255,0,0},{200,64,3},{255,74,7},{255,111,4},{255,77,7},{255,111,4},{255,0,0}, -{208,5,2},
\ No newline at end of file +{208,5,2}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_color.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_color.inc index 357b14b7a1..c0780988d8 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_color.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_color.inc @@ -478,4 +478,4 @@ {70,127,10779},{68,127,12146},{54,127,6176},{54,127,6176},{34,127,52},{14,127,7281},{2,127,1213},{109,127,937},{109,127,937},{109,127,937},{102,127,281},{127,84,1513},{93,127,565},{93,127,565},{43,127,0},{127,106,1513},{43,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{0,127,0},{127,127,0},{0,127,0},{65,0,9250},{65,0,9250},{65,0,9250},{65,0,9250},{49,127,3656}, {49,127,3656},{49,127,3656},{34,127,52},{2,127,1213},{2,127,1213},{71,127,63180},{60,127,37225},{52,127,26137},{48,127,18128},{68,127,59595},{51,127,22636},{42,127,8480},{37,127,164},{22,127,37455},{0,126,2073},{114,127,1019},{111,127,766},{111,127,666},{105,127,205},{127,102,1473},{102,127,681},{99,127,405},{56,127,0},{127,115,1473},{56,127,0},{79,127,14066},{79,127,14066},{79,127,14066},{73,127,10571},{71,127,11450},{59,127,6166},{59,127,6166}, {37,127,148},{25,127,6914},{5,127,1413},{111,127,666},{111,127,666},{111,127,666},{105,127,205},{127,90,1105},{99,127,405},{99,127,405},{56,127,0},{127,109,1105},{56,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{0,127,0},{127,127,0},{0,127,0},{69,0,9250},{69,0,9250},{69,0,9250},{69,0,9250},{52,127,3940},{52,127,3940},{52,127,3940},{37,127,148},{5,127,1413}, -{5,127,1413},
\ No newline at end of file
+{5,127,1413},
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m6.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m6.inc
deleted file mode 100644
index 6b814e6132..0000000000
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m6.inc
+++ /dev/null
@@ -1,4383 +0,0 @@
-// Copyright (C) 2017-2019 Binomial LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-static const uint32_t g_etc1_to_bc7_m6_table0[] = {
[... generated ETC1-to-BC7 mode 6 lookup-table entries (g_etc1_to_bc7_m6_table0 and the tables that follow) ...]
-0x7AD80001,0x72D80001,0x72D80001,0x72D80001,0x70D40001,0x3440012,0x3440012,0x3440012,0x3440012,0x3440012,0x3440012,0x74CC0001,0x74CC0001,0x74CC0001,0x70D00000,0x27FC0012,0x27FC0012,0x27FC0012,0x70BC0000,0x6C000014,0xB4D80001,0xDC0012,0xDC0012,0x8CD80001,0x82D80001,0x7CD80001,0x7CD80001,0x78D80001,0xB0CC0000,0x92D00000,0x76D40000,0x74CC0001, -0x1D40012,}; -static const uint32_t g_etc1_to_bc7_m6_table15[] = { -0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x2FFC0000, -0x2FFC0000,0x2FFC0000,0x2FFC0000,0x72000000,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0x2F40000,0x2F40000,0x2F40000,0x1540000,0x1E80000,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000, -0x1680000,0x39F80000,0x39F80000,0x39F80000,0x78000000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x39F80000,0x39F80000,0x39F80000,0x78000000,0x39F80000,0x39F80000,0x39F80000,0x78000000,0x78000000,0x9000000,0xF00001,0xF00001,0x3180000,0x1300000,0x1480000,0x1480000,0x3940000,0x3180000,0x1300000,0x3FC0000,0x39F80000, -0x3FC0000,0xF80001,0xF80001,0xF80001,0xF80001,0x1740000,0x1740000,0x1740000,0x3FF80000,0x3FF80000,0x7C000000,0x1740000,0x1740000,0x1740000,0x3FF80000,0x3FF80000,0x7C000000,0x3FF80000,0x3FF80000,0x7C000000,0x7C000000,0x1740000,0x1740000,0x1740000,0x3FF80000,0x3FF80000,0x7C000000,0x3FF80000,0x3FF80000,0x7C000000,0x7C000000,0x3FF80000, -0x3FF80000,0x7C000000,0x7C000000,0x7C000000,0x7200000,0xD080000,0xF80001,0x1540000,0x1A40000,0xBFC0000,0x25FC0000,0x4FF80000,0x1380000,0x1740000,0xBFC0000,0x7C000000,0xBFC0000,0x1040001,0x1840000,0x47FC0000,0x82000000,0x1840000,0x47FC0000,0x82000000,0x47FC0000,0x82000000,0x82000000,0x1840000,0x47FC0000,0x82000000,0x47FC0000,0x82000000, -0x82000000,0x47FC0000,0x82000000,0x82000000,0x82000000,0x1840000,0x47FC0000,0x82000000,0x47FC0000,0x82000000,0x82000000,0x47FC0000,0x82000000,0x82000000,0x82000000,0x47FC0000,0x82000000,0x82000000,0x82000000,0x82000000,0x1480000,0x1180000,0x1180000,0x1B80000,0x2FFC0000,0x65F00000,0x82000000,0x82000000,0x1640000,0x1EC0000,0x7BE80000,0x82000000, -0x17FC0000,0xF4008C,0x98F00033,0x88F00033,0x82F00033,0x92EC0024,0x88EC0013,0x84EC001A,0x84EC0024,0x80EC0016,0x7EEC0026,0x90E80034,0x88E8000A,0x84E8000F,0x86E40013,0x82E40002,0x7EE80016,0x82E80034,0x80E4000F,0x7EE40019,0x7CE40035,0x16C0088,0x8CDC0033,0x82E40033,0x86DC0026,0x82E00012,0x7EE40026,0x88D00033,0x82D40009,0x7ED80015,0x7CDC0034,0x3BFC0088, -0x82C00033,0x7EC80026,0x7CBC0034,0x7A000088,0xBCE80002,0xFEEC0026,0xF0F0003C,0x96E80002,0x8AE80002,0x84E80002,0x82E80002,0x82E40002,0xB8DC0001,0x98E00001,0x82E40009,0x7ED80015,0x7FC0088,0xFC0033,0x90F8000A,0x86F4000A,0x82F4000A,0x8AF40013,0x86F00002,0x82F40001,0x82F40013,0x82F00005,0x7EF00015,0x3740033,0x88E80009,0x82EC000A,0x86E40012,0x82E40001, -0x7EEC0014,0x3FFC0033,0x82D40008,0x7ED80014,0x7C000034,0x3740033,0x88E80009,0x82EC000A,0x86E40012,0x82E40001,0x7EEC0014,0x3FFC0033,0x82D40008,0x7ED80014,0x7C000034,0x3FFC0033,0x82D40008,0x7ED80014,0x7C000034,0x7C000034,0xBCE80001,0xD2F40013,0xE4F8000A,0x96E80001,0x8AE80001,0x84E80001,0x82E80001,0x82E40001,0xB8DC0001,0x94E40001,0x82E40009,0x7ED80014, 
-0xDFC0033,0xF00033,0xF00033,0xF00033,0xF00033,0x8AEC0012,0x8AEC0012,0x8AEC0012,0x80EC0012,0x80EC0012,0x7CEC0012,0x88E80009,0x88E80009,0x88E80009,0x80E80002,0x80E80002,0x7CE80005,0x7EE40009,0x7EE40009,0x7CE40001,0x7AE40009,0x1640033,0x1640033,0x1640033,0x82E00012,0x82E00012,0x7CE40012,0x80DC0009,0x80DC0009,0x7CDC0001,0x7ADC0009,0x37F80033, -0x37F80033,0x7CD00012,0x7AC40008,0x78000034,0xB0E80002,0xDEEC0012,0xF00033,0x94E80001,0x8AE80001,0x84E80001,0x82E80002,0x80E40002,0xB8DC0000,0x92E40001,0x82E40008,0x7CDC0001,0x1FC0033,0xF4000A,0xF4000A,0xF4000A,0xF4000A,0x82F40001,0x82F40001,0x82F40001,0x7EF00001,0x7EF00001,0x7CF00001,0x16C0008,0x16C0008,0x16C0008,0x7EEC0001,0x7EEC0001, -0x7CEC0001,0x3BFC0008,0x3BFC0008,0x7CE00000,0x7A000008,0x16C0008,0x16C0008,0x16C0008,0x7EEC0001,0x7EEC0001,0x7CEC0001,0x3BFC0008,0x3BFC0008,0x7CE00000,0x7A000008,0x3BFC0008,0x3BFC0008,0x7CE00000,0x7A000008,0x7A000008,0xA4EC0000,0xA2F40001,0xF4000A,0x94E80000,0x86EC0000,0x82EC0000,0x80EC0000,0x7EEC0001,0xA6E40000,0x8EE80000,0x7FC0008,0x7CE00000, -0x7FC0008,0x1000012,0x8CFC0001,0x84FC0001,0x82F80001,0x17C0012,0x86F00001,0x82F40000,0x43F80012,0x82E00000,0x7E000014,0x17C0012,0x86F00001,0x82F40000,0x43F80012,0x82E00000,0x7E000014,0x43F80012,0x82E00000,0x7E000014,0x7E000014,0x17C0012,0x86F00001,0x82F40000,0x43F80012,0x82E00000,0x7E000014,0x43F80012,0x82E00000,0x7E000014,0x7E000014,0x43F80012, -0x82E00000,0x7E000014,0x7E000014,0x7E000014,0xCCE00000,0x1100012,0xC6FC0001,0x9AE40000,0x8AE80001,0x86E40000,0x82E80000,0x82D80000,0xC0D80000,0x98E00000,0x82EC0001,0x7E000014,0x11FC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0xEC0012,0x82E80001,0x82E80001,0x82E80001,0x82E80001,0x82E80001, -0x82E80001,0x7AE80001,0x7AE80001,0x7AE80001,0x78E40001,0x35C0012,0x35C0012,0x35C0012,0x35C0012,0x35C0012,0x35C0012,0x7CDC0001,0x7CDC0001,0x7CDC0001,0x78E00000,0x33FC0012,0x33FC0012,0x33FC0012,0x78CC0000,0x74000014,0xBCE80001,0xEC0012,0xEC0012,0x94E80001,0x8AE80001,0x84E80001,0x84E80001,0x80E80001,0xB8DC0000,0x9AE00000,0x7EE40000,0x7CDC0001, -0x1F40012,}; -static const uint32_t g_etc1_to_bc7_m6_table16[] = { -0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x3DF80000, -0x3DF80000,0x3DF80000,0x3DF80000,0x7A000001,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0x1080000,0x1080000,0x1080000,0x1700000,0x9FC0000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x3800000,0x3800000,0x3800000,0x3800000,0x3800000, -0x3800000,0x45FC0000,0x45FC0000,0x45FC0000,0x80000001,0x3800000,0x3800000,0x3800000,0x3800000,0x3800000,0x3800000,0x45FC0000,0x45FC0000,0x45FC0000,0x80000001,0x45FC0000,0x45FC0000,0x45FC0000,0x80000001,0x80000001,0x3140000,0x1040000,0x1040000,0x52C0000,0x3440000,0x1600000,0x1600000,0x1B40000,0x52C0000,0x3440000,0x15FC0000,0x45FC0000, -0x15FC0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x38C0000,0x38C0000,0x38C0000,0x4BFC0000,0x4BFC0000,0x84000001,0x38C0000,0x38C0000,0x38C0000,0x4BFC0000,0x4BFC0000,0x84000001,0x4BFC0000,0x4BFC0000,0x84000001,0x84000001,0x38C0000,0x38C0000,0x38C0000,0x4BFC0000,0x4BFC0000,0x84000001,0x4BFC0000,0x4BFC0000,0x84000001,0x84000001,0x4BFC0000, 
-0x4BFC0000,0x84000001,0x84000001,0x84000001,0x1380000,0x71C0000,0x10C0000,0x16C0000,0x3C00000,0x1DF80000,0x35F80000,0x5BF80000,0x1500000,0x38C0000,0x1DF80000,0x84000001,0x1DF80000,0x1180000,0x1A00000,0x55F80000,0x8A000001,0x1A00000,0x55F80000,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x1A00000,0x55F80000,0x8A000001,0x55F80000,0x8A000001, -0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x1A00000,0x55F80000,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x8A000001,0x1600000,0xD280000,0xD280000,0x3D40000,0x3FF80000,0x6FF80000,0x8A000001,0x8A000001,0x17C0000,0xBFC0000,0x85DC0000,0x8A000001, -0x27FC0000,0x1080088,0x9F040034,0x91040034,0x8B040035,0x9B000026,0x91000015,0x8B000019,0x8D000026,0x8AFC0016,0x88FC0026,0x9CF80033,0x92F80009,0x8CFC000F,0x8EF80012,0x8AF80002,0x88F80016,0x8CF80033,0x8AF4000F,0x88F4001A,0x86F80033,0x1880088,0x96EC0033,0x8AF80034,0x90EC0026,0x8CF00013,0x88F40024,0x90E40033,0x8AEC000A,0x88E80013,0x86EC0033,0x49F80088, -0x8AD80034,0x88D40024,0x86C80033,0x8200008C,0xC4F80002,0xF9000028,0xFB040038,0xA2F80001,0x94F80002,0x8EF80002,0x8CF80002,0x8AF80002,0xBEF00001,0x9EF40001,0x8CF40009,0x88E80013,0x19FC0088,0x10C0034,0x9B080008,0x8F080009,0x8B080009,0x97040012,0x8F040001,0x8D040001,0x8D040012,0x8B040005,0x89040012,0x1900033,0x90FC0009,0x8B000009,0x8EF80012,0x8AF80002, -0x88FC0012,0x4DFC0033,0x8AEC0009,0x88E80012,0x86000033,0x1900033,0x90FC0009,0x8B000009,0x8EF80012,0x8AF80002,0x88FC0012,0x4DFC0033,0x8AEC0009,0x88E80012,0x86000033,0x4DFC0033,0x8AEC0009,0x88E80012,0x86000033,0x86000033,0xC0FC0001,0xEB040012,0xFD080008,0x9CFC0001,0x94F80002,0x8EF80002,0x8AFC0002,0x8AF40002,0xBEF00001,0x9AF80001,0x8CF40008,0x88E80012, -0x1FF80033,0x1040034,0x1040034,0x1040034,0x1040034,0x91000014,0x91000014,0x91000014,0x89000014,0x89000014,0x84FC0015,0x92F80008,0x92F80008,0x92F80008,0x8AF80001,0x8AF80001,0x86F80005,0x86F8000A,0x86F8000A,0x84F80001,0x82F8000A,0x1800033,0x1800033,0x1800033,0x8CF00012,0x8CF00012,0x84F80013,0x8AEC0009,0x8AEC0009,0x84F00002,0x82F0000A,0x45F80033, -0x45F80033,0x84E40013,0x82DC000A,0x80000033,0xC4F80001,0xD9000014,0x1040034,0xA0F80001,0x94F80001,0x8EF80001,0x8CF80001,0x88F80001,0xBCF00000,0x9EF40000,0x8AF80009,0x84F00002,0x13FC0033,0x1080008,0x1080008,0x1080008,0x1080008,0x8D040000,0x8D040000,0x8D040000,0x87040001,0x87040001,0x85040001,0x1880008,0x1880008,0x1880008,0x87000001,0x87000001, -0x85000001,0x49F80008,0x49F80008,0x84F80001,0x8200000A,0x1880008,0x1880008,0x1880008,0x87000001,0x87000001,0x85000001,0x49F80008,0x49F80008,0x84F80001,0x8200000A,0x49F80008,0x49F80008,0x84F80001,0x8200000A,0x8200000A,0xB6FC0000,0xBB040000,0x1080008,0x9AFC0000,0x90FC0000,0x8CFC0000,0x89000001,0x88FC0000,0xB2F40000,0x98F80000,0x19FC0008,0x84F80001, -0x19FC0008,0x1100014,0x970C0000,0x8F0C0000,0x8B0C0001,0x1980012,0x8F040001,0x8B080001,0x51F80012,0x8AF80001,0x88000012,0x1980012,0x8F040001,0x8B080001,0x51F80012,0x8AF80001,0x88000012,0x51F80012,0x8AF80001,0x88000012,0x88000012,0x1980012,0x8F040001,0x8B080001,0x51F80012,0x8AF80001,0x88000012,0x51F80012,0x8AF80001,0x88000012,0x88000012,0x51F80012, -0x8AF80001,0x88000012,0x88000012,0x88000012,0xD0F40000,0x1240012,0xDF0C0000,0xA2F80000,0x94F80001,0x90F40000,0x8AFC0001,0x8AF00001,0xC6EC0000,0xA2F00000,0x8CFC0000,0x88000012,0x21FC0012,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0xFC0014,0x8CF80000,0x8CF80000,0x8CF80000,0x8CF80000,0x8CF80000, 
-0x8CF80000,0x84F80000,0x84F80000,0x84F80000,0x80F80001,0x1780012,0x1780012,0x1780012,0x1780012,0x1780012,0x1780012,0x84F00001,0x84F00001,0x84F00001,0x80F40001,0x41FC0012,0x41FC0012,0x41FC0012,0x80E40001,0x7E000012,0xD4F80000,0xFC0014,0xFC0014,0xA4F80000,0x98F80000,0x90F80000,0x90F80000,0x8AF80000,0xBCF00000,0x9EF40000,0x86F80001,0x84F00001, -0xFFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table17[] = { -0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x49F80000, -0x49F80000,0x49F80000,0x49F80000,0x82000001,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x5180000,0x5180000,0x5180000,0x1880000,0x19FC0000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000, -0x3980000,0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x88000001,0xB240000,0x1140000,0x1140000,0x1400000,0x3580000,0x1780000,0x1780000,0x1D00000,0x1400000,0x3580000,0x23FC0000,0x51FC0000, -0x23FC0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x3A40000,0x3A40000,0x3A40000,0x57FC0000,0x57FC0000,0x8C000001,0x3A40000,0x3A40000,0x3A40000,0x57FC0000,0x57FC0000,0x8C000001,0x57FC0000,0x57FC0000,0x8C000001,0x8C000001,0x3A40000,0x3A40000,0x3A40000,0x57FC0000,0x57FC0000,0x8C000001,0x57FC0000,0x57FC0000,0x8C000001,0x8C000001,0x57FC0000, -0x57FC0000,0x8C000001,0x8C000001,0x8C000001,0x5480000,0xF2C0000,0x11C0000,0x3800000,0x1DC0000,0x2BFC0000,0x41FC0000,0x65FC0000,0x1640000,0x3A40000,0x2BFC0000,0x8C000001,0x2BFC0000,0x1280000,0x1B80000,0x61F80000,0x92000001,0x1B80000,0x61F80000,0x92000001,0x61F80000,0x92000001,0x92000001,0x1B80000,0x61F80000,0x92000001,0x61F80000,0x92000001, -0x92000001,0x61F80000,0x92000001,0x92000001,0x92000001,0x1B80000,0x61F80000,0x92000001,0x61F80000,0x92000001,0x92000001,0x61F80000,0x92000001,0x92000001,0x92000001,0x61F80000,0x92000001,0x92000001,0x92000001,0x92000001,0x1740000,0x13C0000,0x13C0000,0x1F00000,0x4BFC0000,0x79F80000,0x92000001,0x92000001,0x3900000,0x1BFC0000,0x8DEC0000,0x92000001, -0x37FC0000,0x1180088,0xA7140034,0x99140034,0x93140035,0xA3100026,0x99100015,0x93100019,0x95100026,0x930C0016,0x910C0026,0xA5080033,0x9B080009,0x950C000F,0x97080012,0x93080002,0x91080016,0x95080033,0x9304000F,0x9104001A,0x8F080033,0x1A00088,0x9EFC0033,0x93080034,0x98FC0026,0x95000013,0x91040024,0x98F40033,0x92FC000A,0x90F80013,0x8EFC0033,0x55F80088, -0x92E80034,0x90E40024,0x8ED80033,0x8A00008C,0xCD080002,0xF110002E,0xF314003D,0xAB080001,0x9D080002,0x97080002,0x95080002,0x93080002,0xC7000001,0xA7040001,0x95040009,0x90F80013,0x27FC0088,0x11C0034,0xA3180008,0x97180009,0x93180009,0x9F140012,0x97140001,0x95140001,0x95140012,0x93140005,0x91140012,0x1A80033,0x990C0009,0x93100009,0x97080012,0x93080002, -0x910C0012,0x59FC0033,0x92FC0009,0x90F80012,0x8E000033,0x1A80033,0x990C0009,0x93100009,0x97080012,0x93080002,0x910C0012,0x59FC0033,0x92FC0009,0x90F80012,0x8E000033,0x59FC0033,0x92FC0009,0x90F80012,0x8E000033,0x8E000033,0xC90C0001,0xF3140012,0xF5180009,0xA50C0001,0x9D080002,0x97080002,0x930C0002,0x93040002,0xC7000001,0xA3080001,0x95040008,0x90F80012, 
-0x2DFC0033,0x1140034,0x1140034,0x1140034,0x1140034,0x99100014,0x99100014,0x99100014,0x91100014,0x91100014,0x8D0C0015,0x9B080008,0x9B080008,0x9B080008,0x93080001,0x93080001,0x8F080005,0x8F08000A,0x8F08000A,0x8D080001,0x8B08000A,0x1980033,0x1980033,0x1980033,0x95000012,0x95000012,0x8D080013,0x92FC0009,0x92FC0009,0x8D000002,0x8B00000A,0x51F80033, -0x51F80033,0x8CF40013,0x8AEC000A,0x88000033,0xCD080001,0xE1100014,0x1140034,0xA9080001,0x9D080001,0x97080001,0x95080001,0x91080001,0xC5000000,0xA7040000,0x93080009,0x8D000002,0x21FC0033,0x1180008,0x1180008,0x1180008,0x1180008,0x95140000,0x95140000,0x95140000,0x8F140001,0x8F140001,0x8D140001,0x1A00008,0x1A00008,0x1A00008,0x8F100001,0x8F100001, -0x8D100001,0x55F80008,0x55F80008,0x8D080001,0x8A00000A,0x1A00008,0x1A00008,0x1A00008,0x8F100001,0x8F100001,0x8D100001,0x55F80008,0x55F80008,0x8D080001,0x8A00000A,0x55F80008,0x55F80008,0x8D080001,0x8A00000A,0x8A00000A,0xBF0C0000,0xC3140000,0x1180008,0xA30C0000,0x990C0000,0x950C0000,0x91100001,0x910C0000,0xBB040000,0xA1080000,0x27FC0008,0x8D080001, -0x27FC0008,0x1200014,0x9F1C0000,0x971C0000,0x931C0001,0x1B00012,0x97140001,0x93180001,0x5DF40012,0x93080001,0x90000012,0x1B00012,0x97140001,0x93180001,0x5DF40012,0x93080001,0x90000012,0x5DF40012,0x93080001,0x90000012,0x90000012,0x1B00012,0x97140001,0x93180001,0x5DF40012,0x93080001,0x90000012,0x5DF40012,0x93080001,0x90000012,0x90000012,0x5DF40012, -0x93080001,0x90000012,0x90000012,0x90000012,0xD9040000,0x3340012,0xE71C0000,0xAB080000,0x9D080001,0x99040000,0x930C0001,0x93000001,0xCEFC0000,0xAB000000,0x950C0000,0x90000012,0x31FC0012,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x10C0014,0x95080000,0x95080000,0x95080000,0x95080000,0x95080000, -0x95080000,0x8D080000,0x8D080000,0x8D080000,0x89080001,0x1900012,0x1900012,0x1900012,0x1900012,0x1900012,0x1900012,0x8D000001,0x8D000001,0x8D000001,0x89040001,0x4DFC0012,0x4DFC0012,0x4DFC0012,0x88F40001,0x86000012,0xDD080000,0x10C0014,0x10C0014,0xAD080000,0xA1080000,0x99080000,0x99080000,0x93080000,0xC5000000,0xA7040000,0x8F080001,0x8D000001, -0x1FF80012,}; -static const uint32_t g_etc1_to_bc7_m6_table18[] = { -0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x55F80000, -0x55F80000,0x55F80000,0x55F80000,0x8A000001,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0xD280000,0xD280000,0xD280000,0x1A00000,0x27FC0000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000, -0x3B00000,0x5DFC0000,0x5DFC0000,0x5DFC0000,0x90000001,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x5DFC0000,0x5DFC0000,0x5DFC0000,0x90000001,0x5DFC0000,0x5DFC0000,0x5DFC0000,0x90000001,0x90000001,0x1380000,0x1240000,0x1240000,0x1540000,0x36C0000,0x18C0000,0x18C0000,0x1EC0000,0x1540000,0x36C0000,0x33FC0000,0x5DFC0000, -0x33FC0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x3BC0000,0x3BC0000,0x3BC0000,0x63FC0000,0x63FC0000,0x94000001,0x3BC0000,0x3BC0000,0x3BC0000,0x63FC0000,0x63FC0000,0x94000001,0x63FC0000,0x63FC0000,0x94000001,0x94000001,0x3BC0000,0x3BC0000,0x3BC0000,0x63FC0000,0x63FC0000,0x94000001,0x63FC0000,0x63FC0000,0x94000001,0x94000001,0x63FC0000, 
-0x63FC0000,0x94000001,0x94000001,0x94000001,0x15C0000,0x1400000,0x12C0000,0x1980000,0x1F80000,0x3BFC0000,0x4FFC0000,0x71F80000,0x1780000,0x3BC0000,0x3BFC0000,0x94000001,0x3BFC0000,0x1380000,0x1D00000,0x6DF80000,0x9A000001,0x1D00000,0x6DF80000,0x9A000001,0x6DF80000,0x9A000001,0x9A000001,0x1D00000,0x6DF80000,0x9A000001,0x6DF80000,0x9A000001, -0x9A000001,0x6DF80000,0x9A000001,0x9A000001,0x9A000001,0x1D00000,0x6DF80000,0x9A000001,0x6DF80000,0x9A000001,0x9A000001,0x6DF80000,0x9A000001,0x9A000001,0x9A000001,0x6DF80000,0x9A000001,0x9A000001,0x9A000001,0x9A000001,0x1880000,0x14C0000,0x14C0000,0xDFC0000,0x59FC0000,0x83F80000,0x9A000001,0x9A000001,0x1A80000,0x2DFC0000,0x95FC0000,0x9A000001, -0x45FC0000,0x1280088,0xAF240034,0xA1240034,0x9B240035,0xAB200026,0xA1200015,0x9B200019,0x9D200026,0x9B1C0016,0x991C0026,0xAD180033,0xA3180009,0x9D1C000F,0x9F180012,0x9B180002,0x99180016,0x9D180033,0x9B14000F,0x9914001A,0x97180033,0x1B80088,0xA70C0033,0x9B180034,0xA10C0026,0x9D100013,0x99140024,0xA1040033,0x9B0C000A,0x99080013,0x970C0033,0x61F80088, -0x9AF80034,0x98F40024,0x96E80033,0x9200008C,0xD5180002,0xF920002E,0xFB24003D,0xB3180001,0xA5180002,0x9F180002,0x9D180002,0x9B180002,0xCF100001,0xAF140001,0x9D140009,0x99080013,0x37FC0088,0x12C0034,0xAB280008,0x9F280009,0x9B280009,0xA7240012,0x9F240001,0x9D240001,0x9D240012,0x9B240005,0x99240012,0x1C00033,0xA11C0009,0x9B200009,0x9F180012,0x9B180002, -0x991C0012,0x65F80033,0x9B0C0009,0x99080012,0x96000033,0x1C00033,0xA11C0009,0x9B200009,0x9F180012,0x9B180002,0x991C0012,0x65F80033,0x9B0C0009,0x99080012,0x96000033,0x65F80033,0x9B0C0009,0x99080012,0x96000033,0x96000033,0xD11C0001,0xFB240012,0xFD280009,0xAD1C0001,0xA5180002,0x9F180002,0x9B1C0002,0x9B140002,0xCF100001,0xAB180001,0x9D140008,0x99080012, -0x3DF80033,0x1240034,0x1240034,0x1240034,0x1240034,0xA1200014,0xA1200014,0xA1200014,0x99200014,0x99200014,0x951C0015,0xA3180008,0xA3180008,0xA3180008,0x9B180001,0x9B180001,0x97180005,0x9718000A,0x9718000A,0x95180001,0x9318000A,0x1B00033,0x1B00033,0x1B00033,0x9D100012,0x9D100012,0x95180013,0x9B0C0009,0x9B0C0009,0x95100002,0x9310000A,0x5DF40033, -0x5DF40033,0x95040013,0x92FC000A,0x90000033,0xD5180001,0xE9200014,0x1240034,0xB1180001,0xA5180001,0x9F180001,0x9D180001,0x99180001,0xCD100000,0xAF140000,0x9B180009,0x95100002,0x31FC0033,0x1280008,0x1280008,0x1280008,0x1280008,0x9D240000,0x9D240000,0x9D240000,0x97240001,0x97240001,0x95240001,0x1B80008,0x1B80008,0x1B80008,0x97200001,0x97200001, -0x95200001,0x61F80008,0x61F80008,0x95180001,0x9200000A,0x1B80008,0x1B80008,0x1B80008,0x97200001,0x97200001,0x95200001,0x61F80008,0x61F80008,0x95180001,0x9200000A,0x61F80008,0x61F80008,0x95180001,0x9200000A,0x9200000A,0xC71C0000,0xCB240000,0x1280008,0xAB1C0000,0xA11C0000,0x9D1C0000,0x99200001,0x991C0000,0xC3140000,0xA9180000,0x37FC0008,0x95180001, -0x37FC0008,0x1300014,0xA72C0000,0x9F2C0000,0x9B2C0001,0x3C40012,0x9F240001,0x9B280001,0x67FC0012,0x9B180001,0x98000012,0x3C40012,0x9F240001,0x9B280001,0x67FC0012,0x9B180001,0x98000012,0x67FC0012,0x9B180001,0x98000012,0x98000012,0x3C40012,0x9F240001,0x9B280001,0x67FC0012,0x9B180001,0x98000012,0x67FC0012,0x9B180001,0x98000012,0x98000012,0x67FC0012, -0x9B180001,0x98000012,0x98000012,0x98000012,0xE1140000,0xB440012,0xEF2C0000,0xB3180000,0xA5180001,0xA1140000,0x9B1C0001,0x9B100001,0xD70C0000,0xB3100000,0x9D1C0000,0x98000012,0x3FFC0012,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x11C0014,0x9D180000,0x9D180000,0x9D180000,0x9D180000,0x9D180000, 
-0x9D180000,0x95180000,0x95180000,0x95180000,0x91180001,0x1A80012,0x1A80012,0x1A80012,0x1A80012,0x1A80012,0x1A80012,0x95100001,0x95100001,0x95100001,0x91140001,0x59FC0012,0x59FC0012,0x59FC0012,0x91040001,0x8E000012,0xE5180000,0x11C0014,0x11C0014,0xB5180000,0xA9180000,0xA1180000,0xA1180000,0x9B180000,0xCD100000,0xAF140000,0x97180001,0x95100001, -0x2DFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table19[] = { -0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x61F80000, -0x61F80000,0x61F80000,0x61F80000,0x92000001,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x13C0000,0x13C0000,0x13C0000,0x1B80000,0x37FC0000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000, -0x1C80000,0x69FC0000,0x69FC0000,0x69FC0000,0x98000001,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x69FC0000,0x69FC0000,0x69FC0000,0x98000001,0x69FC0000,0x69FC0000,0x69FC0000,0x98000001,0x98000001,0x1480000,0x1340000,0x1340000,0x5640000,0x3800000,0x1A40000,0x1A40000,0x7FC0000,0x5640000,0x3800000,0x41FC0000,0x69FC0000, -0x41FC0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x1D40000,0x1D40000,0x1D40000,0x6FFC0000,0x6FFC0000,0x9C000001,0x1D40000,0x1D40000,0x1D40000,0x6FFC0000,0x6FFC0000,0x9C000001,0x6FFC0000,0x6FFC0000,0x9C000001,0x9C000001,0x1D40000,0x1D40000,0x1D40000,0x6FFC0000,0x6FFC0000,0x9C000001,0x6FFC0000,0x6FFC0000,0x9C000001,0x9C000001,0x6FFC0000, -0x6FFC0000,0x9C000001,0x9C000001,0x9C000001,0x1700000,0x1500000,0x13C0000,0x3AC0000,0x11FC0000,0x49FC0000,0x5DF80000,0x7BFC0000,0x18C0000,0x1D40000,0x49FC0000,0x9C000001,0x49FC0000,0x1480000,0x1E80000,0x79F80000,0xA2000001,0x1E80000,0x79F80000,0xA2000001,0x79F80000,0xA2000001,0xA2000001,0x1E80000,0x79F80000,0xA2000001,0x79F80000,0xA2000001, -0xA2000001,0x79F80000,0xA2000001,0xA2000001,0xA2000001,0x1E80000,0x79F80000,0xA2000001,0x79F80000,0xA2000001,0xA2000001,0x79F80000,0xA2000001,0xA2000001,0xA2000001,0x79F80000,0xA2000001,0xA2000001,0xA2000001,0xA2000001,0x19C0000,0x75C0000,0x75C0000,0x21FC0000,0x67F80000,0x8DF80000,0xA2000001,0xA2000001,0x3BC0000,0x3DFC0000,0x9FD00000,0xA2000001, -0x55FC0000,0x1380088,0xB7340034,0xA9340034,0xA3340035,0xB3300026,0xA9300015,0xA3300019,0xA5300026,0xA32C0016,0xA12C0026,0xB5280033,0xAB280009,0xA52C000F,0xA7280012,0xA3280002,0xA1280016,0xA5280033,0xA324000F,0xA124001A,0x9F280033,0x1D00088,0xAF1C0033,0xA3280034,0xA91C0026,0xA5200013,0xA1240024,0xA9140033,0xA31C000A,0xA1180013,0x9F1C0033,0x6DF80088, -0xA3080034,0xA1040024,0x9EF80033,0x9A00008C,0xDD280002,0xF1300038,0xF3340044,0xBB280001,0xAD280002,0xA7280002,0xA5280002,0xA3280002,0xD7200001,0xB7240001,0xA5240009,0xA1180013,0x45FC0088,0x13C0034,0xB3380008,0xA7380009,0xA3380009,0xAF340012,0xA7340001,0xA5340001,0xA5340012,0xA3340005,0xA1340012,0x1D80033,0xA92C0009,0xA3300009,0xA7280012,0xA3280002, -0xA12C0012,0x71F80033,0xA31C0009,0xA1180012,0x9E000033,0x1D80033,0xA92C0009,0xA3300009,0xA7280012,0xA3280002,0xA12C0012,0x71F80033,0xA31C0009,0xA1180012,0x9E000033,0x71F80033,0xA31C0009,0xA1180012,0x9E000033,0x9E000033,0xD92C0001,0xF3340014,0xF538000C,0xB52C0001,0xAD280002,0xA7280002,0xA32C0002,0xA3240002,0xD7200001,0xB3280001,0xA5240008,0xA1180012, 
-0x4BFC0033,0x1340034,0x1340034,0x1340034,0x1340034,0xA9300014,0xA9300014,0xA9300014,0xA1300014,0xA1300014,0x9D2C0015,0xAB280008,0xAB280008,0xAB280008,0xA3280001,0xA3280001,0x9F280005,0x9F28000A,0x9F28000A,0x9D280001,0x9B28000A,0x3C40033,0x3C40033,0x3C40033,0xA5200012,0xA5200012,0x9D280013,0xA31C0009,0xA31C0009,0x9D200002,0x9B20000A,0x67FC0033, -0x67FC0033,0x9D140013,0x9B0C000A,0x98000033,0xDD280001,0xF1300014,0x1340034,0xB9280001,0xAD280001,0xA7280001,0xA5280001,0xA1280001,0xD5200000,0xB7240000,0xA3280009,0x9D200002,0x3FFC0033,0x1380008,0x1380008,0x1380008,0x1380008,0xA5340000,0xA5340000,0xA5340000,0x9F340001,0x9F340001,0x9D340001,0x1D00008,0x1D00008,0x1D00008,0x9F300001,0x9F300001, -0x9D300001,0x6DF80008,0x6DF80008,0x9D280001,0x9A00000A,0x1D00008,0x1D00008,0x1D00008,0x9F300001,0x9F300001,0x9D300001,0x6DF80008,0x6DF80008,0x9D280001,0x9A00000A,0x6DF80008,0x6DF80008,0x9D280001,0x9A00000A,0x9A00000A,0xCF2C0000,0xD3340000,0x1380008,0xB32C0000,0xA92C0000,0xA52C0000,0xA1300001,0xA12C0000,0xCB240000,0xB1280000,0x45FC0008,0x9D280001, -0x45FC0008,0x1400014,0xAF3C0000,0xA73C0000,0xA33C0001,0x3DC0012,0xA7340001,0xA3380001,0x73FC0012,0xA3280001,0xA0000012,0x3DC0012,0xA7340001,0xA3380001,0x73FC0012,0xA3280001,0xA0000012,0x73FC0012,0xA3280001,0xA0000012,0xA0000012,0x3DC0012,0xA7340001,0xA3380001,0x73FC0012,0xA3280001,0xA0000012,0x73FC0012,0xA3280001,0xA0000012,0xA0000012,0x73FC0012, -0xA3280001,0xA0000012,0xA0000012,0xA0000012,0xE9240000,0x1580012,0xF73C0000,0xBB280000,0xAD280001,0xA9240000,0xA32C0001,0xA3200001,0xDF1C0000,0xBB200000,0xA52C0000,0xA0000012,0x4FFC0012,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0x12C0014,0xA5280000,0xA5280000,0xA5280000,0xA5280000,0xA5280000, -0xA5280000,0x9D280000,0x9D280000,0x9D280000,0x99280001,0x1C00012,0x1C00012,0x1C00012,0x1C00012,0x1C00012,0x1C00012,0x9D200001,0x9D200001,0x9D200001,0x99240001,0x65F80012,0x65F80012,0x65F80012,0x99140001,0x96000012,0xED280000,0x12C0014,0x12C0014,0xBD280000,0xB1280000,0xA9280000,0xA9280000,0xA3280000,0xD5200000,0xB7240000,0x9F280001,0x9D200001, -0x3DF80012,}; -static const uint32_t g_etc1_to_bc7_m6_table20[] = { -0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x6DFC0000, -0x6DFC0000,0x6DFC0000,0x6DFC0000,0x9C000000,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0xF4C0000,0xF4C0000,0xF4C0000,0x3D00000,0x47FC0000,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000, -0x1E40000,0x77F80000,0x77F80000,0x77F80000,0xA2000000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x77F80000,0x77F80000,0x77F80000,0xA2000000,0x77F80000,0x77F80000,0x77F80000,0xA2000000,0xA2000000,0x15C0000,0x1440001,0x1440001,0x17C0000,0x1980000,0x1BC0000,0x1BC0000,0x1DFC0000,0x17C0000,0x1980000,0x53FC0000,0x77F80000, -0x53FC0000,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x1F00000,0x1F00000,0x1F00000,0x7DF80000,0x7DF80000,0xA6000000,0x1F00000,0x1F00000,0x1F00000,0x7DF80000,0x7DF80000,0xA6000000,0x7DF80000,0x7DF80000,0xA6000000,0xA6000000,0x1F00000,0x1F00000,0x1F00000,0x7DF80000,0x7DF80000,0xA6000000,0x7DF80000,0x7DF80000,0xA6000000,0xA6000000,0x7DF80000, 
-0x7DF80000,0xA6000000,0xA6000000,0xA6000000,0x1840000,0x1640000,0x14C0001,0x3C40000,0x27FC0000,0x5BFC0000,0x6BFC0000,0x87FC0000,0x5A00000,0x1F00000,0x5BFC0000,0xA6000000,0x5BFC0000,0x1580001,0x7FC0000,0x85FC0000,0xAC000000,0x7FC0000,0x85FC0000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0x7FC0000,0x85FC0000,0xAC000000,0x85FC0000,0xAC000000, -0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0x7FC0000,0x85FC0000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0xAC000000,0x3B00000,0x1700000,0x1700000,0x37FC0000,0x75FC0000,0x99F40000,0xAC000000,0xAC000000,0x3D40000,0x51FC0000,0xA9C40000,0xAC000000, -0x65FC0000,0x148008C,0xC3440033,0xB3440033,0xAD440033,0xBD400024,0xB3400013,0xAF40001A,0xAF400024,0xAB400016,0xA9400026,0xBB3C0034,0xB33C000A,0xAF3C000F,0xB1380013,0xAD380002,0xA93C0016,0xAD3C0034,0xAB38000F,0xA9380019,0xA7380035,0x3E80088,0xB7300033,0xAD380033,0xB1300026,0xAD340012,0xA9380026,0xB3240033,0xAD280009,0xA92C0015,0xA7300034,0x79FC0088, -0xAD140033,0xA91C0026,0xA7100034,0xA4000088,0xE73C0002,0xFB440034,0xFD480044,0xC13C0002,0xB53C0002,0xAF3C0002,0xAD3C0002,0xAD380002,0xE3300001,0xC3340001,0xAD380009,0xA92C0015,0x57FC0088,0x1500033,0xBB4C000A,0xB148000A,0xAD48000A,0xB5480013,0xB1440002,0xAD480001,0xAD480013,0xAD440005,0xA9440015,0x1F40033,0xB33C0009,0xAD40000A,0xB1380012,0xAD380001, -0xA9400014,0x7FF80033,0xAD280008,0xA92C0014,0xA6000034,0x1F40033,0xB33C0009,0xAD40000A,0xB1380012,0xAD380001,0xA9400014,0x7FF80033,0xAD280008,0xA92C0014,0xA6000034,0x7FF80033,0xAD280008,0xA92C0014,0xA6000034,0xA6000034,0xE73C0001,0xFD480013,0xFF4C000B,0xC13C0001,0xB53C0001,0xAF3C0001,0xAD3C0001,0xAD380001,0xE3300001,0xBF380001,0xAD380009,0xA92C0014, -0x5DF80033,0x1440033,0x1440033,0x1440033,0x1440033,0xB5400012,0xB5400012,0xB5400012,0xAB400012,0xAB400012,0xA7400012,0xB33C0009,0xB33C0009,0xB33C0009,0xAB3C0002,0xAB3C0002,0xA73C0005,0xA9380009,0xA9380009,0xA7380001,0xA5380009,0x1E00033,0x1E00033,0x1E00033,0xAD340012,0xAD340012,0xA7380012,0xAB300009,0xAB300009,0xA7300001,0xA5300009,0x75FC0033, -0x75FC0033,0xA7240012,0xA5180008,0xA2000034,0xDB3C0002,0xF9400013,0x1440033,0xBF3C0001,0xB53C0001,0xAF3C0001,0xAD3C0002,0xAB380002,0xE3300000,0xBD380001,0xAD380008,0xA7300001,0x51FC0033,0x148000A,0x148000A,0x148000A,0x148000A,0xAD480001,0xAD480001,0xAD480001,0xA9440001,0xA9440001,0xA7440001,0x3E80008,0x3E80008,0x3E80008,0xA9400001,0xA9400001, -0xA7400001,0x79FC0008,0x79FC0008,0xA7340000,0xA4000008,0x3E80008,0x3E80008,0x3E80008,0xA9400001,0xA9400001,0xA7400001,0x79FC0008,0x79FC0008,0xA7340000,0xA4000008,0x79FC0008,0x79FC0008,0xA7340000,0xA4000008,0xA4000008,0xCF400000,0xCD480001,0x148000A,0xBF3C0000,0xB1400000,0xAD400000,0xAB400000,0xA9400001,0xD1380000,0xB93C0000,0x57FC0008,0xA7340000, -0x57FC0008,0x1540012,0xB7500001,0xAF500001,0xAD4C0001,0x1F80012,0xB1440001,0xAD480000,0x81FC0012,0xAD340000,0xA8000014,0x1F80012,0xB1440001,0xAD480000,0x81FC0012,0xAD340000,0xA8000014,0x81FC0012,0xAD340000,0xA8000014,0xA8000014,0x1F80012,0xB1440001,0xAD480000,0x81FC0012,0xAD340000,0xA8000014,0x81FC0012,0xAD340000,0xA8000014,0xA8000014,0x81FC0012, -0xAD340000,0xA8000014,0xA8000014,0xA8000014,0xF7340000,0xD680012,0xF1500001,0xC5380000,0xB53C0001,0xB1380000,0xAD3C0000,0xAD2C0000,0xEB2C0000,0xC3340000,0xAD400001,0xA8000014,0x5FFC0012,0x1400012,0x1400012,0x1400012,0x1400012,0x1400012,0x1400012,0x1400012,0x1400012,0x1400012,0x1400012,0xAD3C0001,0xAD3C0001,0xAD3C0001,0xAD3C0001,0xAD3C0001, 
-0xAD3C0001,0xA53C0001,0xA53C0001,0xA53C0001,0xA3380001,0x1DC0012,0x1DC0012,0x1DC0012,0x1DC0012,0x1DC0012,0x1DC0012,0xA7300001,0xA7300001,0xA7300001,0xA3340000,0x73F80012,0x73F80012,0x73F80012,0xA3200000,0x9E000014,0xE73C0001,0x1400012,0x1400012,0xBF3C0001,0xB53C0001,0xAF3C0001,0xAF3C0001,0xAB3C0001,0xE3300000,0xC5340000,0xA9380000,0xA7300001, -0x4DFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table21[] = { -0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000, -0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1600000,0x1600000,0x1600000,0x3E80000,0x57FC0000,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000, -0x1FC0000,0x83F80000,0x83F80000,0x83F80000,0xAA000000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x83F80000,0x83F80000,0x83F80000,0xAA000000,0x83F80000,0x83F80000,0x83F80000,0xAA000000,0xAA000000,0x16C0000,0x1540001,0x1540001,0x38C0000,0x1AC0000,0x1D00000,0x1D00000,0x31FC0000,0x38C0000,0x1AC0000,0x61FC0000,0x83F80000, -0x61FC0000,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0xDFC0000,0xDFC0000,0xDFC0000,0x89F80000,0x89F80000,0xAE000000,0xDFC0000,0xDFC0000,0xDFC0000,0x89F80000,0x89F80000,0xAE000000,0x89F80000,0x89F80000,0xAE000000,0xAE000000,0xDFC0000,0xDFC0000,0xDFC0000,0x89F80000,0x89F80000,0xAE000000,0x89F80000,0x89F80000,0xAE000000,0xAE000000,0x89F80000, -0x89F80000,0xAE000000,0xAE000000,0xAE000000,0x1980000,0x3740000,0x15C0001,0x1DC0000,0x3BFC0000,0x69FC0000,0x79FC0000,0x93F80000,0x5B40000,0xDFC0000,0x69FC0000,0xAE000000,0x69FC0000,0x1680001,0x1FFC0000,0x91FC0000,0xB4000000,0x1FFC0000,0x91FC0000,0xB4000000,0x91FC0000,0xB4000000,0xB4000000,0x1FFC0000,0x91FC0000,0xB4000000,0x91FC0000,0xB4000000, -0xB4000000,0x91FC0000,0xB4000000,0xB4000000,0xB4000000,0x1FFC0000,0x91FC0000,0xB4000000,0x91FC0000,0xB4000000,0xB4000000,0x91FC0000,0xB4000000,0xB4000000,0xB4000000,0x91FC0000,0xB4000000,0xB4000000,0xB4000000,0xB4000000,0x3C40000,0x9800000,0x9800000,0x4BFC0000,0x83FC0000,0xA3F40000,0xB4000000,0xB4000000,0x1EC0000,0x63FC0000,0xB1D40000,0xB4000000, -0x75FC0000,0x158008C,0xCB540033,0xBB540033,0xB5540033,0xC5500024,0xBB500013,0xB750001A,0xB7500024,0xB3500016,0xB1500026,0xC34C0034,0xBB4C000A,0xB74C000F,0xB9480013,0xB5480002,0xB14C0016,0xB54C0034,0xB348000F,0xB1480019,0xAF480035,0x7FC0088,0xBF400033,0xB5480033,0xB9400026,0xB5440012,0xB1480026,0xBB340033,0xB5380009,0xB13C0015,0xAF400034,0x85FC0088, -0xB5240033,0xB12C0026,0xAF200034,0xAC000088,0xEF4C0002,0xF354003E,0xF558004B,0xC94C0002,0xBD4C0002,0xB74C0002,0xB54C0002,0xB5480002,0xEB400001,0xCB440001,0xB5480009,0xB13C0015,0x65FC0088,0x1600033,0xC35C000A,0xB958000A,0xB558000A,0xBD580013,0xB9540002,0xB5580001,0xB5580013,0xB5540005,0xB1540015,0xFFC0033,0xBB4C0009,0xB550000A,0xB9480012,0xB5480001, -0xB1500014,0x8BF80033,0xB5380008,0xB13C0014,0xAE000034,0xFFC0033,0xBB4C0009,0xB550000A,0xB9480012,0xB5480001,0xB1500014,0x8BF80033,0xB5380008,0xB13C0014,0xAE000034,0x8BF80033,0xB5380008,0xB13C0014,0xAE000034,0xAE000034,0xEF4C0001,0xF5580015,0xF75C000E,0xC94C0001,0xBD4C0001,0xB74C0001,0xB54C0001,0xB5480001,0xEB400001,0xC7480001,0xB5480009,0xB13C0014, 
-0x6BFC0033,0x1540033,0x1540033,0x1540033,0x1540033,0xBD500012,0xBD500012,0xBD500012,0xB3500012,0xB3500012,0xAF500012,0xBB4C0009,0xBB4C0009,0xBB4C0009,0xB34C0002,0xB34C0002,0xAF4C0005,0xB1480009,0xB1480009,0xAF480001,0xAD480009,0x1F80033,0x1F80033,0x1F80033,0xB5440012,0xB5440012,0xAF480012,0xB3400009,0xB3400009,0xAF400001,0xAD400009,0x81FC0033, -0x81FC0033,0xAF340012,0xAD280008,0xAA000034,0xE34C0002,0xF1500016,0x1540033,0xC74C0001,0xBD4C0001,0xB74C0001,0xB54C0002,0xB3480002,0xEB400000,0xC5480001,0xB5480008,0xAF400001,0x5FFC0033,0x158000A,0x158000A,0x158000A,0x158000A,0xB5580001,0xB5580001,0xB5580001,0xB1540001,0xB1540001,0xAF540001,0x7FC0008,0x7FC0008,0x7FC0008,0xB1500001,0xB1500001, -0xAF500001,0x85FC0008,0x85FC0008,0xAF440000,0xAC000008,0x7FC0008,0x7FC0008,0x7FC0008,0xB1500001,0xB1500001,0xAF500001,0x85FC0008,0x85FC0008,0xAF440000,0xAC000008,0x85FC0008,0x85FC0008,0xAF440000,0xAC000008,0xAC000008,0xD7500000,0xD5580001,0x158000A,0xC74C0000,0xB9500000,0xB5500000,0xB3500000,0xB1500001,0xD9480000,0xC14C0000,0x65FC0008,0xAF440000, -0x65FC0008,0x1640012,0xBF600001,0xB7600001,0xB55C0001,0x15FC0012,0xB9540001,0xB5580000,0x8DFC0012,0xB5440000,0xB0000014,0x15FC0012,0xB9540001,0xB5580000,0x8DFC0012,0xB5440000,0xB0000014,0x8DFC0012,0xB5440000,0xB0000014,0xB0000014,0x15FC0012,0xB9540001,0xB5580000,0x8DFC0012,0xB5440000,0xB0000014,0x8DFC0012,0xB5440000,0xB0000014,0xB0000014,0x8DFC0012, -0xB5440000,0xB0000014,0xB0000014,0xB0000014,0xFF440000,0x17C0012,0xF9600001,0xCD480000,0xBD4C0001,0xB9480000,0xB54C0000,0xB53C0000,0xF33C0000,0xCB440000,0xB5500001,0xB0000014,0x6FFC0012,0x1500012,0x1500012,0x1500012,0x1500012,0x1500012,0x1500012,0x1500012,0x1500012,0x1500012,0x1500012,0xB54C0001,0xB54C0001,0xB54C0001,0xB54C0001,0xB54C0001, -0xB54C0001,0xAD4C0001,0xAD4C0001,0xAD4C0001,0xAB480001,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0xAF400001,0xAF400001,0xAF400001,0xAB440000,0x7FF80012,0x7FF80012,0x7FF80012,0xAB300000,0xA6000014,0xEF4C0001,0x1500012,0x1500012,0xC74C0001,0xBD4C0001,0xB74C0001,0xB74C0001,0xB34C0001,0xEB400000,0xCD440000,0xB1480000,0xAF400001, -0x5DF80012,}; -static const uint32_t g_etc1_to_bc7_m6_table22[] = { -0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x85FC0000, -0x85FC0000,0x85FC0000,0x85FC0000,0xAC000000,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1700000,0x1700000,0x1700000,0x7FC0000,0x65FC0000,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000, -0x19FC0000,0x8FF80000,0x8FF80000,0x8FF80000,0xB2000000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x8FF80000,0x8FF80000,0x8FF80000,0xB2000000,0x8FF80000,0x8FF80000,0x8FF80000,0xB2000000,0xB2000000,0x77C0000,0x1640001,0x1640001,0x1A00000,0x1C00000,0x1E80000,0x1E80000,0x45FC0000,0x1A00000,0x1C00000,0x71FC0000,0x8FF80000, -0x71FC0000,0x16C0001,0x16C0001,0x16C0001,0x16C0001,0x25FC0000,0x25FC0000,0x25FC0000,0x95F80000,0x95F80000,0xB6000000,0x25FC0000,0x25FC0000,0x25FC0000,0x95F80000,0x95F80000,0xB6000000,0x95F80000,0x95F80000,0xB6000000,0xB6000000,0x25FC0000,0x25FC0000,0x25FC0000,0x95F80000,0x95F80000,0xB6000000,0x95F80000,0x95F80000,0xB6000000,0xB6000000,0x95F80000, 
-0x95F80000,0xB6000000,0xB6000000,0xB6000000,0x3A80000,0xB840000,0x16C0001,0x3F00000,0x4FFC0000,0x79FC0000,0x87F80000,0x9DFC0000,0x1CC0000,0x25FC0000,0x79FC0000,0xB6000000,0x79FC0000,0x1780001,0x37FC0000,0x9DFC0000,0xBC000000,0x37FC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0xBC000000,0xBC000000,0x37FC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0xBC000000, -0xBC000000,0x9DFC0000,0xBC000000,0xBC000000,0xBC000000,0x37FC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0xBC000000,0xBC000000,0x9DFC0000,0xBC000000,0xBC000000,0xBC000000,0x9DFC0000,0xBC000000,0xBC000000,0xBC000000,0xBC000000,0x3D80000,0x1940000,0x1940000,0x5DFC0000,0x91FC0000,0xADF40000,0xBC000000,0xBC000000,0x9FC0000,0x73FC0000,0xB9E40000,0xBC000000, -0x83FC0000,0x168008C,0xD3640033,0xC3640033,0xBD640033,0xCD600024,0xC3600013,0xBF60001A,0xBF600024,0xBB600016,0xB9600026,0xCB5C0034,0xC35C000A,0xBF5C000F,0xC1580013,0xBD580002,0xB95C0016,0xBD5C0034,0xBB58000F,0xB9580019,0xB7580035,0x1FFC0088,0xC7500033,0xBD580033,0xC1500026,0xBD540012,0xB9580026,0xC3440033,0xBD480009,0xB94C0015,0xB7500034,0x91FC0088, -0xBD340033,0xB93C0026,0xB7300034,0xB4000088,0xF75C0002,0xFB64003E,0xFD68004B,0xD15C0002,0xC55C0002,0xBF5C0002,0xBD5C0002,0xBD580002,0xF3500001,0xD3540001,0xBD580009,0xB94C0015,0x75FC0088,0x1700033,0xCB6C000A,0xC168000A,0xBD68000A,0xC5680013,0xC1640002,0xBD680001,0xBD680013,0xBD640005,0xB9640015,0x29FC0033,0xC35C0009,0xBD60000A,0xC1580012,0xBD580001, -0xB9600014,0x97F80033,0xBD480008,0xB94C0014,0xB6000034,0x29FC0033,0xC35C0009,0xBD60000A,0xC1580012,0xBD580001,0xB9600014,0x97F80033,0xBD480008,0xB94C0014,0xB6000034,0x97F80033,0xBD480008,0xB94C0014,0xB6000034,0xB6000034,0xF75C0001,0xFD680015,0xFF6C000E,0xD15C0001,0xC55C0001,0xBF5C0001,0xBD5C0001,0xBD580001,0xF3500001,0xCF580001,0xBD580009,0xB94C0014, -0x7BFC0033,0x1640033,0x1640033,0x1640033,0x1640033,0xC5600012,0xC5600012,0xC5600012,0xBB600012,0xBB600012,0xB7600012,0xC35C0009,0xC35C0009,0xC35C0009,0xBB5C0002,0xBB5C0002,0xB75C0005,0xB9580009,0xB9580009,0xB7580001,0xB5580009,0x15FC0033,0x15FC0033,0x15FC0033,0xBD540012,0xBD540012,0xB7580012,0xBB500009,0xBB500009,0xB7500001,0xB5500009,0x8DFC0033, -0x8DFC0033,0xB7440012,0xB5380008,0xB2000034,0xEB5C0002,0xF9600016,0x1640033,0xCF5C0001,0xC55C0001,0xBF5C0001,0xBD5C0002,0xBB580002,0xF3500000,0xCD580001,0xBD580008,0xB7500001,0x6FFC0033,0x168000A,0x168000A,0x168000A,0x168000A,0xBD680001,0xBD680001,0xBD680001,0xB9640001,0xB9640001,0xB7640001,0x1FFC0008,0x1FFC0008,0x1FFC0008,0xB9600001,0xB9600001, -0xB7600001,0x91FC0008,0x91FC0008,0xB7540000,0xB4000008,0x1FFC0008,0x1FFC0008,0x1FFC0008,0xB9600001,0xB9600001,0xB7600001,0x91FC0008,0x91FC0008,0xB7540000,0xB4000008,0x91FC0008,0x91FC0008,0xB7540000,0xB4000008,0xB4000008,0xDF600000,0xDD680001,0x168000A,0xCF5C0000,0xC1600000,0xBD600000,0xBB600000,0xB9600001,0xE1580000,0xC95C0000,0x75FC0008,0xB7540000, -0x75FC0008,0x1740012,0xC7700001,0xBF700001,0xBD6C0001,0x2FFC0012,0xC1640001,0xBD680000,0x99FC0012,0xBD540000,0xB8000014,0x2FFC0012,0xC1640001,0xBD680000,0x99FC0012,0xBD540000,0xB8000014,0x99FC0012,0xBD540000,0xB8000014,0xB8000014,0x2FFC0012,0xC1640001,0xBD680000,0x99FC0012,0xBD540000,0xB8000014,0x99FC0012,0xBD540000,0xB8000014,0xB8000014,0x99FC0012, -0xBD540000,0xB8000014,0xB8000014,0xB8000014,0xF15C0001,0x18C0012,0xF1700002,0xD5580000,0xC55C0001,0xC1580000,0xBD5C0000,0xBD4C0000,0xFB4C0000,0xD3540000,0xBD600001,0xB8000014,0x7FF80012,0x1600012,0x1600012,0x1600012,0x1600012,0x1600012,0x1600012,0x1600012,0x1600012,0x1600012,0x1600012,0xBD5C0001,0xBD5C0001,0xBD5C0001,0xBD5C0001,0xBD5C0001, 
-0xBD5C0001,0xB55C0001,0xB55C0001,0xB55C0001,0xB3580001,0xFFC0012,0xFFC0012,0xFFC0012,0xFFC0012,0xFFC0012,0xFFC0012,0xB7500001,0xB7500001,0xB7500001,0xB3540000,0x8BF80012,0x8BF80012,0x8BF80012,0xB3400000,0xAE000014,0xF75C0001,0x1600012,0x1600012,0xCF5C0001,0xC55C0001,0xBF5C0001,0xBF5C0001,0xBB5C0001,0xF3500000,0xD5540000,0xB9580000,0xB7500001, -0x6BFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table23[] = { -0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x91FC0000, -0x91FC0000,0x91FC0000,0x91FC0000,0xB4000000,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x9800000,0x9800000,0x9800000,0x1FFC0000,0x75FC0000,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000, -0x31FC0000,0x9BF80000,0x9BF80000,0x9BF80000,0xBA000000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x9BF80000,0x9BF80000,0x9BF80000,0xBA000000,0x9BF80000,0x9BF80000,0x9BF80000,0xBA000000,0xBA000000,0xF8C0000,0x1740001,0x1740001,0x1B40000,0x1D40000,0x1FC0000,0x1FC0000,0x59FC0000,0x1B40000,0x1D40000,0x7FFC0000,0x9BF80000, -0x7FFC0000,0x17C0001,0x17C0001,0x17C0001,0x17C0001,0x3DFC0000,0x3DFC0000,0x3DFC0000,0xA1F80000,0xA1F80000,0xBE000000,0x3DFC0000,0x3DFC0000,0x3DFC0000,0xA1F80000,0xA1F80000,0xBE000000,0xA1F80000,0xA1F80000,0xBE000000,0xBE000000,0x3DFC0000,0x3DFC0000,0x3DFC0000,0xA1F80000,0xA1F80000,0xBE000000,0xA1F80000,0xA1F80000,0xBE000000,0xBE000000,0xA1F80000, -0xA1F80000,0xBE000000,0xBE000000,0xBE000000,0x1BC0000,0x1980000,0x17C0001,0xFFC0000,0x63FC0000,0x87FC0000,0x95F80000,0xA9F40000,0x1E00000,0x3DFC0000,0x87FC0000,0xBE000000,0x87FC0000,0x1880001,0x4FFC0000,0xA9FC0000,0xC4000000,0x4FFC0000,0xA9FC0000,0xC4000000,0xA9FC0000,0xC4000000,0xC4000000,0x4FFC0000,0xA9FC0000,0xC4000000,0xA9FC0000,0xC4000000, -0xC4000000,0xA9FC0000,0xC4000000,0xC4000000,0xC4000000,0x4FFC0000,0xA9FC0000,0xC4000000,0xA9FC0000,0xC4000000,0xC4000000,0xA9FC0000,0xC4000000,0xC4000000,0xC4000000,0xA9FC0000,0xC4000000,0xC4000000,0xC4000000,0xC4000000,0x3EC0000,0x1A40000,0x1A40000,0x71FC0000,0x9FF80000,0xB7F40000,0xC4000000,0xC4000000,0x27FC0000,0x85FC0000,0xC1F40000,0xC4000000, -0x93FC0000,0x178008C,0xDB740033,0xCB740033,0xC5740033,0xD5700024,0xCB700013,0xC770001A,0xC7700024,0xC3700016,0xC1700026,0xD36C0034,0xCB6C000A,0xC76C000F,0xC9680013,0xC5680002,0xC16C0016,0xC56C0034,0xC368000F,0xC1680019,0xBF680035,0x37FC0088,0xCF600033,0xC5680033,0xC9600026,0xC5640012,0xC1680026,0xCB540033,0xC5580009,0xC15C0015,0xBF600034,0x9DFC0088, -0xC5440033,0xC14C0026,0xBF400034,0xBC000088,0xFF6C0002,0xF374004C,0xF5780054,0xD96C0002,0xCD6C0002,0xC76C0002,0xC56C0002,0xC5680002,0xFB600001,0xDB640001,0xC5680009,0xC15C0015,0x83FC0088,0x1800033,0xD37C000A,0xC978000A,0xC578000A,0xCD780013,0xC9740002,0xC5780001,0xC5780013,0xC5740005,0xC1740015,0x41FC0033,0xCB6C0009,0xC570000A,0xC9680012,0xC5680001, -0xC1700014,0xA1FC0033,0xC5580008,0xC15C0014,0xBE000034,0x41FC0033,0xCB6C0009,0xC570000A,0xC9680012,0xC5680001,0xC1700014,0xA1FC0033,0xC5580008,0xC15C0014,0xBE000034,0xA1FC0033,0xC5580008,0xC15C0014,0xBE000034,0xBE000034,0xFF6C0001,0xF578001B,0xF77C0013,0xD96C0001,0xCD6C0001,0xC76C0001,0xC56C0001,0xC5680001,0xFB600001,0xD7680001,0xC5680009,0xC15C0014, 
-0x89FC0033,0x1740033,0x1740033,0x1740033,0x1740033,0xCD700012,0xCD700012,0xCD700012,0xC3700012,0xC3700012,0xBF700012,0xCB6C0009,0xCB6C0009,0xCB6C0009,0xC36C0002,0xC36C0002,0xBF6C0005,0xC1680009,0xC1680009,0xBF680001,0xBD680009,0x2FFC0033,0x2FFC0033,0x2FFC0033,0xC5640012,0xC5640012,0xBF680012,0xC3600009,0xC3600009,0xBF600001,0xBD600009,0x99FC0033, -0x99FC0033,0xBF540012,0xBD480008,0xBA000034,0xF36C0002,0xF170001B,0x1740033,0xD76C0001,0xCD6C0001,0xC76C0001,0xC56C0002,0xC3680002,0xFB600000,0xD5680001,0xC5680008,0xBF600001,0x7FF80033,0x178000A,0x178000A,0x178000A,0x178000A,0xC5780001,0xC5780001,0xC5780001,0xC1740001,0xC1740001,0xBF740001,0x37FC0008,0x37FC0008,0x37FC0008,0xC1700001,0xC1700001, -0xBF700001,0x9DFC0008,0x9DFC0008,0xBF640000,0xBC000008,0x37FC0008,0x37FC0008,0x37FC0008,0xC1700001,0xC1700001,0xBF700001,0x9DFC0008,0x9DFC0008,0xBF640000,0xBC000008,0x9DFC0008,0x9DFC0008,0xBF640000,0xBC000008,0xBC000008,0xE7700000,0xE5780001,0x178000A,0xD76C0000,0xC9700000,0xC5700000,0xC3700000,0xC1700001,0xE9680000,0xD16C0000,0x83FC0008,0xBF640000, -0x83FC0008,0x1840012,0xCF800001,0xC7800001,0xC57C0001,0x47FC0012,0xC9740001,0xC5780000,0xA5F80012,0xC5640000,0xC0000014,0x47FC0012,0xC9740001,0xC5780000,0xA5F80012,0xC5640000,0xC0000014,0xA5F80012,0xC5640000,0xC0000014,0xC0000014,0x47FC0012,0xC9740001,0xC5780000,0xA5F80012,0xC5640000,0xC0000014,0xA5F80012,0xC5640000,0xC0000014,0xC0000014,0xA5F80012, -0xC5640000,0xC0000014,0xC0000014,0xC0000014,0xF96C0001,0x79C0012,0xF9800002,0xDD680000,0xCD6C0001,0xC9680000,0xC56C0000,0xC55C0000,0xF9640001,0xDB640000,0xC5700001,0xC0000014,0x8DFC0012,0x1700012,0x1700012,0x1700012,0x1700012,0x1700012,0x1700012,0x1700012,0x1700012,0x1700012,0x1700012,0xC56C0001,0xC56C0001,0xC56C0001,0xC56C0001,0xC56C0001, -0xC56C0001,0xBD6C0001,0xBD6C0001,0xBD6C0001,0xBB680001,0x29FC0012,0x29FC0012,0x29FC0012,0x29FC0012,0x29FC0012,0x29FC0012,0xBF600001,0xBF600001,0xBF600001,0xBB640000,0x97F80012,0x97F80012,0x97F80012,0xBB500000,0xB6000014,0xFF6C0001,0x1700012,0x1700012,0xD76C0001,0xCD6C0001,0xC76C0001,0xC76C0001,0xC36C0001,0xFB600000,0xDD640000,0xC1680000,0xBF600001, -0x7BFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table24[] = { -0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x9FF80000, -0x9FF80000,0x9FF80000,0x9FF80000,0xBC000001,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x3940000,0x3940000,0x3940000,0x3BFC0000,0x85FC0000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000, -0x4DFC0000,0xA7FC0000,0xA7FC0000,0xA7FC0000,0xC2000001,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0xA7FC0000,0xA7FC0000,0xA7FC0000,0xC2000001,0xA7FC0000,0xA7FC0000,0xA7FC0000,0xC2000001,0xC2000001,0x9A00000,0x1880000,0x1880000,0x1C80000,0x1EC0000,0x23FC0000,0x23FC0000,0x6FFC0000,0x1C80000,0x1EC0000,0x91FC0000,0xA7FC0000, -0x91FC0000,0x1900000,0x1900000,0x1900000,0x1900000,0x59FC0000,0x59FC0000,0x59FC0000,0xADFC0000,0xADFC0000,0xC6000001,0x59FC0000,0x59FC0000,0x59FC0000,0xADFC0000,0xADFC0000,0xC6000001,0xADFC0000,0xADFC0000,0xC6000001,0xC6000001,0x59FC0000,0x59FC0000,0x59FC0000,0xADFC0000,0xADFC0000,0xC6000001,0xADFC0000,0xADFC0000,0xC6000001,0xC6000001,0xADFC0000, 
-0xADFC0000,0xC6000001,0xC6000001,0xC6000001,0x1D00000,0xDA80000,0x1900000,0x31FC0000,0x79FC0000,0x99FC0000,0xA3FC0000,0xB5F80000,0x3F40000,0x59FC0000,0x99FC0000,0xC6000001,0x99FC0000,0x19C0000,0x6BFC0000,0xB7F80000,0xCC000001,0x6BFC0000,0xB7F80000,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0x6BFC0000,0xB7F80000,0xCC000001,0xB7F80000,0xCC000001, -0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0x6BFC0000,0xB7F80000,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0xCC000001,0x11FC0000,0x1B80000,0x1B80000,0x87FC0000,0xADFC0000,0xC3F00000,0xCC000001,0xCC000001,0x49FC0000,0x97FC0000,0xCBE80000,0xCC000001, -0xA3FC0000,0x18C0088,0xE1880034,0xD3880034,0xCD880035,0xDD840026,0xD3840015,0xCD840019,0xCF840026,0xCD800016,0xCB800026,0xDF7C0033,0xD57C0009,0xCF80000F,0xD17C0012,0xCD7C0002,0xCB7C0016,0xCF7C0033,0xCD78000F,0xCB78001A,0xC97C0033,0x53FC0088,0xD9700033,0xCD7C0034,0xD3700026,0xCF740013,0xCB780024,0xD3680033,0xCD70000A,0xCB6C0013,0xC9700033,0xABF80088, -0xCD5C0034,0xCB580024,0xC94C0033,0xC400008C,0xFF800006,0xFD880048,0xFD880058,0xE57C0001,0xD77C0002,0xD17C0002,0xCF7C0002,0xCD7C0002,0xFF740004,0xE1780001,0xCF780009,0xCB6C0013,0x95FC0088,0x1900034,0xDD8C0008,0xD18C0009,0xCD8C0009,0xD9880012,0xD1880001,0xCF880001,0xCF880012,0xCD880005,0xCB880012,0x5BFC0033,0xD3800009,0xCD840009,0xD17C0012,0xCD7C0002, -0xCB800012,0xAFFC0033,0xCD700009,0xCB6C0012,0xC8000033,0x5BFC0033,0xD3800009,0xCD840009,0xD17C0012,0xCD7C0002,0xCB800012,0xAFFC0033,0xCD700009,0xCB6C0012,0xC8000033,0xAFFC0033,0xCD700009,0xCB6C0012,0xC8000033,0xC8000033,0xFF800002,0xFF8C0018,0xFF8C0018,0xDF800001,0xD77C0002,0xD17C0002,0xCD800002,0xCD780002,0xFB780003,0xDD7C0001,0xCF780008,0xCB6C0012, -0x9BFC0033,0x1880034,0x1880034,0x1880034,0x1880034,0xD3840014,0xD3840014,0xD3840014,0xCB840014,0xCB840014,0xC7800015,0xD57C0008,0xD57C0008,0xD57C0008,0xCD7C0001,0xCD7C0001,0xC97C0005,0xC97C000A,0xC97C000A,0xC77C0001,0xC57C000A,0x49FC0033,0x49FC0033,0x49FC0033,0xCF740012,0xCF740012,0xC77C0013,0xCD700009,0xCD700009,0xC7740002,0xC574000A,0xA7F80033, -0xA7F80033,0xC7680013,0xC560000A,0xC2000033,0xF9800004,0xFB840018,0x1880034,0xE37C0001,0xD77C0001,0xD17C0001,0xCF7C0001,0xCB7C0001,0xFF740000,0xE1780000,0xCD7C0009,0xC7740002,0x8FFC0033,0x18C0008,0x18C0008,0x18C0008,0x18C0008,0xCF880000,0xCF880000,0xCF880000,0xC9880001,0xC9880001,0xC7880001,0x53FC0008,0x53FC0008,0x53FC0008,0xC9840001,0xC9840001, -0xC7840001,0xABF80008,0xABF80008,0xC77C0001,0xC400000A,0x53FC0008,0x53FC0008,0x53FC0008,0xC9840001,0xC9840001,0xC7840001,0xABF80008,0xABF80008,0xC77C0001,0xC400000A,0xABF80008,0xABF80008,0xC77C0001,0xC400000A,0xC400000A,0xF9800000,0xFD880000,0x18C0008,0xDD800000,0xD3800000,0xCF800000,0xCB840001,0xCB800000,0xF5780000,0xDB7C0000,0x95FC0008,0xC77C0001, -0x95FC0008,0x1940014,0xD9900000,0xD1900000,0xCD900001,0x63FC0012,0xD1880001,0xCD8C0001,0xB3F80012,0xCD7C0001,0xCA000012,0x63FC0012,0xD1880001,0xCD8C0001,0xB3F80012,0xCD7C0001,0xCA000012,0xB3F80012,0xCD7C0001,0xCA000012,0xCA000012,0x63FC0012,0xD1880001,0xCD8C0001,0xB3F80012,0xCD7C0001,0xCA000012,0xB3F80012,0xCD7C0001,0xCA000012,0xCA000012,0xB3F80012, -0xCD7C0001,0xCA000012,0xCA000012,0xCA000012,0xF7840002,0x1B00012,0xF3940005,0xE57C0000,0xD77C0001,0xD3780000,0xCD800001,0xCD740001,0xF57C0002,0xE5740000,0xCF800000,0xCA000012,0x9FF80012,0x1800014,0x1800014,0x1800014,0x1800014,0x1800014,0x1800014,0x1800014,0x1800014,0x1800014,0x1800014,0xCF7C0000,0xCF7C0000,0xCF7C0000,0xCF7C0000,0xCF7C0000, 
-0xCF7C0000,0xC77C0000,0xC77C0000,0xC77C0000,0xC37C0001,0x43FC0012,0x43FC0012,0x43FC0012,0x43FC0012,0x43FC0012,0x43FC0012,0xC7740001,0xC7740001,0xC7740001,0xC3780001,0xA3FC0012,0xA3FC0012,0xA3FC0012,0xC3680001,0xC0000012,0xF77C0004,0x1800014,0x1800014,0xE77C0000,0xDB7C0000,0xD37C0000,0xD37C0000,0xCD7C0000,0xFF740000,0xE1780000,0xC97C0001,0xC7740001, -0x8BFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table25[] = { -0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000, -0xABF80000,0xABF80000,0xABF80000,0xC4000001,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0xBA40000,0xBA40000,0xBA40000,0x53FC0000,0x95FC0000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000, -0x65FC0000,0xB3FC0000,0xB3FC0000,0xB3FC0000,0xCA000001,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0xB3FC0000,0xB3FC0000,0xB3FC0000,0xCA000001,0xB3FC0000,0xB3FC0000,0xB3FC0000,0xCA000001,0xCA000001,0x1B40000,0x1980000,0x1980000,0x5D80000,0x7FC0000,0x41FC0000,0x41FC0000,0x83FC0000,0x5D80000,0x7FC0000,0x9FFC0000,0xB3FC0000, -0x9FFC0000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x71FC0000,0x71FC0000,0x71FC0000,0xB9FC0000,0xB9FC0000,0xCE000001,0x71FC0000,0x71FC0000,0x71FC0000,0xB9FC0000,0xB9FC0000,0xCE000001,0xB9FC0000,0xB9FC0000,0xCE000001,0xCE000001,0x71FC0000,0x71FC0000,0x71FC0000,0xB9FC0000,0xB9FC0000,0xCE000001,0xB9FC0000,0xB9FC0000,0xCE000001,0xCE000001,0xB9FC0000, -0xB9FC0000,0xCE000001,0xCE000001,0xCE000001,0x1E40000,0x1BC0000,0x1A00000,0x4FFC0000,0x8DFC0000,0xA7FC0000,0xB1FC0000,0xBFFC0000,0x1BFC0000,0x71FC0000,0xA7FC0000,0xCE000001,0xA7FC0000,0x1AC0000,0x83FC0000,0xC3F80000,0xD4000001,0x83FC0000,0xC3F80000,0xD4000001,0xC3F80000,0xD4000001,0xD4000001,0x83FC0000,0xC3F80000,0xD4000001,0xC3F80000,0xD4000001, -0xD4000001,0xC3F80000,0xD4000001,0xD4000001,0xD4000001,0x83FC0000,0xC3F80000,0xD4000001,0xC3F80000,0xD4000001,0xD4000001,0xC3F80000,0xD4000001,0xD4000001,0xD4000001,0xC3F80000,0xD4000001,0xD4000001,0xD4000001,0xD4000001,0x37FC0000,0x1C80000,0x1C80000,0x9BFC0000,0xBBFC0000,0xCDF00000,0xD4000001,0xD4000001,0x67FC0000,0xA9FC0000,0xD3F80000,0xD4000001, -0xB3FC0000,0x19C0088,0xE9980034,0xDB980034,0xD5980035,0xE5940026,0xDB940015,0xD5940019,0xD7940026,0xD5900016,0xD3900026,0xE78C0033,0xDD8C0009,0xD790000F,0xD98C0012,0xD58C0002,0xD38C0016,0xD78C0033,0xD588000F,0xD388001A,0xD18C0033,0x6BFC0088,0xE1800033,0xD58C0034,0xDB800026,0xD7840013,0xD3880024,0xDB780033,0xD580000A,0xD37C0013,0xD1800033,0xB7F80088, -0xD56C0034,0xD3680024,0xD15C0033,0xCC00008C,0xFF90000E,0xF5980056,0xF79C0061,0xED8C0001,0xDF8C0002,0xD98C0002,0xD78C0002,0xD58C0002,0xFF880008,0xE9880001,0xD7880009,0xD37C0013,0xA3FC0088,0x1A00034,0xE59C0008,0xD99C0009,0xD59C0009,0xE1980012,0xD9980001,0xD7980001,0xD7980012,0xD5980005,0xD3980012,0x75FC0033,0xDB900009,0xD5940009,0xD98C0012,0xD58C0002, -0xD3900012,0xBBFC0033,0xD5800009,0xD37C0012,0xD0000033,0x75FC0033,0xDB900009,0xD5940009,0xD98C0012,0xD58C0002,0xD3900012,0xBBFC0033,0xD5800009,0xD37C0012,0xD0000033,0xBBFC0033,0xD5800009,0xD37C0012,0xD0000033,0xD0000033,0xFF940003,0xF79C001E,0xF9A00019,0xE7900001,0xDF8C0002,0xD98C0002,0xD5900002,0xD5880002,0xFF880004,0xE58C0001,0xD7880008,0xD37C0012, 
-0xA9FC0033,0x1980034,0x1980034,0x1980034,0x1980034,0xDB940014,0xDB940014,0xDB940014,0xD3940014,0xD3940014,0xCF900015,0xDD8C0008,0xDD8C0008,0xDD8C0008,0xD58C0001,0xD58C0001,0xD18C0005,0xD18C000A,0xD18C000A,0xCF8C0001,0xCD8C000A,0x63FC0033,0x63FC0033,0x63FC0033,0xD7840012,0xD7840012,0xCF8C0013,0xD5800009,0xD5800009,0xCF840002,0xCD84000A,0xB3F80033, -0xB3F80033,0xCF780013,0xCD70000A,0xCA000033,0xFF900005,0xF394001D,0x1980034,0xEB8C0001,0xDF8C0001,0xD98C0001,0xD78C0001,0xD38C0001,0xFB880002,0xE9880000,0xD58C0009,0xCF840002,0x9FF80033,0x19C0008,0x19C0008,0x19C0008,0x19C0008,0xD7980000,0xD7980000,0xD7980000,0xD1980001,0xD1980001,0xCF980001,0x6BFC0008,0x6BFC0008,0x6BFC0008,0xD1940001,0xD1940001, -0xCF940001,0xB7F80008,0xB7F80008,0xCF8C0001,0xCC00000A,0x6BFC0008,0x6BFC0008,0x6BFC0008,0xD1940001,0xD1940001,0xCF940001,0xB7F80008,0xB7F80008,0xCF8C0001,0xCC00000A,0xB7F80008,0xB7F80008,0xCF8C0001,0xCC00000A,0xCC00000A,0xEB940001,0xF5980001,0x19C0008,0xE5900000,0xDB900000,0xD7900000,0xD3940001,0xD3900000,0xFD880000,0xE38C0000,0xA3FC0008,0xCF8C0001, -0xA3FC0008,0x1A40014,0xE1A00000,0xD9A00000,0xD5A00001,0x7BFC0012,0xD9980001,0xD59C0001,0xBFF80012,0xD58C0001,0xD2000012,0x7BFC0012,0xD9980001,0xD59C0001,0xBFF80012,0xD58C0001,0xD2000012,0xBFF80012,0xD58C0001,0xD2000012,0xD2000012,0x7BFC0012,0xD9980001,0xD59C0001,0xBFF80012,0xD58C0001,0xD2000012,0xBFF80012,0xD58C0001,0xD2000012,0xD2000012,0xBFF80012, -0xD58C0001,0xD2000012,0xD2000012,0xD2000012,0xFF940002,0x9C00012,0xFBA40005,0xED8C0000,0xDF8C0001,0xDB880000,0xD5900001,0xD5840001,0xFD8C0002,0xED840000,0xD7900000,0xD2000012,0xADFC0012,0x1900014,0x1900014,0x1900014,0x1900014,0x1900014,0x1900014,0x1900014,0x1900014,0x1900014,0x1900014,0xD78C0000,0xD78C0000,0xD78C0000,0xD78C0000,0xD78C0000, -0xD78C0000,0xCF8C0000,0xCF8C0000,0xCF8C0000,0xCB8C0001,0x5BFC0012,0x5BFC0012,0x5BFC0012,0x5BFC0012,0x5BFC0012,0x5BFC0012,0xCF840001,0xCF840001,0xCF840001,0xCB880001,0xAFFC0012,0xAFFC0012,0xAFFC0012,0xCB780001,0xC8000012,0xFF8C0004,0x1900014,0x1900014,0xEF8C0000,0xE38C0000,0xDB8C0000,0xDB8C0000,0xD58C0000,0xFB880001,0xE9880000,0xD18C0001,0xCF840001, -0x9BFC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table26[] = { -0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0xB7F80000, -0xB7F80000,0xB7F80000,0xB7F80000,0xCC000001,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x1B80000,0x1B80000,0x1B80000,0x6BFC0000,0xA3FC0000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x7DFC0000,0x7DFC0000,0x7DFC0000,0x7DFC0000,0x7DFC0000, -0x7DFC0000,0xBFFC0000,0xBFFC0000,0xBFFC0000,0xD2000001,0x7DFC0000,0x7DFC0000,0x7DFC0000,0x7DFC0000,0x7DFC0000,0x7DFC0000,0xBFFC0000,0xBFFC0000,0xBFFC0000,0xD2000001,0xBFFC0000,0xBFFC0000,0xBFFC0000,0xD2000001,0xD2000001,0x1C40000,0x1A80000,0x1A80000,0x1EC0000,0x2DFC0000,0x5FFC0000,0x5FFC0000,0x97FC0000,0x1EC0000,0x2DFC0000,0xAFFC0000,0xBFFC0000, -0xAFFC0000,0x1B00000,0x1B00000,0x1B00000,0x1B00000,0x89FC0000,0x89FC0000,0x89FC0000,0xC5FC0000,0xC5FC0000,0xD6000001,0x89FC0000,0x89FC0000,0x89FC0000,0xC5FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xC5FC0000,0xD6000001,0xD6000001,0x89FC0000,0x89FC0000,0x89FC0000,0xC5FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xC5FC0000,0xD6000001,0xD6000001,0xC5FC0000, 
-0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0x5F40000,0x1CC0000,0x1B00000,0x6FFC0000,0xA1FC0000,0xB7FC0000,0xBFF80000,0xCBF80000,0x41FC0000,0x89FC0000,0xB7FC0000,0xD6000001,0xB7FC0000,0x1BC0000,0x9BFC0000,0xCFF80000,0xDC000001,0x9BFC0000,0xCFF80000,0xDC000001,0xCFF80000,0xDC000001,0xDC000001,0x9BFC0000,0xCFF80000,0xDC000001,0xCFF80000,0xDC000001, -0xDC000001,0xCFF80000,0xDC000001,0xDC000001,0xDC000001,0x9BFC0000,0xCFF80000,0xDC000001,0xCFF80000,0xDC000001,0xDC000001,0xCFF80000,0xDC000001,0xDC000001,0xDC000001,0xCFF80000,0xDC000001,0xDC000001,0xDC000001,0xDC000001,0x5FFC0000,0x5D80000,0x5D80000,0xAFFC0000,0xC9F80000,0xD7F00000,0xDC000001,0xDC000001,0x85FC0000,0xB9FC0000,0xDDCC0000,0xDC000001, -0xC1FC0000,0x1AC0088,0xF1A80034,0xE3A80034,0xDDA80035,0xEDA40026,0xE3A40015,0xDDA40019,0xDFA40026,0xDDA00016,0xDBA00026,0xEF9C0033,0xE59C0009,0xDFA0000F,0xE19C0012,0xDD9C0002,0xDB9C0016,0xDF9C0033,0xDD98000F,0xDB98001A,0xD99C0033,0x83FC0088,0xE9900033,0xDD9C0034,0xE3900026,0xDF940013,0xDB980024,0xE3880033,0xDD90000A,0xDB8C0013,0xD9900033,0xC3F80088, -0xDD7C0034,0xDB780024,0xD96C0033,0xD400008C,0xFFA00016,0xFDA80056,0xFFAC0061,0xF59C0001,0xE79C0002,0xE19C0002,0xDF9C0002,0xDD9C0002,0xFF9C0013,0xF1980001,0xDF980009,0xDB8C0013,0xB3FC0088,0x1B00034,0xEDAC0008,0xE1AC0009,0xDDAC0009,0xE9A80012,0xE1A80001,0xDFA80001,0xDFA80012,0xDDA80005,0xDBA80012,0x8DFC0033,0xE3A00009,0xDDA40009,0xE19C0012,0xDD9C0002, -0xDBA00012,0xC7FC0033,0xDD900009,0xDB8C0012,0xD8000033,0x8DFC0033,0xE3A00009,0xDDA40009,0xE19C0012,0xDD9C0002,0xDBA00012,0xC7FC0033,0xDD900009,0xDB8C0012,0xD8000033,0xC7FC0033,0xDD900009,0xDB8C0012,0xD8000033,0xD8000033,0xF9A80009,0xFFAC001E,0xF1B00020,0xEFA00001,0xE79C0002,0xE19C0002,0xDDA00002,0xDD980002,0xFDA00009,0xED9C0001,0xDF980008,0xDB8C0012, -0xB9FC0033,0x1A80034,0x1A80034,0x1A80034,0x1A80034,0xE3A40014,0xE3A40014,0xE3A40014,0xDBA40014,0xDBA40014,0xD7A00015,0xE59C0008,0xE59C0008,0xE59C0008,0xDD9C0001,0xDD9C0001,0xD99C0005,0xD99C000A,0xD99C000A,0xD79C0001,0xD59C000A,0x7BFC0033,0x7BFC0033,0x7BFC0033,0xDF940012,0xDF940012,0xD79C0013,0xDD900009,0xDD900009,0xD7940002,0xD594000A,0xBFF80033, -0xBFF80033,0xD7880013,0xD580000A,0xD2000033,0xFFA00006,0xFBA4001D,0x1A80034,0xF39C0001,0xE79C0001,0xE19C0001,0xDF9C0001,0xDB9C0001,0xFB980006,0xF1980000,0xDD9C0009,0xD7940002,0xADFC0033,0x1AC0008,0x1AC0008,0x1AC0008,0x1AC0008,0xDFA80000,0xDFA80000,0xDFA80000,0xD9A80001,0xD9A80001,0xD7A80001,0x83FC0008,0x83FC0008,0x83FC0008,0xD9A40001,0xD9A40001, -0xD7A40001,0xC3F80008,0xC3F80008,0xD79C0001,0xD400000A,0x83FC0008,0x83FC0008,0x83FC0008,0xD9A40001,0xD9A40001,0xD7A40001,0xC3F80008,0xC3F80008,0xD79C0001,0xD400000A,0xC3F80008,0xC3F80008,0xD79C0001,0xD400000A,0xD400000A,0xF3A40001,0xFDA80001,0x1AC0008,0xEDA00000,0xE3A00000,0xDFA00000,0xDBA40001,0xDBA00000,0xF5A00001,0xEB9C0000,0xB3FC0008,0xD79C0001, -0xB3FC0008,0x1B40014,0xE9B00000,0xE1B00000,0xDDB00001,0x93FC0012,0xE1A80001,0xDDAC0001,0xCBF80012,0xDD9C0001,0xDA000012,0x93FC0012,0xE1A80001,0xDDAC0001,0xCBF80012,0xDD9C0001,0xDA000012,0xCBF80012,0xDD9C0001,0xDA000012,0xDA000012,0x93FC0012,0xE1A80001,0xDDAC0001,0xCBF80012,0xDD9C0001,0xDA000012,0xCBF80012,0xDD9C0001,0xDA000012,0xDA000012,0xCBF80012, -0xDD9C0001,0xDA000012,0xDA000012,0xDA000012,0xF9A80005,0x1D40012,0xF3B40008,0xF59C0000,0xE79C0001,0xE3980000,0xDDA00001,0xDD940001,0xFBA40005,0xF5940000,0xDFA00000,0xDA000012,0xBDF80012,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0x1A00014,0xDF9C0000,0xDF9C0000,0xDF9C0000,0xDF9C0000,0xDF9C0000, 
-0xDF9C0000,0xD79C0000,0xD79C0000,0xD79C0000,0xD39C0001,0x75FC0012,0x75FC0012,0x75FC0012,0x75FC0012,0x75FC0012,0x75FC0012,0xD7940001,0xD7940001,0xD7940001,0xD3980001,0xBBFC0012,0xBBFC0012,0xBBFC0012,0xD3880001,0xD0000012,0xF9A00005,0x1A00014,0x1A00014,0xF79C0000,0xEB9C0000,0xE39C0000,0xE39C0000,0xDD9C0000,0xFB980002,0xF1980000,0xD99C0001,0xD7940001, -0xA9FC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table27[] = { -0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0xC3F80000, -0xC3F80000,0xC3F80000,0xC3F80000,0xD4000001,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1C80000,0x1C80000,0x1C80000,0x83FC0000,0xB3FC0000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x95FC0000,0x95FC0000,0x95FC0000,0x95FC0000,0x95FC0000, -0x95FC0000,0xCBFC0000,0xCBFC0000,0xCBFC0000,0xDA000001,0x95FC0000,0x95FC0000,0x95FC0000,0x95FC0000,0x95FC0000,0x95FC0000,0xCBFC0000,0xCBFC0000,0xCBFC0000,0xDA000001,0xCBFC0000,0xCBFC0000,0xCBFC0000,0xDA000001,0xDA000001,0x3D40000,0x1B80000,0x1B80000,0x9FC0000,0x55FC0000,0x7DFC0000,0x7DFC0000,0xABFC0000,0x9FC0000,0x55FC0000,0xBFF80000,0xCBFC0000, -0xBFF80000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xD1FC0000,0xD1FC0000,0xDE000001,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xD1FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xD1FC0000,0xDE000001,0xDE000001,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xD1FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xD1FC0000,0xDE000001,0xDE000001,0xD1FC0000, -0xD1FC0000,0xDE000001,0xDE000001,0xDE000001,0x27FC0000,0x7DC0000,0x1C00000,0x8DFC0000,0xB3FC0000,0xC5FC0000,0xCBFC0000,0xD5FC0000,0x69FC0000,0xA3FC0000,0xC5FC0000,0xDE000001,0xC5FC0000,0x1CC0000,0xB5FC0000,0xDBF80000,0xE4000001,0xB5FC0000,0xDBF80000,0xE4000001,0xDBF80000,0xE4000001,0xE4000001,0xB5FC0000,0xDBF80000,0xE4000001,0xDBF80000,0xE4000001, -0xE4000001,0xDBF80000,0xE4000001,0xE4000001,0xE4000001,0xB5FC0000,0xDBF80000,0xE4000001,0xDBF80000,0xE4000001,0xE4000001,0xDBF80000,0xE4000001,0xE4000001,0xE4000001,0xDBF80000,0xE4000001,0xE4000001,0xE4000001,0xE4000001,0x87FC0000,0xDE80000,0xDE80000,0xC3FC0000,0xD5FC0000,0xE1F00000,0xE4000001,0xE4000001,0xA3FC0000,0xCBFC0000,0xE5DC0000,0xE4000001, -0xD1FC0000,0x1BC0088,0xF9B80034,0xEBB80034,0xE5B80035,0xF5B40026,0xEBB40015,0xE5B40019,0xE7B40026,0xE5B00016,0xE3B00026,0xF7AC0033,0xEDAC0009,0xE7B0000F,0xE9AC0012,0xE5AC0002,0xE3AC0016,0xE7AC0033,0xE5A8000F,0xE3A8001A,0xE1AC0033,0x9BFC0088,0xF1A00033,0xE5AC0034,0xEBA00026,0xE7A40013,0xE3A80024,0xEB980033,0xE5A0000A,0xE39C0013,0xE1A00033,0xCFF80088, -0xE58C0034,0xE3880024,0xE17C0033,0xDC00008C,0xFFB40021,0xF5B80068,0xF7BC006C,0xFDAC0001,0xEFAC0002,0xE9AC0002,0xE7AC0002,0xE5AC0002,0xFDB00021,0xF9A80001,0xE7A80009,0xE39C0013,0xC1FC0088,0x1C00034,0xF5BC0008,0xE9BC0009,0xE5BC0009,0xF1B80012,0xE9B80001,0xE7B80001,0xE7B80012,0xE5B80005,0xE3B80012,0xA5FC0033,0xEBB00009,0xE5B40009,0xE9AC0012,0xE5AC0002, -0xE3B00012,0xD3FC0033,0xE5A00009,0xE39C0012,0xE0000033,0xA5FC0033,0xEBB00009,0xE5B40009,0xE9AC0012,0xE5AC0002,0xE3B00012,0xD3FC0033,0xE5A00009,0xE39C0012,0xE0000033,0xD3FC0033,0xE5A00009,0xE39C0012,0xE0000033,0xE0000033,0xFDB8000E,0xF9C00024,0xF9C00020,0xF7B00001,0xEFAC0002,0xE9AC0002,0xE5B00002,0xE5A80002,0xFDB00011,0xF5AC0001,0xE7A80008,0xE39C0012, 
-0xC7FC0033,0x1B80034,0x1B80034,0x1B80034,0x1B80034,0xEBB40014,0xEBB40014,0xEBB40014,0xE3B40014,0xE3B40014,0xDFB00015,0xEDAC0008,0xEDAC0008,0xEDAC0008,0xE5AC0001,0xE5AC0001,0xE1AC0005,0xE1AC000A,0xE1AC000A,0xDFAC0001,0xDDAC000A,0x93FC0033,0x93FC0033,0x93FC0033,0xE7A40012,0xE7A40012,0xDFAC0013,0xE5A00009,0xE5A00009,0xDFA40002,0xDDA4000A,0xCBF80033, -0xCBF80033,0xDF980013,0xDD90000A,0xDA000033,0xF9B00011,0xF3B40024,0x1B80034,0xFBAC0001,0xEFAC0001,0xE9AC0001,0xE7AC0001,0xE3AC0001,0xFBAC0009,0xF9A80000,0xE5AC0009,0xDFA40002,0xBDF80033,0x1BC0008,0x1BC0008,0x1BC0008,0x1BC0008,0xE7B80000,0xE7B80000,0xE7B80000,0xE1B80001,0xE1B80001,0xDFB80001,0x9BFC0008,0x9BFC0008,0x9BFC0008,0xE1B40001,0xE1B40001, -0xDFB40001,0xCFF80008,0xCFF80008,0xDFAC0001,0xDC00000A,0x9BFC0008,0x9BFC0008,0x9BFC0008,0xE1B40001,0xE1B40001,0xDFB40001,0xCFF80008,0xCFF80008,0xDFAC0001,0xDC00000A,0xCFF80008,0xCFF80008,0xDFAC0001,0xDC00000A,0xDC00000A,0xFBB40001,0xF5B80004,0x1BC0008,0xF5B00000,0xEBB00000,0xE7B00000,0xE3B40001,0xE3B00000,0xFDB00001,0xF3AC0000,0xC1FC0008,0xDFAC0001, -0xC1FC0008,0x1C40014,0xF1C00000,0xE9C00000,0xE5C00001,0xABFC0012,0xE9B80001,0xE5BC0001,0xD7F80012,0xE5AC0001,0xE2000012,0xABFC0012,0xE9B80001,0xE5BC0001,0xD7F80012,0xE5AC0001,0xE2000012,0xD7F80012,0xE5AC0001,0xE2000012,0xE2000012,0xABFC0012,0xE9B80001,0xE5BC0001,0xD7F80012,0xE5AC0001,0xE2000012,0xD7F80012,0xE5AC0001,0xE2000012,0xE2000012,0xD7F80012, -0xE5AC0001,0xE2000012,0xE2000012,0xE2000012,0xFBBC0008,0x1E40012,0xFBC40008,0xFDAC0000,0xEFAC0001,0xEBA80000,0xE5B00001,0xE5A40001,0xF3BC0008,0xFDA40000,0xE7B00000,0xE2000012,0xCBFC0012,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0x1B00014,0xE7AC0000,0xE7AC0000,0xE7AC0000,0xE7AC0000,0xE7AC0000, -0xE7AC0000,0xDFAC0000,0xDFAC0000,0xDFAC0000,0xDBAC0001,0x8DFC0012,0x8DFC0012,0x8DFC0012,0x8DFC0012,0x8DFC0012,0x8DFC0012,0xDFA40001,0xDFA40001,0xDFA40001,0xDBA80001,0xC7FC0012,0xC7FC0012,0xC7FC0012,0xDB980001,0xD8000012,0xF1B00008,0x1B00014,0x1B00014,0xFFAC0000,0xF3AC0000,0xEBAC0000,0xEBAC0000,0xE5AC0000,0xF7AC0005,0xF9A80000,0xE1AC0001,0xDFA40001, -0xB9FC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table28[] = { -0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0xD1F80000, -0xD1F80000,0xD1F80000,0xD1F80000,0xDE000000,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1DC0000,0x1DC0000,0x1DC0000,0x9FFC0000,0xC3FC0000,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000, -0xB1FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xE4000000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xE4000000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xE4000000,0xE4000000,0x1E80000,0x1C80001,0x1C80001,0x49FC0000,0x81FC0000,0x9FFC0000,0x9FFC0000,0xC1FC0000,0x49FC0000,0x81FC0000,0xCFFC0000,0xD9FC0000, -0xCFFC0000,0x1D00001,0x1D00001,0x1D00001,0x1D00001,0xBDFC0000,0xBDFC0000,0xBDFC0000,0xDFF80000,0xDFF80000,0xE8000000,0xBDFC0000,0xBDFC0000,0xBDFC0000,0xDFF80000,0xDFF80000,0xE8000000,0xDFF80000,0xDFF80000,0xE8000000,0xE8000000,0xBDFC0000,0xBDFC0000,0xBDFC0000,0xDFF80000,0xDFF80000,0xE8000000,0xDFF80000,0xDFF80000,0xE8000000,0xE8000000,0xDFF80000, 
-0xDFF80000,0xE8000000,0xE8000000,0xE8000000,0x67FC0000,0x1F00000,0x1D00001,0xAFFC0000,0xCBFC0000,0xD7FC0000,0xDBFC0000,0xE1FC0000,0x95FC0000,0xBDFC0000,0xD7FC0000,0xE8000000,0xD7FC0000,0x1DC0001,0xCFFC0000,0xE7FC0000,0xEE000000,0xCFFC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xCFFC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xEE000000, -0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0xCFFC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0xEE000000,0xB3FC0000,0x7FC0000,0x7FC0000,0xD9FC0000,0xE5FC0000,0xEBFC0000,0xEE000000,0xEE000000,0xC5FC0000,0xDDFC0000,0xEFD00000,0xEE000000, -0xE1FC0000,0x1CC008C,0xFFC80037,0xF5C80033,0xEFC80033,0xFFC40024,0xF5C40013,0xF1C4001A,0xF1C40024,0xEDC40016,0xEBC40026,0xFDC00034,0xF5C0000A,0xF1C0000F,0xF3BC0013,0xEFBC0002,0xEBC00016,0xEFC00034,0xEDBC000F,0xEBBC0019,0xE9BC0035,0xB7FC0088,0xF9B40033,0xEFBC0033,0xF3B40026,0xEFB80012,0xEBBC0026,0xF5A80033,0xEFAC0009,0xEBB00015,0xE9B40034,0xDDF40088, -0xEF980033,0xEBA00026,0xE9940034,0xE6000088,0xFDC8003F,0xFFCC0064,0xFFCC006C,0xFFC00007,0xF7C00002,0xF1C00002,0xEFC00002,0xEFBC0002,0xFFC40034,0xFFBC0005,0xEFBC0009,0xEBB00015,0xD3FC0088,0x1D40033,0xFDD0000A,0xF3CC000A,0xEFCC000A,0xF7CC0013,0xF3C80002,0xEFCC0001,0xEFCC0013,0xEFC80005,0xEBC80015,0xC1FC0033,0xF5C00009,0xEFC4000A,0xF3BC0012,0xEFBC0001, -0xEBC40014,0xE1F80033,0xEFAC0008,0xEBB00014,0xE8000034,0xC1FC0033,0xF5C00009,0xEFC4000A,0xF3BC0012,0xEFBC0001,0xEBC40014,0xE1F80033,0xEFAC0008,0xEBB00014,0xE8000034,0xE1F80033,0xEFAC0008,0xEBB00014,0xE8000034,0xE8000034,0xFDCC0014,0xF1D0002D,0xF3D4002A,0xFFC40003,0xF7C00001,0xF1C00001,0xEFC00001,0xEFBC0001,0xF7CC0019,0xFFC00002,0xEFBC0009,0xEBB00014, -0xD9FC0033,0x1C80033,0x1C80033,0x1C80033,0x1C80033,0xF7C40012,0xF7C40012,0xF7C40012,0xEDC40012,0xEDC40012,0xE9C40012,0xF5C00009,0xF5C00009,0xF5C00009,0xEDC00002,0xEDC00002,0xE9C00005,0xEBBC0009,0xEBBC0009,0xE9BC0001,0xE7BC0009,0xAFFC0033,0xAFFC0033,0xAFFC0033,0xEFB80012,0xEFB80012,0xE9BC0012,0xEDB40009,0xEDB40009,0xE9B40001,0xE7B40009,0xD7FC0033, -0xD7FC0033,0xE9A80012,0xE79C0008,0xE4000034,0xFBC40013,0xFDC80023,0x1C80033,0xFDC00003,0xF7C00001,0xF1C00001,0xEFC00002,0xEDBC0002,0xFDC00013,0xFFBC0001,0xEFBC0008,0xE9B40001,0xCDFC0033,0x1CC000A,0x1CC000A,0x1CC000A,0x1CC000A,0xEFCC0001,0xEFCC0001,0xEFCC0001,0xEBC80001,0xEBC80001,0xE9C80001,0xB7FC0008,0xB7FC0008,0xB7FC0008,0xEBC40001,0xEBC40001, -0xE9C40001,0xDDF40008,0xDDF40008,0xE9B80000,0xE6000008,0xB7FC0008,0xB7FC0008,0xB7FC0008,0xEBC40001,0xEBC40001,0xE9C40001,0xDDF40008,0xDDF40008,0xE9B80000,0xE6000008,0xDDF40008,0xDDF40008,0xE9B80000,0xE6000008,0xE6000008,0xFDC80002,0xFFCC0002,0x1CC000A,0xF5C80001,0xF3C40000,0xEFC40000,0xEDC40000,0xEBC40001,0xFFC40002,0xFBC00000,0xD3FC0008,0xE9B80000, -0xD3FC0008,0x1D80012,0xF9D40001,0xF1D40001,0xEFD00001,0xC7FC0012,0xF3C80001,0xEFCC0000,0xE3FC0012,0xEFB80000,0xEA000014,0xC7FC0012,0xF3C80001,0xEFCC0000,0xE3FC0012,0xEFB80000,0xEA000014,0xE3FC0012,0xEFB80000,0xEA000014,0xEA000014,0xC7FC0012,0xF3C80001,0xEFCC0000,0xE3FC0012,0xEFB80000,0xEA000014,0xE3FC0012,0xEFB80000,0xEA000014,0xEA000014,0xE3FC0012, -0xEFB80000,0xEA000014,0xEA000014,0xEA000014,0xFFD00008,0x1F80012,0xF5D8000D,0xFFC40002,0xF7C00001,0xF3BC0000,0xEFC00000,0xEFB00000,0xF9D00008,0xFFC00001,0xEFC40001,0xEA000014,0xDDF80012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0x1C40012,0xEFC00001,0xEFC00001,0xEFC00001,0xEFC00001,0xEFC00001, 
-0xEFC00001,0xE7C00001,0xE7C00001,0xE7C00001,0xE5BC0001,0xA9FC0012,0xA9FC0012,0xA9FC0012,0xA9FC0012,0xA9FC0012,0xA9FC0012,0xE9B40001,0xE9B40001,0xE9B40001,0xE5B80000,0xD5F80012,0xD5F80012,0xD5F80012,0xE5A40000,0xE0000014,0xF9C0000A,0x1C40012,0x1C40012,0xF9C00002,0xF7C00001,0xF1C00001,0xF1C00001,0xEDC00001,0xFFBC0005,0xFFBC0001,0xEBBC0000,0xE9B40001, -0xC9FC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table29[] = { -0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000, -0xDDF40000,0xDDF40000,0xDDF40000,0xE6000000,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1EC0000,0x1EC0000,0x1EC0000,0xB7FC0000,0xD3FC0000,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000, -0xC9FC0000,0xE5F80000,0xE5F80000,0xE5F80000,0xEC000000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xE5F80000,0xE5F80000,0xE5F80000,0xEC000000,0xE5F80000,0xE5F80000,0xE5F80000,0xEC000000,0xEC000000,0x5F80000,0x1D80001,0x1D80001,0x83FC0000,0xA9FC0000,0xBDFC0000,0xBDFC0000,0xD3FC0000,0x83FC0000,0xA9FC0000,0xDFF80000,0xE5F80000, -0xDFF80000,0x1E00001,0x1E00001,0x1E00001,0x1E00001,0xD5FC0000,0xD5FC0000,0xD5FC0000,0xEBF80000,0xEBF80000,0xF0000000,0xD5FC0000,0xD5FC0000,0xD5FC0000,0xEBF80000,0xEBF80000,0xF0000000,0xEBF80000,0xEBF80000,0xF0000000,0xF0000000,0xD5FC0000,0xD5FC0000,0xD5FC0000,0xEBF80000,0xEBF80000,0xF0000000,0xEBF80000,0xEBF80000,0xF0000000,0xF0000000,0xEBF80000, -0xEBF80000,0xF0000000,0xF0000000,0xF0000000,0x9FFC0000,0x27FC0000,0x1E00001,0xCDFC0000,0xDDFC0000,0xE5FC0000,0xE9F80000,0xEDF80000,0xBDFC0000,0xD5FC0000,0xE5FC0000,0xF0000000,0xE5FC0000,0x1EC0001,0xE9FC0000,0xF3FC0000,0xF6000000,0xE9FC0000,0xF3FC0000,0xF6000000,0xF3FC0000,0xF6000000,0xF6000000,0xE9FC0000,0xF3FC0000,0xF6000000,0xF3FC0000,0xF6000000, -0xF6000000,0xF3FC0000,0xF6000000,0xF6000000,0xF6000000,0xE9FC0000,0xF3FC0000,0xF6000000,0xF3FC0000,0xF6000000,0xF6000000,0xF3FC0000,0xF6000000,0xF6000000,0xF6000000,0xF3FC0000,0xF6000000,0xF6000000,0xF6000000,0xF6000000,0xDBFC0000,0x87FC0000,0x87FC0000,0xEDFC0000,0xF3F80000,0xF5FC0000,0xF6000000,0xF6000000,0xE3FC0000,0xEFFC0000,0xF7E00000,0xF6000000, -0xF1FC0000,0x1DC008C,0xFFDC0044,0xFDD80033,0xF7D80033,0xFDD80034,0xFDD40013,0xF9D4001A,0xF9D40024,0xF5D40016,0xF3D40026,0xFFD0003C,0xFDD0000A,0xF9D0000F,0xFBCC0013,0xF7CC0002,0xF3D00016,0xF7D00034,0xF5CC000F,0xF3CC0019,0xF1CC0035,0xCFFC0088,0xFFC80035,0xF7CC0033,0xFBC40026,0xF7C80012,0xF3CC0026,0xFDB80033,0xF7BC0009,0xF3C00015,0xF1C40034,0xE7FC0088, -0xF7A80033,0xF3B00026,0xF1A40034,0xEE000088,0xFFD8004E,0xF7DC0076,0xF7DC007B,0xFFD4001A,0xFFD00002,0xF9D00002,0xF7D00002,0xF7CC0002,0xFFD4004A,0xFFD00015,0xF7CC0009,0xF3C00015,0xE1FC0088,0x1E40033,0xFFE0000E,0xFBDC000A,0xF7DC000A,0xFFDC0013,0xFBD80002,0xF7DC0001,0xF7DC0013,0xF7D80005,0xF3D80015,0xD9FC0033,0xFDD00009,0xF7D4000A,0xFBCC0012,0xF7CC0001, -0xF3D40014,0xEDF80033,0xF7BC0008,0xF3C00014,0xF0000034,0xD9FC0033,0xFDD00009,0xF7D4000A,0xFBCC0012,0xF7CC0001,0xF3D40014,0xEDF80033,0xF7BC0008,0xF3C00014,0xF0000034,0xEDF80033,0xF7BC0008,0xF3C00014,0xF0000034,0xF0000034,0xFDE00021,0xF9E0002D,0xFBE4002A,0xFFD8000D,0xFFD00001,0xF9D00001,0xF7D00001,0xF7CC0001,0xFFDC0019,0xFFD4000A,0xF7CC0009,0xF3C00014, 
-0xE7FC0033,0x1D80033,0x1D80033,0x1D80033,0x1D80033,0xFFD40012,0xFFD40012,0xFFD40012,0xF5D40012,0xF5D40012,0xF1D40012,0xFDD00009,0xFDD00009,0xFDD00009,0xF5D00002,0xF5D00002,0xF1D00005,0xF3CC0009,0xF3CC0009,0xF1CC0001,0xEFCC0009,0xC7FC0033,0xC7FC0033,0xC7FC0033,0xF7C80012,0xF7C80012,0xF1CC0012,0xF5C40009,0xF5C40009,0xF1C40001,0xEFC40009,0xE3FC0033, -0xE3FC0033,0xF1B80012,0xEFAC0008,0xEC000034,0xFDD4001D,0xF5D8002A,0x1D80033,0xFFD4000A,0xFFD00001,0xF9D00001,0xF7D00002,0xF5CC0002,0xFFD00018,0xFFD00005,0xF7CC0008,0xF1C40001,0xDDF80033,0x1DC000A,0x1DC000A,0x1DC000A,0x1DC000A,0xF7DC0001,0xF7DC0001,0xF7DC0001,0xF3D80001,0xF3D80001,0xF1D80001,0xCFFC0008,0xCFFC0008,0xCFFC0008,0xF3D40001,0xF3D40001, -0xF1D40001,0xE7FC0008,0xE7FC0008,0xF1C80000,0xEE000008,0xCFFC0008,0xCFFC0008,0xCFFC0008,0xF3D40001,0xF3D40001,0xF1D40001,0xE7FC0008,0xE7FC0008,0xF1C80000,0xEE000008,0xE7FC0008,0xE7FC0008,0xF1C80000,0xEE000008,0xEE000008,0xFFD80004,0xF7DC0005,0x1DC000A,0xFDD80001,0xFBD40000,0xF7D40000,0xF5D40000,0xF3D40001,0xF1DC0005,0xFBD40001,0xE1FC0008,0xF1C80000, -0xE1FC0008,0x1E80012,0xFDE40002,0xF9E40001,0xF7E00001,0xDFFC0012,0xFBD80001,0xF7DC0000,0xEFFC0012,0xF7C80000,0xF2000014,0xDFFC0012,0xFBD80001,0xF7DC0000,0xEFFC0012,0xF7C80000,0xF2000014,0xEFFC0012,0xF7C80000,0xF2000014,0xF2000014,0xDFFC0012,0xFBD80001,0xF7DC0000,0xEFFC0012,0xF7C80000,0xF2000014,0xEFFC0012,0xF7C80000,0xF2000014,0xF2000014,0xEFFC0012, -0xF7C80000,0xF2000014,0xF2000014,0xF2000014,0xF7E8000D,0x57FC0012,0xFDE8000D,0xFDE00008,0xFFD00001,0xFBCC0000,0xF7D00000,0xF7C00000,0xF9E4000D,0xFFD80005,0xF7D40001,0xF2000014,0xEBFC0012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0x1D40012,0xF7D00001,0xF7D00001,0xF7D00001,0xF7D00001,0xF7D00001, -0xF7D00001,0xEFD00001,0xEFD00001,0xEFD00001,0xEDCC0001,0xC1FC0012,0xC1FC0012,0xC1FC0012,0xC1FC0012,0xC1FC0012,0xC1FC0012,0xF1C40001,0xF1C40001,0xF1C40001,0xEDC80000,0xE1F80012,0xE1F80012,0xE1F80012,0xEDB40000,0xE8000014,0xF3D4000D,0x1D40012,0x1D40012,0xFBD00005,0xFFD00001,0xF9D00001,0xF9D00001,0xF5D00001,0xFBD00008,0xFDCC0004,0xF3CC0000,0xF1C40001, -0xD9FC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table30[] = { -0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xE7FC0000, -0xE7FC0000,0xE7FC0000,0xE7FC0000,0xEE000000,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0x7FC0000,0x7FC0000,0x7FC0000,0xCFFC0000,0xE1FC0000,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xE3FC0000, -0xE3FC0000,0xF1F80000,0xF1F80000,0xF1F80000,0xF4000000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xF1F80000,0xF1F80000,0xF1F80000,0xF4000000,0xF1F80000,0xF1F80000,0xF1F80000,0xF4000000,0xF4000000,0x67FC0000,0x1E80001,0x1E80001,0xBBFC0000,0xD1FC0000,0xDBFC0000,0xDBFC0000,0xE7FC0000,0xBBFC0000,0xD1FC0000,0xEDFC0000,0xF1F80000, -0xEDFC0000,0x1F00001,0x1F00001,0x1F00001,0x1F00001,0xEFFC0000,0xEFFC0000,0xEFFC0000,0xF7F80000,0xF7F80000,0xF8000000,0xEFFC0000,0xEFFC0000,0xEFFC0000,0xF7F80000,0xF7F80000,0xF8000000,0xF7F80000,0xF7F80000,0xF8000000,0xF8000000,0xEFFC0000,0xEFFC0000,0xEFFC0000,0xF7F80000,0xF7F80000,0xF8000000,0xF7F80000,0xF7F80000,0xF8000000,0xF8000000,0xF7F80000, 
-0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xD7FC0000,0xA7FC0000,0x1F00001,0xEBFC0000,0xF1FC0000,0xF5FC0000,0xF5FC0000,0xF7FC0000,0xE3FC0000,0xEFFC0000,0xF5FC0000,0xF8000000,0xF5FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1EC008C,0xFFEC005F,0xFFE80043,0xFFE80033,0xFFE8004C,0xFFE4002B,0xFFE4001B,0xFFE40026,0xFDE40016,0xFBE40026,0xFFE80054,0xFFE40023,0xFFE00011,0xFFE0001B,0xFFDC0002,0xFBE00016,0xFFE00034,0xFDDC000F,0xFBDC0019,0xF9DC0035,0xE9FC0088,0xFFE4004B,0xFFDC0033,0xFFD80036,0xFFD80012,0xFBDC0026,0xFFD8003D,0xFFCC0009,0xFBD00015,0xF9D40034,0xF3FC0088, -0xFFB80033,0xFBC00026,0xF9B40034,0xF6000088,0xFFEC006A,0xFFEC0076,0xFFEC007B,0xFFE8004F,0xFFE40026,0xFFE0000C,0xFFE00002,0xFFDC0002,0xFDEC006A,0xFFE4004A,0xFFDC0009,0xFBD00015,0xF1FC0088,0x1F40033,0xFDF00023,0xFFF0000E,0xFFEC000A,0xFDF00023,0xFFEC000A,0xFFEC0001,0xFFEC0013,0xFFE80005,0xFBE80015,0xF1FC0033,0xFFEC0019,0xFFE4000A,0xFFE40015,0xFFDC0001, -0xFBE40014,0xF9F80033,0xFFCC0008,0xFBD00014,0xF8000034,0xF1FC0033,0xFFEC0019,0xFFE4000A,0xFFE40015,0xFFDC0001,0xFBE40014,0xF9F80033,0xFFCC0008,0xFBD00014,0xF8000034,0xF9F80033,0xFFCC0008,0xFBD00014,0xF8000034,0xF8000034,0xFDF4002A,0xF3F40033,0xF3F40033,0xFFEC001E,0xFFE80012,0xFFE40009,0xFFE00001,0xFFDC0001,0xFFF00029,0xFFF00021,0xFFDC0009,0xFBD00014, -0xF7FC0033,0x1E80033,0x1E80033,0x1E80033,0x1E80033,0xFDE4001B,0xFDE4001B,0xFDE4001B,0xFDE40012,0xFDE40012,0xF9E40012,0xFFE00011,0xFFE00011,0xFFE00011,0xFDE00002,0xFDE00002,0xF9E00005,0xFBDC0009,0xFBDC0009,0xF9DC0001,0xF7DC0009,0xDFFC0033,0xDFFC0033,0xDFFC0033,0xFFD80012,0xFFD80012,0xF9DC0012,0xFDD40009,0xFDD40009,0xF9D40001,0xF7D40009,0xEFFC0033, -0xEFFC0033,0xF9C80012,0xF7BC0008,0xF4000034,0xFFE80022,0xFDE8002A,0x1E80033,0xFDE4001A,0xFFE4000D,0xFFE00003,0xFFE00002,0xFDDC0002,0xFFE40021,0xFFE00018,0xFFDC0008,0xF9D40001,0xEBFC0033,0x1EC000A,0x1EC000A,0x1EC000A,0x1EC000A,0xFFEC0001,0xFFEC0001,0xFFEC0001,0xFBE80001,0xFBE80001,0xF9E80001,0xE9FC0008,0xE9FC0008,0xE9FC0008,0xFBE40001,0xFBE40001, -0xF9E40001,0xF3FC0008,0xF3FC0008,0xF9D80000,0xF6000008,0xE9FC0008,0xE9FC0008,0xE9FC0008,0xFBE40001,0xFBE40001,0xF9E40001,0xF3FC0008,0xF3FC0008,0xF9D80000,0xF6000008,0xF3FC0008,0xF3FC0008,0xF9D80000,0xF6000008,0xF6000008,0xFBEC0005,0xFFEC0005,0x1EC000A,0xF9EC0005,0xFDE80002,0xFFE40000,0xFDE40000,0xFBE40001,0xF9EC0005,0xFFE80002,0xF1FC0008,0xF9D80000, -0xF1FC0008,0x1F80012,0xFFF4000A,0xFFF00005,0xFFF00001,0xF7FC0012,0xFFF00005,0xFFEC0000,0xFBFC0012,0xFFD80000,0xFA000014,0xF7FC0012,0xFFF00005,0xFFEC0000,0xFBFC0012,0xFFD80000,0xFA000014,0xFBFC0012,0xFFD80000,0xFA000014,0xFA000014,0xF7FC0012,0xFFF00005,0xFFEC0000,0xFBFC0012,0xFFD80000,0xFA000014,0xFBFC0012,0xFFD80000,0xFA000014,0xFA000014,0xFBFC0012, -0xFFD80000,0xFA000014,0xFA000014,0xFA000014,0xFFF8000D,0xD7FC0012,0xF5F80012,0xFFF4000D,0xFDF4000D,0xFFE80005,0xFFE00000,0xFFD00000,0xF5FC0012,0xFFF00011,0xFFE40001,0xFA000014,0xFBFC0012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0x1E40012,0xFFE00001,0xFFE00001,0xFFE00001,0xFFE00001,0xFFE00001, 
-0xFFE00001,0xF7E00001,0xF7E00001,0xF7E00001,0xF5DC0001,0xD9FC0012,0xD9FC0012,0xD9FC0012,0xD9FC0012,0xD9FC0012,0xD9FC0012,0xF9D40001,0xF9D40001,0xF9D40001,0xF5D80000,0xEDF80012,0xEDF80012,0xEDF80012,0xF5C40000,0xF0000014,0xFBE4000D,0x1E40012,0x1E40012,0xFBE0000A,0xFDE00005,0xFFE00002,0xFFE00002,0xFDE00001,0xF7E4000D,0xFDE00008,0xFBDC0000,0xF9D40001, -0xE7FC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table31[] = { -0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xF3FC0000, -0xF3FC0000,0xF3FC0000,0xF3FC0000,0xF6000000,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0x87FC0000,0x87FC0000,0x87FC0000,0xE9FC0000,0xF1FC0000,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000, -0xFBFC0000,0xFDF80000,0xFDF80000,0xFDF80000,0xFC000000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFDF80000,0xFDF80000,0xFDF80000,0xFC000000,0xFDF80000,0xFDF80000,0xFDF80000,0xFC000000,0xFC000000,0xE7FC0000,0x1F80001,0x1F80001,0xF5FC0000,0xF7FC0000,0xF9FC0000,0xF9FC0000,0xFBFC0000,0xF5FC0000,0xF7FC0000,0xFDF80000,0xFDF80000, -0xFDF80000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1F8002C,0xFFF80027,0xFFF80024,0xFFF80023,0xFFF80022,0xFFF4001F,0xFFF4001B,0xFFF4001A,0xFFF40016,0xFFF40012,0xFFF4001C,0xFFF40017,0xFFF40013,0xFFF00012,0xFFF0000E,0xFFF0000A,0xFFF00009,0xFFF00005,0xFFF00001,0xFFEC0005,0xF7FC002C,0xFDF80027,0xFFF40023,0xFFF0001A,0xFFF00016,0xFFF00012,0xFFF00011,0xFFF0000D,0xFFE40005,0xFFE40005,0xFBFC002C, -0xFFEC0023,0xFFE00012,0xFFCC0004,0xFC00002C,0xFFF80027,0xF5F8002C,0xF5F8002C,0xFFF80022,0xFFF4001A,0xFFF40012,0xFFF40011,0xFFF00009,0xFFF80022,0xFFF4001F,0xFFF0000B,0xFFE40005,0xFBFC002C,0x1FC0003,0xFDFC0003,0xFFFC0002,0xFFFC0002,0xFDFC0003,0xFFFC0002,0xFFFC0002,0xFFFC0001,0xFFFC0001,0xFFF80001,0xFDFC0003,0xFFFC0002,0xFFFC0002,0xFFFC0001,0xFFF80001, -0xFFF80000,0xFFF80003,0xFFF80002,0xFFF00000,0xFE000004,0xFDFC0003,0xFFFC0002,0xFFFC0002,0xFFFC0001,0xFFF80001,0xFFF80000,0xFFF80003,0xFFF80002,0xFFF00000,0xFE000004,0xFFF80003,0xFFF80002,0xFFF00000,0xFE000004,0xFE000004,0xFDFC0003,0xF7FC0003,0xF7FC0003,0xFDFC0003,0xFFFC0002,0xFFFC0001,0xFFF80001,0xFFF80001,0xFDFC0003,0xFDFC0003,0xFFF80002,0xFFF00000, 
-0xFFF80003,0x1F80023,0x1F80023,0x1F80023,0x1F80023,0xFFF4001B,0xFFF4001B,0xFFF4001B,0xFFF40016,0xFFF40016,0xFFF40012,0xFFF40013,0xFFF40013,0xFFF40013,0xFFF0000E,0xFFF0000E,0xFFF0000A,0xFFF00005,0xFFF00005,0xFFF00001,0xFDEC0005,0xF7FC0023,0xF7FC0023,0xF7FC0023,0xFFF00016,0xFFF00016,0xFFF00012,0xFFF0000D,0xFFF0000D,0xFFE40005,0xFDE80004,0xFBFC0023, -0xFBFC0023,0xFFE00012,0xFDD40004,0xFA000024,0xFFF40022,0xF5F80023,0x1F80023,0xFFF4001D,0xFFF40016,0xFFF40011,0xFFF40011,0xFFF00009,0xFFF4001D,0xFFF40016,0xFFF0000B,0xFFE40005,0xFBFC0023,0x1FC0002,0x1FC0002,0x1FC0002,0x1FC0002,0xFDFC0002,0xFDFC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFF80001,0xFDFC0002,0xFDFC0002,0xFDFC0002,0xFFF80001,0xFFF80001, -0xFFF80000,0xFFF80002,0xFFF80002,0xFFF00000,0xFC000004,0xFDFC0002,0xFDFC0002,0xFDFC0002,0xFFF80001,0xFFF80001,0xFFF80000,0xFFF80002,0xFFF80002,0xFFF00000,0xFC000004,0xFFF80002,0xFFF80002,0xFFF00000,0xFC000004,0xFC000004,0xFBFC0002,0xF7FC0002,0x1FC0002,0xFDFC0002,0xFDFC0002,0xFFF80001,0xFFF80001,0xFFF80001,0xFDFC0002,0xFDFC0002,0xFFF80002,0xFFF00000, -0xFFF80002,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0x1F40012,0xFDF0000A,0xFDF0000A,0xFDF0000A,0xFDF0000A,0xFDF0000A, -0xFDF0000A,0xFFF00001,0xFFF00001,0xFFF00001,0xFDEC0001,0xF1FC0012,0xF1FC0012,0xF1FC0012,0xF1FC0012,0xF1FC0012,0xF1FC0012,0xFDEC0005,0xFDEC0005,0xFDEC0005,0xFDE80000,0xF9F80012,0xF9F80012,0xF9F80012,0xFDD40000,0xF8000014,0xF3F40012,0x1F40012,0x1F40012,0xFFF4000D,0xFDF4000D,0xFFF0000A,0xFFF0000A,0xFFF00005,0xFFF4000D,0xFDF4000D,0xFFF00002,0xFDEC0005, -0xF7FC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table32[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x80001,0x80001,0x80001,0x80001,0x20C0000,0x20C0000,0x20C0000,0x180000,0x180000,0x4000000,0x20C0000,0x20C0000,0x20C0000,0x180000,0x180000,0x4000000,0x180000,0x180000,0x4000000,0x4000000,0x20C0000,0x20C0000,0x20C0000,0x180000,0x180000,0x4000000,0x180000,0x180000,0x4000000,0x4000000,0x180000, -0x180000,0x4000000,0x4000000,0x4000000,0xC0000,0xC080000,0x80001,0xC0000,0x100000,0x140000,0x140000,0x200000,0xC0000,0x20C0000,0x140000,0x4000000,0x140000,0x200001,0x2300000,0x640000,0x10000000,0x2300000,0x640000,0x10000000,0x640000,0x10000000,0x10000000,0x2300000,0x640000,0x10000000,0x640000,0x10000000, -0x10000000,0x640000,0x10000000,0x10000000,0x10000000,0x2300000,0x640000,0x10000000,0x640000,0x10000000,0x10000000,0x640000,0x10000000,0x10000000,0x10000000,0x640000,0x10000000,0x10000000,0x10000000,0x10000000,0x2280000,0x240000,0x240000,0x380000,0x500000,0x9C0000,0x10000000,0x10000000,0x22C0000,0x400000,0x1F40000,0x10000000, 
-0x480000,0xC00C2,0x2E040011,0x18040011,0x10040011,0x20000048,0x18000009,0x10000001,0x10000048,0xE00001D,0xA000048,0x14000099,0x12000035,0xE00001D,0xC000060,0xC000030,0xA000058,0xA000099,0xA000059,0x8000074,0x6000099,0x1000C2,0x12000059,0xC000031,0xC000070,0xC000040,0x8000062,0x80000A7,0xA000069,0x800007D,0x600009D,0x1800C2, -0x8000089,0x8000098,0x40000B2,0x40000C2,0x42000029,0xA8000048,0xEA040011,0x1E000029,0x16000032,0xE000032,0xC000022,0xA00003D,0x2C000056,0x1A00003D,0xC00004F,0x800007D,0x1400C2,0x10009A,0x2E04000D,0x1804000D,0x1004000D,0x20000048,0x18000009,0x10000001,0x10000048,0xE00001D,0xA000048,0x140099,0x12000035,0xE00001D,0xC000060,0xC000030, -0xA000058,0x240099,0xA000059,0x8000074,0x6000099,0x140099,0x12000035,0xE00001D,0xC000060,0xC000030,0xA000058,0x240099,0xA000059,0x8000074,0x6000099,0x240099,0xA000059,0x8000074,0x6000099,0x6000099,0x42000029,0xA8000048,0xEA04000D,0x1E000029,0x16000032,0xE000032,0xC000022,0xA00003D,0x2C00004D,0x1A000039,0xC00004E,0x8000074, -0x1C0099,0x40011,0x40011,0x40011,0x40011,0xE000000,0xE000000,0xE000000,0x6000000,0x6000000,0x4000000,0x400000D,0x400000D,0x400000D,0x6000004,0x6000004,0x4000004,0x200000D,0x200000D,0x2000008,0x200000D,0x40011,0x40011,0x40011,0x6000008,0x6000008,0x2000006,0x200000E,0x200000E,0x2000009,0x200000E,0x80011, -0x80011,0x200000C,0x2000011,0x12,0x20000004,0x48000000,0x40011,0x10000004,0x6000005,0x8000004,0x8000004,0x4000005,0xC000009,0xA000006,0x200000D,0x2000009,0x80011,0x4000D,0x4000D,0x4000D,0x4000D,0xE000000,0xE000000,0xE000000,0x6000000,0x6000000,0x4000000,0x4000D,0x4000D,0x4000D,0x6000004,0x6000004, -0x4000004,0x8000D,0x8000D,0x2000008,0x200000D,0x4000D,0x4000D,0x4000D,0x6000004,0x6000004,0x4000004,0x8000D,0x8000D,0x2000008,0x200000D,0x8000D,0x8000D,0x2000008,0x200000D,0x200000D,0x20000004,0x48000000,0x4000D,0x10000004,0x6000005,0x8000004,0x8000004,0x4000005,0x4040008,0xA000005,0x8000D,0x2000008, -0x8000D,0x14004A,0x260C0001,0x16080001,0x10080001,0x200048,0x18000009,0x10000001,0x3C0048,0xE00001D,0xA000048,0x200048,0x18000009,0x10000001,0x3C0048,0xE00001D,0xA000048,0x3C0048,0xE00001D,0xA000048,0xA000048,0x200048,0x18000009,0x10000001,0x3C0048,0xE00001D,0xA000048,0x3C0048,0xE00001D,0xA000048,0xA000048,0x3C0048, -0xE00001D,0xA000048,0xA000048,0xA000048,0x42000019,0x180048,0xAE0C0001,0x1E000019,0x16000019,0x10000019,0xE000014,0xE000028,0x32000022,0x1E00001D,0x10000011,0xA000048,0x2C0048,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table33[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x80000,0x80000,0x80000,0x80000,0x80000, -0x80000,0xC0000,0xC0000,0xC0000,0x2000000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0xC0000,0xC0000,0xC0000,0x2000000,0xC0000,0xC0000,0xC0000,0x2000000,0x2000000,0xA040000,0x40001,0x40001,0x6040000,0x80000,0x80000,0x80000,0x80000,0x6040000,0x80000,0xC0000,0xC0000, -0xC0000,0x180001,0x180001,0x180001,0x180001,0x2240000,0x2240000,0x2240000,0x4C0000,0x4C0000,0xC000000,0x2240000,0x2240000,0x2240000,0x4C0000,0x4C0000,0xC000000,0x4C0000,0x4C0000,0xC000000,0xC000000,0x2240000,0x2240000,0x2240000,0x4C0000,0x4C0000,0xC000000,0x4C0000,0x4C0000,0xC000000,0xC000000,0x4C0000, 
-0x4C0000,0xC000000,0xC000000,0xC000000,0x41C0000,0x1C0000,0x180001,0x240000,0x2C0000,0x340000,0x3C0000,0x5C0000,0x200000,0x2240000,0x340000,0xC000000,0x340000,0x300001,0x480000,0x940000,0x18000000,0x480000,0x940000,0x18000000,0x940000,0x18000000,0x18000000,0x480000,0x940000,0x18000000,0x940000,0x18000000, -0x18000000,0x940000,0x18000000,0x18000000,0x18000000,0x480000,0x940000,0x18000000,0x940000,0x18000000,0x18000000,0x940000,0x18000000,0x18000000,0x18000000,0x940000,0x18000000,0x18000000,0x18000000,0x18000000,0x23C0000,0x2340000,0x2340000,0x540000,0x780000,0xEC0000,0x18000000,0x18000000,0x440000,0x5C0000,0xBC80000,0x18000000, -0x680000,0x14017F,0x3E0C005E,0x220C005E,0x180C005E,0x3404004D,0x22040006,0x1804000E,0x1A04004D,0x16000011,0x1204004D,0x2A0000F3,0x22000045,0x18000032,0x18000069,0x14000021,0x12000051,0x140000F3,0x10000084,0x10000090,0xC0000F4,0x1C017F,0x1E0000AE,0x18000072,0x180000A9,0x12000051,0x12000075,0x12000118,0x100000A8,0xE0000B2,0xC000104,0x38017F, -0x100000FD,0xE0000FD,0xA000139,0xA000181,0x6400001A,0xFA04004F,0xFE0C0067,0x3400001A,0x22000021,0x1A00001A,0x1600000E,0x14000038,0x42000066,0x2C000041,0x1600005C,0xE0000B2,0x28017F,0x1C00F3,0x3A100032,0x20100032,0x18100032,0x30080049,0x22040002,0x18080005,0x1A040049,0x1604000E,0x12040049,0x2800F3,0x22000045,0x18000032,0x18000069,0x14000021, -0x12000051,0x5000F3,0x10000084,0x10000090,0xC0000F4,0x2800F3,0x22000045,0x18000032,0x18000069,0x14000021,0x12000051,0x5000F3,0x10000084,0x10000090,0xC0000F4,0x5000F3,0x10000084,0x10000090,0xC0000F4,0xC0000F4,0x6400001A,0xEC080049,0xF0100036,0x3400001A,0x22000021,0x1A00001A,0x1600000E,0x14000038,0x50000052,0x2C000038,0x1600005B,0x10000090, -0x3800F3,0xC005E,0xC005E,0xC005E,0xC005E,0x22040005,0x22040005,0x22040005,0x12040005,0x12040005,0xC040005,0x16000032,0x16000032,0x16000032,0x12000009,0x12000009,0xC000001,0xA000034,0xA000034,0xA000014,0x6000034,0x20C005D,0x20C005D,0x20C005D,0xC000021,0xC000021,0xC000011,0x8000043,0x8000043,0x8000022,0x6000038,0x18005D, -0x18005D,0x800003D,0x400004D,0x400005D,0x52000005,0xAA040005,0xC005E,0x2C00000A,0x1A000008,0x1600000A,0x12000008,0xC00000D,0x3400001D,0x1C000016,0xE000033,0x8000022,0x14005D,0x100032,0x100032,0x100032,0x100032,0x1E080001,0x1E080001,0x1E080001,0x10080001,0x10080001,0xC040001,0x180032,0x180032,0x180032,0x12000009,0x12000009, -0xC000001,0x2C0032,0x2C0032,0xA000014,0x6000034,0x180032,0x180032,0x180032,0x12000009,0x12000009,0xC000001,0x2C0032,0x2C0032,0xA000014,0x6000034,0x2C0032,0x2C0032,0xA000014,0x6000034,0x6000034,0x52000005,0x8C080001,0x100032,0x2C00000A,0x1A000008,0x1600000A,0x12000008,0xC00000D,0x34000014,0x1C000012,0x200032,0xA000014, -0x200032,0x24004A,0x2E1C0001,0x1E180001,0x18180001,0x380048,0x22040001,0x180C0001,0x700048,0x16000008,0x12000048,0x380048,0x22040001,0x180C0001,0x700048,0x16000008,0x12000048,0x700048,0x16000008,0x12000048,0x12000048,0x380048,0x22040001,0x180C0001,0x700048,0x16000008,0x12000048,0x700048,0x16000008,0x12000048,0x12000048,0x700048, -0x16000008,0x12000048,0x12000048,0x12000048,0x7400000A,0x280048,0xB61C0001,0x38000008,0x2204000D,0x1C000008,0x18000004,0x14000014,0x50000012,0x2E00000D,0x1A000001,0x12000048,0x500048,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x8000000,0x8000000,0x8000000,0x8000000,0x8000000, 
-0x8000000,0x4000000,0x4000000,0x4000000,0x2000000,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x2000002,0x2000002,0x2000002,0x2000001,0x5,0x5,0x5,0x2000004,0x5,0x28000000,0x40005,0x40005,0x12000000,0xC000000,0xA000000,0xA000000,0x6000000,0x12000001,0xC000001,0x4000000,0x2000002, -0x5,}; -static const uint32_t g_etc1_to_bc7_m6_table34[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x200000,0x200000,0x200000,0x200000,0x200000, -0x200000,0x3C0000,0x3C0000,0x3C0000,0xA000000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x3C0000,0x3C0000,0x3C0000,0xA000000,0x3C0000,0x3C0000,0x3C0000,0xA000000,0xA000000,0x180000,0x140001,0x140001,0x2180000,0x1C0000,0x1C0000,0x1C0000,0x240000,0x2180000,0x1C0000,0x2C0000,0x3C0000, -0x2C0000,0x280001,0x280001,0x280001,0x280001,0x23C0000,0x23C0000,0x23C0000,0x7C0000,0x7C0000,0x14000000,0x23C0000,0x23C0000,0x23C0000,0x7C0000,0x7C0000,0x14000000,0x7C0000,0x7C0000,0x14000000,0x14000000,0x23C0000,0x23C0000,0x23C0000,0x7C0000,0x7C0000,0x14000000,0x7C0000,0x7C0000,0x14000000,0x14000000,0x7C0000, -0x7C0000,0x14000000,0x14000000,0x14000000,0x300000,0x2C0000,0x280001,0x380000,0x440000,0x580000,0x640000,0x980000,0x340000,0x23C0000,0x580000,0x14000000,0x580000,0x400001,0x600000,0xC40000,0x20000000,0x600000,0xC40000,0x20000000,0xC40000,0x20000000,0x20000000,0x600000,0xC40000,0x20000000,0xC40000,0x20000000, -0x20000000,0xC40000,0x20000000,0x20000000,0x20000000,0x600000,0xC40000,0x20000000,0xC40000,0x20000000,0x20000000,0xC40000,0x20000000,0x20000000,0x20000000,0xC40000,0x20000000,0x20000000,0x20000000,0x20000000,0x4500000,0xA440000,0xA440000,0x26C0000,0xA00000,0x13C0000,0x20000000,0x20000000,0x580000,0x7C0000,0x13D80000,0x20000000, -0x8C0000,0x200253,0x4E1400DE,0x2A1400DF,0x201400DE,0x440C0085,0x2C0C0042,0x220C005A,0x240C0085,0x1E0C0045,0x1A0C0085,0x420000F3,0x30000032,0x2008004A,0x2600004E,0x20000001,0x1A00004C,0x200000F3,0x1C000054,0x18000074,0x140000F4,0x300253,0x28000106,0x200000DD,0x220000D3,0x1E000069,0x18000099,0x1E000158,0x1A0000AF,0x180000B4,0x14000125,0x5C0253, -0x16000179,0x16000159,0x100001AD,0x10000255,0x9C000003,0xF01000B1,0xF4180106,0x4E000001,0x36000001,0x28000001,0x20000005,0x1E00000C,0x64000051,0x3E00001D,0x20000042,0x180000B4,0x400253,0x2C00F3,0x42200032,0x28200032,0x20200032,0x38180049,0x2A140002,0x20180005,0x22140049,0x1E14000E,0x1A140049,0x4000F3,0x30000032,0x20100032,0x2404004E,0x20000001, -0x1A040049,0x8000F3,0x1C000054,0x18000074,0x140000F4,0x4000F3,0x30000032,0x20100032,0x2404004E,0x20000001,0x1A040049,0x8000F3,0x1C000054,0x18000074,0x140000F4,0x8000F3,0x1C000054,0x18000074,0x140000F4,0x140000F4,0x9C000003,0xF4180049,0xF8200036,0x4E000001,0x36000001,0x28000001,0x20080001,0x1E00000C,0x6C000021,0x3E00000D,0x2000003E,0x18000074, -0x5C00F3,0x1400DE,0x1400DE,0x1400DE,0x1400DE,0x320C003D,0x320C003D,0x320C003D,0x1C0C003D,0x1C0C003D,0x140C003D,0x30000032,0x30000032,0x30000032,0x1E000001,0x1E000001,0x1604000C,0x16000034,0x16000034,0x12000008,0xE000034,0x2000DD,0x2000DD,0x2000DD,0x18000059,0x18000059,0x12000041,0x12000068,0x12000068,0x1200002C,0xE00004D,0x3C00DD, 
-0x3C00DD,0xE000089,0xA000095,0xA0000DD,0x98000002,0xEE0C003D,0x1400DE,0x4E000000,0x32000001,0x26000001,0x24000002,0x1A000001,0x56000023,0x36000012,0x1E000036,0x1200002C,0x2C00DD,0x200032,0x200032,0x200032,0x200032,0x26180001,0x26180001,0x26180001,0x18180001,0x18180001,0x14140001,0x300032,0x300032,0x300032,0x1C040001,0x1C040001, -0x140C0000,0x5C0032,0x5C0032,0x12000008,0xE000034,0x300032,0x300032,0x300032,0x1C040001,0x1C040001,0x140C0000,0x5C0032,0x5C0032,0x12000008,0xE000034,0x5C0032,0x5C0032,0x12000008,0xE000034,0xE000034,0x7A080000,0x94180001,0x200032,0x4E000000,0x2C080001,0x24040000,0x1E080001,0x1A000001,0x5C000008,0x3C000002,0x400032,0x12000008, -0x400032,0x34004A,0x362C0001,0x26280001,0x20280001,0x500048,0x2A140001,0x201C0001,0xA00048,0x20000001,0x1A000048,0x500048,0x2A140001,0x201C0001,0xA00048,0x20000001,0x1A000048,0xA00048,0x20000001,0x1A000048,0x1A000048,0x500048,0x2A140001,0x201C0001,0xA00048,0x20000001,0x1A000048,0xA00048,0x20000001,0x1A000048,0x1A000048,0xA00048, -0x20000001,0x1A000048,0x1A000048,0x1A000048,0x9C000002,0x4380048,0xBE2C0001,0x4A040001,0x36000001,0x28000001,0x20080000,0x1E000008,0x72000008,0x46000004,0x220C0000,0x1A000048,0x700048,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0x20000000,0x20000000,0x20000000,0x20000000,0x20000000, -0x20000000,0x10000000,0x10000000,0x10000000,0xA000000,0x10003D,0x10003D,0x10003D,0x10003D,0x10003D,0x10003D,0xC000014,0xC000014,0xC000014,0x800000D,0x18003D,0x18003D,0x18003D,0x8000028,0x400003D,0xA8000000,0xC003D,0xC003D,0x4A000000,0x34000000,0x28000000,0x28000000,0x1A000000,0x44000011,0x34000009,0x14000001,0xC000014, -0x14003D,}; -static const uint32_t g_etc1_to_bc7_m6_table35[] = { -0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x240000, -0x240000,0x240000,0x240000,0x6000000,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xE0C0000,0xE0C0000,0xE0C0000,0x140000,0x1C0000,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x380000,0x380000,0x380000,0x380000,0x380000, -0x380000,0x700000,0x700000,0x700000,0x12000000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x700000,0x700000,0x700000,0x12000000,0x700000,0x700000,0x700000,0x12000000,0x12000000,0x280000,0x240001,0x240001,0x2C0000,0x300000,0x340000,0x340000,0x400000,0x2C0000,0x300000,0x500000,0x700000, -0x500000,0x380001,0x380001,0x380001,0x380001,0x540000,0x540000,0x540000,0xAC0000,0xAC0000,0x1C000000,0x540000,0x540000,0x540000,0xAC0000,0xAC0000,0x1C000000,0xAC0000,0xAC0000,0x1C000000,0x1C000000,0x540000,0x540000,0x540000,0xAC0000,0xAC0000,0x1C000000,0xAC0000,0xAC0000,0x1C000000,0x1C000000,0xAC0000, -0xAC0000,0x1C000000,0x1C000000,0x1C000000,0x440000,0x63C0000,0x380001,0x24C0000,0x600000,0x780000,0x8C0000,0xD40000,0x480000,0x540000,0x780000,0x1C000000,0x780000,0x500001,0x780000,0xF40000,0x28000000,0x780000,0xF40000,0x28000000,0xF40000,0x28000000,0x28000000,0x780000,0xF40000,0x28000000,0xF40000,0x28000000, -0x28000000,0xF40000,0x28000000,0x28000000,0x28000000,0x780000,0xF40000,0x28000000,0xF40000,0x28000000,0x28000000,0xF40000,0x28000000,0x28000000,0x28000000,0xF40000,0x28000000,0x28000000,0x28000000,0x28000000,0x4640000,0x580000,0x580000,0x880000,0xC80000,0x1880000,0x28000000,0x28000000,0x700000,0x980000,0x1BE80000,0x28000000, 
-0xAC0000,0x300274,0x562400F3,0x322400F4,0x282400F3,0x4C1C0092,0x341C004F,0x2A1C0067,0x2C1C0092,0x2818004E,0x22180092,0x4A1000F4,0x38100033,0x2A140051,0x300C004A,0x28100002,0x2210004D,0x280C00F4,0x24080053,0x20080069,0x1C0C00F5,0x2440274,0x3A0000F5,0x281000F4,0x2E0000AA,0x28000049,0x22040090,0x2800011F,0x2400005D,0x20000069,0x1C0000FD,0x8C0274, -0x2000015B,0x1C000120,0x1A000181,0x16000278,0xA4100004,0xF82000C2,0xFC28011F,0x56100002,0x3E100002,0x30100002,0x28100006,0x260C0006,0x90000009,0x54040000,0x2A080035,0x20000069,0x640274,0x3C00F3,0x4A300032,0x30300032,0x28300032,0x40280049,0x32240002,0x28280005,0x2A240049,0x2624000E,0x22240049,0x5800F3,0x38100032,0x28200032,0x300C0049,0x28100001, -0x22140049,0xB000F3,0x2600003E,0x20000059,0x1C0000F4,0x5800F3,0x38100032,0x28200032,0x300C0049,0x28100001,0x22140049,0xB000F3,0x2600003E,0x20000059,0x1C0000F4,0xB000F3,0x2600003E,0x20000059,0x1C0000F4,0x1C0000F4,0xA4100003,0xFC280049,0xF030003B,0x56100001,0x3E100001,0x30100001,0x28180001,0x28080004,0x90000005,0x54040000,0x2A040033,0x20000059, -0x7C00F3,0x2400F3,0x2400F3,0x2400F3,0x2400F3,0x3A1C004A,0x3A1C004A,0x3A1C004A,0x2418004A,0x2418004A,0x1C18004A,0x38100033,0x38100033,0x38100033,0x26100002,0x26100002,0x1E10000E,0x200C0033,0x200C0033,0x1C0C0005,0x160C0035,0x3400F3,0x3400F3,0x3400F3,0x28000049,0x28000049,0x1C08004A,0x22000042,0x22000042,0x1A000009,0x16000035,0x6800F3, -0x6800F3,0x16000074,0x16000074,0x120000F4,0xA0100003,0xF61C004A,0x2400F3,0x56100001,0x3A100002,0x2E100002,0x2C100003,0x240C0001,0x84000005,0x54040000,0x260C0033,0x1A000009,0x4C00F3,0x300032,0x300032,0x300032,0x300032,0x2E280001,0x2E280001,0x2E280001,0x20280001,0x20280001,0x1C240001,0x2440032,0x2440032,0x2440032,0x24140001,0x24140001, -0x1C1C0000,0x8C0032,0x8C0032,0x1C000000,0x16000034,0x2440032,0x2440032,0x2440032,0x24140001,0x24140001,0x1C1C0000,0x8C0032,0x8C0032,0x1C000000,0x16000034,0x8C0032,0x8C0032,0x1C000000,0x16000034,0x16000034,0x82180000,0x9C280001,0x300032,0x56100000,0x34180001,0x2C140000,0x26180001,0x240C0000,0x84040001,0x4E080000,0x640032,0x1C000000, -0x640032,0x44004A,0x3E3C0001,0x2E380001,0x28380001,0x680048,0x32240001,0x282C0001,0xD00048,0x28080000,0x22000048,0x680048,0x32240001,0x282C0001,0xD00048,0x28080000,0x22000048,0xD00048,0x28080000,0x22000048,0x22000048,0x680048,0x32240001,0x282C0001,0xD00048,0x28080000,0x22000048,0xD00048,0x28080000,0x22000048,0x22000048,0xD00048, -0x28080000,0x22000048,0x22000048,0x22000048,0xB8080000,0xC480048,0xC63C0001,0x5A0C0000,0x3E0C0001,0x32080000,0x28180000,0x28000001,0x94000002,0x54040000,0x2A1C0000,0x22000048,0x940048,0x18004A,0x18004A,0x18004A,0x18004A,0x18004A,0x18004A,0x18004A,0x18004A,0x18004A,0x18004A,0x28100001,0x28100001,0x28100001,0x28100001,0x28100001, -0x28100001,0x180C0001,0x180C0001,0x180C0001,0x120C0001,0x2240048,0x2240048,0x2240048,0x2240048,0x2240048,0x2240048,0x18000005,0x18000005,0x18000005,0x12000001,0x4C0048,0x4C0048,0x4C0048,0x10000014,0xC000048,0xB0100001,0x18004A,0x18004A,0x52100001,0x3C100001,0x30100001,0x30100001,0x22100001,0x84000001,0x54040000,0x1C0C0001,0x18000005, -0x340048,}; -static const uint32_t g_etc1_to_bc7_m6_table36[] = { -0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x5C0000, 
-0x5C0000,0x5C0000,0x5C0000,0xE000001,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x8200000,0x8200000,0x8200000,0x300000,0x400000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000, -0x2500000,0xA40000,0xA40000,0xA40000,0x1A000001,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0xA40000,0xA40000,0xA40000,0x1A000001,0xA40000,0xA40000,0xA40000,0x1A000001,0x1A000001,0x3C0000,0x380000,0x380000,0x400000,0x2440000,0x4C0000,0x4C0000,0x5C0000,0x400000,0x2440000,0x740000,0xA40000, -0x740000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x700000,0x700000,0x700000,0xE40000,0xE40000,0x24000001,0x700000,0x700000,0x700000,0xE40000,0xE40000,0x24000001,0xE40000,0xE40000,0x24000001,0x24000001,0x700000,0x700000,0x700000,0xE40000,0xE40000,0x24000001,0xE40000,0xE40000,0x24000001,0x24000001,0xE40000, -0xE40000,0x24000001,0x24000001,0x24000001,0x580000,0x500000,0x4C0000,0x680000,0x800000,0xA00000,0xB80000,0x1180000,0x600000,0x700000,0xA00000,0x24000001,0xA00000,0x640000,0x940000,0x12C0000,0x30000001,0x940000,0x12C0000,0x30000001,0x12C0000,0x30000001,0x30000001,0x940000,0x12C0000,0x30000001,0x12C0000,0x30000001, -0x30000001,0x12C0000,0x30000001,0x30000001,0x30000001,0x940000,0x12C0000,0x30000001,0x12C0000,0x30000001,0x30000001,0x12C0000,0x30000001,0x30000001,0x30000001,0x12C0000,0x30000001,0x30000001,0x30000001,0x30000001,0x7C0000,0xC680000,0xC680000,0xA80000,0xF40000,0x1E00000,0x30000001,0x30000001,0x880000,0xBC0000,0x25DC0000,0x30000001, -0xD40000,0x400278,0x5E3800F4,0x3C3800F4,0x303800F5,0x582C0090,0x402C004D,0x34300069,0x362C0090,0x302C004D,0x2A2C0092,0x542000F3,0x42200032,0x34280053,0x38200049,0x30200002,0x2C20004E,0x302000F4,0x2E1C0051,0x2A1C0067,0x262000F3,0x600274,0x460C00F3,0x302000F4,0x3E040092,0x3210004A,0x2A180092,0x360400FD,0x30000033,0x2A00004F,0x260800F4,0xC40274, -0x2C000121,0x260000E2,0x2400014C,0x20000274,0xB4200005,0xF23400D0,0xF63C012C,0x62200000,0x48200001,0x38200002,0x32240006,0x30200006,0xAC080002,0x5A180002,0x321C0035,0x2A00004F,0x8C0274,0x4C00F4,0x52440034,0x3A400034,0x30400035,0x4C380048,0x3A380001,0x32380005,0x3238004A,0x3034000E,0x2A38004A,0x7400F3,0x40240032,0x32300033,0x38200049,0x30200002, -0x2A28004A,0xE800F3,0x30000033,0x2A00004B,0x260000F3,0x7400F3,0x40240032,0x32300033,0x38200049,0x30200002,0x2A28004A,0xE800F3,0x30000033,0x2A00004B,0x260000F3,0xE800F3,0x30000033,0x2A00004B,0x260000F3,0x260000F3,0xA8240003,0xF63C004C,0xFA440038,0x62200000,0x48200001,0x38200002,0x32280001,0x30180003,0xAC080001,0x5A180001,0x34140032,0x2A00004B, -0xA400F3,0x3800F4,0x3800F4,0x3800F4,0x3800F4,0x462C0048,0x462C0048,0x462C0048,0x2C2C0049,0x2C2C0049,0x242C0049,0x42200032,0x42200032,0x42200032,0x30200001,0x30200001,0x2624000E,0x28200032,0x28200032,0x24200005,0x20200032,0x5000F3,0x5000F3,0x5000F3,0x32100049,0x32100049,0x241C0049,0x30000032,0x30000032,0x240C0002,0x20100032,0xA000F3, -0xA000F3,0x22000059,0x1E000053,0x1A0000F3,0xA8200004,0xFE2C0049,0x3800F4,0x62200000,0x44200001,0x38200001,0x34200004,0x2C200001,0xA20C0000,0x5A180001,0x301C0032,0x240C0002,0x7000F3,0x400034,0x400034,0x400034,0x400034,0x38380000,0x38380000,0x38380000,0x2A380000,0x2A380000,0x24380001,0x600032,0x600032,0x600032,0x2C280001,0x2C280001, 
-0x24300001,0xC40032,0xC40032,0x24140001,0x20000032,0x600032,0x600032,0x600032,0x2C280001,0x2C280001,0x24300001,0xC40032,0xC40032,0x24140001,0x20000032,0xC40032,0xC40032,0x24140001,0x20000032,0x20000032,0x90280000,0xB4380000,0x400034,0x62200000,0x42240000,0x36240000,0x32280000,0x2C200001,0xA20C0000,0x5A180000,0x8C0032,0x24140001, -0x8C0032,0x580048,0x4A4C0000,0x364C0001,0x304C0001,0x2800048,0x3A380001,0x30400001,0x1080048,0x301C0001,0x2A00004A,0x2800048,0x3A380001,0x30400001,0x1080048,0x301C0001,0x2A00004A,0x1080048,0x301C0001,0x2A00004A,0x2A00004A,0x2800048,0x3A380001,0x30400001,0x1080048,0x301C0001,0x2A00004A,0x1080048,0x301C0001,0x2A00004A,0x2A00004A,0x1080048, -0x301C0001,0x2A00004A,0x2A00004A,0x2A00004A,0xC4180000,0x65C0048,0xDE4C0000,0x62200000,0x46200001,0x3A1C0000,0x302C0001,0x300C0001,0xB4040000,0x5E140000,0x32300001,0x2A00004A,0xB80048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x2C0048,0x34200000,0x34200000,0x34200000,0x34200000,0x34200000, -0x34200000,0x20200001,0x20200001,0x20200001,0x1A200001,0x400048,0x400048,0x400048,0x400048,0x400048,0x400048,0x240C0001,0x240C0001,0x240C0001,0x1A140001,0x800048,0x800048,0x800048,0x1A000005,0x1400004A,0xC8200000,0x2C0048,0x2C0048,0x62200000,0x48200000,0x3C200000,0x3C200000,0x2C200000,0x9E0C0000,0x62140000,0x281C0000,0x240C0001, -0x5C0048,}; -static const uint32_t g_etc1_to_bc7_m6_table37[] = { -0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x8C0000, -0x8C0000,0x8C0000,0x8C0000,0x16000001,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x340000,0x340000,0x340000,0x2440000,0x640000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x2680000,0x2680000,0x2680000,0x2680000,0x2680000, -0x2680000,0xD80000,0xD80000,0xD80000,0x22000001,0x2680000,0x2680000,0x2680000,0x2680000,0x2680000,0x2680000,0xD80000,0xD80000,0xD80000,0x22000001,0xD80000,0xD80000,0xD80000,0x22000001,0x22000001,0x4C0000,0x480000,0x480000,0x540000,0x2580000,0x600000,0x600000,0x780000,0x540000,0x2580000,0x980000,0xD80000, -0x980000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x880000,0x880000,0x880000,0x1140000,0x1140000,0x2C000001,0x880000,0x880000,0x880000,0x1140000,0x1140000,0x2C000001,0x1140000,0x1140000,0x2C000001,0x2C000001,0x880000,0x880000,0x880000,0x1140000,0x1140000,0x2C000001,0x1140000,0x1140000,0x2C000001,0x2C000001,0x1140000, -0x1140000,0x2C000001,0x2C000001,0x2C000001,0x6680000,0x8600000,0x5C0000,0x7C0000,0x980000,0xC00000,0xE00000,0x1540000,0x740000,0x880000,0xC00000,0x2C000001,0xC00000,0x740000,0xAC0000,0x15C0000,0x38000001,0xAC0000,0x15C0000,0x38000001,0x15C0000,0x38000001,0x38000001,0xAC0000,0x15C0000,0x38000001,0x15C0000,0x38000001, -0x38000001,0x15C0000,0x38000001,0x38000001,0x38000001,0xAC0000,0x15C0000,0x38000001,0x15C0000,0x38000001,0x38000001,0x15C0000,0x38000001,0x38000001,0x38000001,0x15C0000,0x38000001,0x38000001,0x38000001,0x38000001,0x900000,0x7C0000,0x7C0000,0x2C00000,0x11C0000,0x9F00000,0x38000001,0x38000001,0x9C0000,0xD80000,0x2DEC0000,0x38000001, 
-0xF40000,0x500278,0x664800F4,0x444800F4,0x384800F5,0x603C0090,0x483C004D,0x3C400069,0x3E3C0090,0x383C004D,0x323C0092,0x5C3000F3,0x4A300032,0x3C380053,0x40300049,0x38300002,0x3430004E,0x383000F4,0x362C0051,0x322C0067,0x2E3000F3,0x780274,0x4E1C00F3,0x383000F4,0x46140092,0x3A20004A,0x32280092,0x440400F3,0x38100033,0x3210004F,0x2E1800F4,0xF40274, -0x36000104,0x320000B2,0x2C00011F,0x28000274,0xBC300005,0xFA4400D0,0xFE4C012C,0x6A300000,0x50300001,0x40300002,0x3A340006,0x38300006,0xB4180002,0x62280002,0x3A2C0035,0x3210004F,0xAC0274,0x5C00F4,0x5A540034,0x42500034,0x38500035,0x54480048,0x42480001,0x3A480005,0x3A48004A,0x3844000E,0x3248004A,0x8C00F3,0x48340032,0x3A400033,0x40300049,0x38300002, -0x3238004A,0x11800F3,0x38100033,0x3208004A,0x2E0000F3,0x8C00F3,0x48340032,0x3A400033,0x40300049,0x38300002,0x3238004A,0x11800F3,0x38100033,0x3208004A,0x2E0000F3,0x11800F3,0x38100033,0x3208004A,0x2E0000F3,0x2E0000F3,0xB0340003,0xFE4C004C,0xF254003D,0x6A300000,0x50300001,0x40300002,0x3A380001,0x38280003,0xB4180001,0x62280001,0x3C240032,0x3208004A, -0xC800F3,0x4800F4,0x4800F4,0x4800F4,0x4800F4,0x4E3C0048,0x4E3C0048,0x4E3C0048,0x343C0049,0x343C0049,0x2C3C0049,0x4A300032,0x4A300032,0x4A300032,0x38300001,0x38300001,0x2E34000E,0x30300032,0x30300032,0x2C300005,0x28300032,0x6800F3,0x6800F3,0x6800F3,0x3A200049,0x3A200049,0x2C2C0049,0x38100032,0x38100032,0x2C1C0002,0x28200032,0xD000F3, -0xD000F3,0x2C000049,0x2600003E,0x220000F3,0xB0300004,0xF63C004C,0x4800F4,0x6A300000,0x4C300001,0x40300001,0x3C300004,0x34300001,0xAA1C0000,0x62280001,0x382C0032,0x2C1C0002,0x9400F3,0x500034,0x500034,0x500034,0x500034,0x40480000,0x40480000,0x40480000,0x32480000,0x32480000,0x2C480001,0x780032,0x780032,0x780032,0x34380001,0x34380001, -0x2C400001,0xF40032,0xF40032,0x2C240001,0x28000032,0x780032,0x780032,0x780032,0x34380001,0x34380001,0x2C400001,0xF40032,0xF40032,0x2C240001,0x28000032,0xF40032,0xF40032,0x2C240001,0x28000032,0x28000032,0x98380000,0xBC480000,0x500034,0x6A300000,0x4A340000,0x3E340000,0x3A380000,0x34300001,0xAA1C0000,0x62280000,0xAC0032,0x2C240001, -0xAC0032,0x680048,0x525C0000,0x3E5C0001,0x385C0001,0x2980048,0x42480001,0x38500001,0x1380048,0x382C0001,0x3200004A,0x2980048,0x42480001,0x38500001,0x1380048,0x382C0001,0x3200004A,0x1380048,0x382C0001,0x3200004A,0x3200004A,0x2980048,0x42480001,0x38500001,0x1380048,0x382C0001,0x3200004A,0x1380048,0x382C0001,0x3200004A,0x3200004A,0x1380048, -0x382C0001,0x3200004A,0x3200004A,0x3200004A,0xCC280000,0xE6C0048,0xE65C0000,0x6A300000,0x4E300001,0x422C0000,0x383C0001,0x381C0001,0xBC140000,0x66240000,0x3A400001,0x3200004A,0xDC0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C0048,0x3C300000,0x3C300000,0x3C300000,0x3C300000,0x3C300000, -0x3C300000,0x28300001,0x28300001,0x28300001,0x22300001,0x580048,0x580048,0x580048,0x580048,0x580048,0x580048,0x2C1C0001,0x2C1C0001,0x2C1C0001,0x22240001,0xB00048,0xB00048,0xB00048,0x22000001,0x1C00004A,0xD0300000,0x3C0048,0x3C0048,0x6A300000,0x50300000,0x44300000,0x44300000,0x34300000,0xA61C0000,0x6A240000,0x302C0000,0x2C1C0001, -0x7C0048,}; -static const uint32_t g_etc1_to_bc7_m6_table38[] = { -0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0xBC0000, 
-0xBC0000,0xBC0000,0xBC0000,0x1E000001,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x440000,0x440000,0x440000,0x25C0000,0x880000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x2800000,0x2800000,0x2800000,0x2800000,0x2800000, -0x2800000,0x1080000,0x1080000,0x1080000,0x2A000001,0x2800000,0x2800000,0x2800000,0x2800000,0x2800000,0x2800000,0x1080000,0x1080000,0x1080000,0x2A000001,0x1080000,0x1080000,0x1080000,0x2A000001,0x2A000001,0x65C0000,0x580000,0x580000,0x4640000,0x26C0000,0x780000,0x780000,0x940000,0x4640000,0x26C0000,0xB80000,0x1080000, -0xB80000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0xA00000,0xA00000,0xA00000,0x1440000,0x1440000,0x34000001,0xA00000,0xA00000,0xA00000,0x1440000,0x1440000,0x34000001,0x1440000,0x1440000,0x34000001,0x34000001,0xA00000,0xA00000,0xA00000,0x1440000,0x1440000,0x34000001,0x1440000,0x1440000,0x34000001,0x34000001,0x1440000, -0x1440000,0x34000001,0x34000001,0x34000001,0x27C0000,0x740000,0x6C0000,0x2900000,0xB40000,0xE40000,0x1080000,0x1900000,0x880000,0xA00000,0xE40000,0x34000001,0xE40000,0x840000,0xC40000,0x18C0000,0x40000001,0xC40000,0x18C0000,0x40000001,0x18C0000,0x40000001,0x40000001,0xC40000,0x18C0000,0x40000001,0x18C0000,0x40000001, -0x40000001,0x18C0000,0x40000001,0x40000001,0x40000001,0xC40000,0x18C0000,0x40000001,0x18C0000,0x40000001,0x40000001,0x18C0000,0x40000001,0x40000001,0x40000001,0x18C0000,0x40000001,0x40000001,0x40000001,0x40000001,0xA40000,0x8C0000,0x8C0000,0xDC0000,0x1400000,0x13F00000,0x40000001,0x40000001,0xB40000,0xF80000,0x35FC0000,0x40000001, -0x1180000,0x600278,0x6E5800F4,0x4C5800F4,0x405800F5,0x684C0090,0x504C004D,0x44500069,0x464C0090,0x404C004D,0x3A4C0092,0x644000F3,0x52400032,0x44480053,0x48400049,0x40400002,0x3C40004E,0x404000F4,0x3E3C0051,0x3A3C0067,0x364000F3,0x900274,0x562C00F3,0x404000F4,0x4E240092,0x4230004A,0x3A380092,0x4C1400F3,0x40200033,0x3A20004F,0x362800F4,0x1240274, -0x400000F5,0x3A00009A,0x3400010B,0x30000274,0xC4400005,0xF25400E2,0xF65C0139,0x72400000,0x58400001,0x48400002,0x42440006,0x40400006,0xBC280002,0x6A380002,0x423C0035,0x3A20004F,0xD00274,0x6C00F4,0x62640034,0x4A600034,0x40600035,0x5C580048,0x4A580001,0x42580005,0x4258004A,0x4054000E,0x3A58004A,0xA400F3,0x50440032,0x42500033,0x48400049,0x40400002, -0x3A48004A,0x14C00F3,0x40200033,0x3A18004A,0x360000F3,0xA400F3,0x50440032,0x42500033,0x48400049,0x40400002,0x3A48004A,0x14C00F3,0x40200033,0x3A18004A,0x360000F3,0x14C00F3,0x40200033,0x3A18004A,0x360000F3,0x360000F3,0xB8440003,0xF65C004E,0xFA64003D,0x72400000,0x58400001,0x48400002,0x42480001,0x40380003,0xBC280001,0x6A380001,0x44340032,0x3A18004A, -0xE800F3,0x5800F4,0x5800F4,0x5800F4,0x5800F4,0x564C0048,0x564C0048,0x564C0048,0x3C4C0049,0x3C4C0049,0x344C0049,0x52400032,0x52400032,0x52400032,0x40400001,0x40400001,0x3644000E,0x38400032,0x38400032,0x34400005,0x30400032,0x8000F3,0x8000F3,0x8000F3,0x42300049,0x42300049,0x343C0049,0x40200032,0x40200032,0x342C0002,0x30300032,0x10000F3, -0x10000F3,0x340C0049,0x30000033,0x2A0000F3,0xB8400004,0xFE4C004C,0x5800F4,0x72400000,0x54400001,0x48400001,0x44400004,0x3C400001,0xB22C0000,0x6A380001,0x403C0032,0x342C0002,0xB400F3,0x600034,0x600034,0x600034,0x600034,0x48580000,0x48580000,0x48580000,0x3A580000,0x3A580000,0x34580001,0x900032,0x900032,0x900032,0x3C480001,0x3C480001, 
-0x34500001,0x1240032,0x1240032,0x34340001,0x30000032,0x900032,0x900032,0x900032,0x3C480001,0x3C480001,0x34500001,0x1240032,0x1240032,0x34340001,0x30000032,0x1240032,0x1240032,0x34340001,0x30000032,0x30000032,0xA0480000,0xC4580000,0x600034,0x72400000,0x52440000,0x46440000,0x42480000,0x3C400001,0xB22C0000,0x6A380000,0xD00032,0x34340001, -0xD00032,0x780048,0x5A6C0000,0x466C0001,0x406C0001,0x2B00048,0x4A580001,0x40600001,0x1680048,0x403C0001,0x3A00004A,0x2B00048,0x4A580001,0x40600001,0x1680048,0x403C0001,0x3A00004A,0x1680048,0x403C0001,0x3A00004A,0x3A00004A,0x2B00048,0x4A580001,0x40600001,0x1680048,0x403C0001,0x3A00004A,0x1680048,0x403C0001,0x3A00004A,0x3A00004A,0x1680048, -0x403C0001,0x3A00004A,0x3A00004A,0x3A00004A,0xD4380000,0x800048,0xEE6C0000,0x72400000,0x56400001,0x4A3C0000,0x404C0001,0x402C0001,0xC4240000,0x6E340000,0x42500001,0x3A00004A,0xFC0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x4C0048,0x44400000,0x44400000,0x44400000,0x44400000,0x44400000, -0x44400000,0x30400001,0x30400001,0x30400001,0x2A400001,0x700048,0x700048,0x700048,0x700048,0x700048,0x700048,0x342C0001,0x342C0001,0x342C0001,0x2A340001,0xE40048,0xE40048,0xE40048,0x2A100001,0x2400004A,0xD8400000,0x4C0048,0x4C0048,0x72400000,0x58400000,0x4C400000,0x4C400000,0x3C400000,0xAE2C0000,0x72340000,0x383C0000,0x342C0001, -0xA00048,}; -static const uint32_t g_etc1_to_bc7_m6_table39[] = { -0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0xF00000, -0xF00000,0xF00000,0xF00000,0x26000001,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x2540000,0x2540000,0x2540000,0x2740000,0xA80000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x2980000,0x2980000,0x2980000,0x2980000,0x2980000, -0x2980000,0x1380000,0x1380000,0x1380000,0x32000001,0x2980000,0x2980000,0x2980000,0x2980000,0x2980000,0x2980000,0x1380000,0x1380000,0x1380000,0x32000001,0x1380000,0x1380000,0x1380000,0x32000001,0x32000001,0xE6C0000,0x680000,0x680000,0x780000,0x2800000,0x8C0000,0x8C0000,0x2AC0000,0x780000,0x2800000,0xDC0000,0x1380000, -0xDC0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0xB80000,0xB80000,0xB80000,0x1740000,0x1740000,0x3C000001,0xB80000,0xB80000,0xB80000,0x1740000,0x1740000,0x3C000001,0x1740000,0x1740000,0x3C000001,0x3C000001,0xB80000,0xB80000,0xB80000,0x1740000,0x1740000,0x3C000001,0x1740000,0x1740000,0x3C000001,0x3C000001,0x1740000, -0x1740000,0x3C000001,0x3C000001,0x3C000001,0x900000,0x840000,0x7C0000,0xA80000,0xD00000,0x1080000,0x12C0000,0x1CC0000,0x9C0000,0xB80000,0x1080000,0x3C000001,0x1080000,0x940000,0xDC0000,0x1BC0000,0x48000001,0xDC0000,0x1BC0000,0x48000001,0x1BC0000,0x48000001,0x48000001,0xDC0000,0x1BC0000,0x48000001,0x1BC0000,0x48000001, -0x48000001,0x1BC0000,0x48000001,0x48000001,0x48000001,0xDC0000,0x1BC0000,0x48000001,0x1BC0000,0x48000001,0x48000001,0x1BC0000,0x48000001,0x48000001,0x48000001,0x1BC0000,0x48000001,0x48000001,0x48000001,0x48000001,0xB80000,0x69C0000,0x69C0000,0xF80000,0x1680000,0x1DF40000,0x48000001,0x48000001,0xC80000,0x1140000,0x3FD00000,0x48000001, 
-0x1380000,0x700278,0x766800F4,0x546800F4,0x486800F5,0x705C0090,0x585C004D,0x4C600069,0x4E5C0090,0x485C004D,0x425C0092,0x6C5000F3,0x5A500032,0x4C580053,0x50500049,0x48500002,0x4450004E,0x485000F4,0x464C0051,0x424C0067,0x3E5000F3,0xA80274,0x5E3C00F3,0x485000F4,0x56340092,0x4A40004A,0x42480092,0x542400F3,0x48300033,0x4230004F,0x3E3800F4,0x1580274, -0x480C00F4,0x42040092,0x3E0000F7,0x38000274,0xCC500005,0xFA6400E2,0xFE6C0139,0x7A500000,0x60500001,0x50500002,0x4A540006,0x48500006,0xC4380002,0x72480002,0x4A4C0035,0x4230004F,0xF00274,0x7C00F4,0x6A740034,0x52700034,0x48700035,0x64680048,0x52680001,0x4A680005,0x4A68004A,0x4864000E,0x4268004A,0xBC00F3,0x58540032,0x4A600033,0x50500049,0x48500002, -0x4258004A,0x17C00F3,0x48300033,0x4228004A,0x3E0000F3,0xBC00F3,0x58540032,0x4A600033,0x50500049,0x48500002,0x4258004A,0x17C00F3,0x48300033,0x4228004A,0x3E0000F3,0x17C00F3,0x48300033,0x4228004A,0x3E0000F3,0x3E0000F3,0xC0540003,0xFE6C004E,0xF2740044,0x7A500000,0x60500001,0x50500002,0x4A580001,0x48480003,0xC4380001,0x72480001,0x4C440032,0x4228004A, -0x10C00F3,0x6800F4,0x6800F4,0x6800F4,0x6800F4,0x5E5C0048,0x5E5C0048,0x5E5C0048,0x445C0049,0x445C0049,0x3C5C0049,0x5A500032,0x5A500032,0x5A500032,0x48500001,0x48500001,0x3E54000E,0x40500032,0x40500032,0x3C500005,0x38500032,0x9800F3,0x9800F3,0x9800F3,0x4A400049,0x4A400049,0x3C4C0049,0x48300032,0x48300032,0x3C3C0002,0x38400032,0x13000F3, -0x13000F3,0x3C1C0049,0x38080032,0x320000F3,0xC0500004,0xF65C0051,0x6800F4,0x7A500000,0x5C500001,0x50500001,0x4C500004,0x44500001,0xBA3C0000,0x72480001,0x484C0032,0x3C3C0002,0xD800F3,0x700034,0x700034,0x700034,0x700034,0x50680000,0x50680000,0x50680000,0x42680000,0x42680000,0x3C680001,0xA80032,0xA80032,0xA80032,0x44580001,0x44580001, -0x3C600001,0x1580032,0x1580032,0x3C440001,0x38000032,0xA80032,0xA80032,0xA80032,0x44580001,0x44580001,0x3C600001,0x1580032,0x1580032,0x3C440001,0x38000032,0x1580032,0x1580032,0x3C440001,0x38000032,0x38000032,0xA8580000,0xCC680000,0x700034,0x7A500000,0x5A540000,0x4E540000,0x4A580000,0x44500001,0xBA3C0000,0x72480000,0xF00032,0x3C440001, -0xF00032,0x880048,0x627C0000,0x4E7C0001,0x487C0001,0xC80048,0x52680001,0x48700001,0x1980048,0x484C0001,0x4200004A,0xC80048,0x52680001,0x48700001,0x1980048,0x484C0001,0x4200004A,0x1980048,0x484C0001,0x4200004A,0x4200004A,0xC80048,0x52680001,0x48700001,0x1980048,0x484C0001,0x4200004A,0x1980048,0x484C0001,0x4200004A,0x4200004A,0x1980048, -0x484C0001,0x4200004A,0x4200004A,0x4200004A,0xDC480000,0x900048,0xF67C0000,0x7A500000,0x5E500001,0x524C0000,0x485C0001,0x483C0001,0xCC340000,0x76440000,0x4A600001,0x4200004A,0x1200048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x5C0048,0x4C500000,0x4C500000,0x4C500000,0x4C500000,0x4C500000, -0x4C500000,0x38500001,0x38500001,0x38500001,0x32500001,0x880048,0x880048,0x880048,0x880048,0x880048,0x880048,0x3C3C0001,0x3C3C0001,0x3C3C0001,0x32440001,0x1140048,0x1140048,0x1140048,0x32200001,0x2C00004A,0xE0500000,0x5C0048,0x5C0048,0x7A500000,0x60500000,0x54500000,0x54500000,0x44500000,0xB63C0000,0x7A440000,0x404C0000,0x3C3C0001, -0xC00048,}; -static const uint32_t g_etc1_to_bc7_m6_table40[] = { -0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x1240000, 
-0x1240000,0x1240000,0x1240000,0x30000000,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x680000,0x680000,0x680000,0x900000,0xD00000,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000, -0xB40000,0x1700000,0x1700000,0x1700000,0x3C000000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x1700000,0x1700000,0x1700000,0x3C000000,0x1700000,0x1700000,0x1700000,0x3C000000,0x3C000000,0x8800000,0x780001,0x780001,0x28C0000,0x980000,0x2A40000,0x2A40000,0xCC0000,0x28C0000,0x980000,0x1000000,0x1700000, -0x1000000,0x8C0001,0x8C0001,0x8C0001,0x8C0001,0x2D00000,0x2D00000,0x2D00000,0x1AC0000,0x1AC0000,0x46000000,0x2D00000,0x2D00000,0x2D00000,0x1AC0000,0x1AC0000,0x46000000,0x1AC0000,0x1AC0000,0x46000000,0x46000000,0x2D00000,0x2D00000,0x2D00000,0x1AC0000,0x1AC0000,0x46000000,0x1AC0000,0x1AC0000,0x46000000,0x46000000,0x1AC0000, -0x1AC0000,0x46000000,0x46000000,0x46000000,0xA40000,0x980000,0x8C0001,0xC00000,0x2EC0000,0x12C0000,0x15C0000,0x5F80000,0x2B00000,0x2D00000,0x12C0000,0x46000000,0x12C0000,0xA40001,0x2F40000,0x1F40000,0x52000000,0x2F40000,0x1F40000,0x52000000,0x1F40000,0x52000000,0x52000000,0x2F40000,0x1F40000,0x52000000,0x1F40000,0x52000000, -0x52000000,0x1F40000,0x52000000,0x52000000,0x52000000,0x2F40000,0x1F40000,0x52000000,0x1F40000,0x52000000,0x52000000,0x1F40000,0x52000000,0x52000000,0x52000000,0x1F40000,0x52000000,0x52000000,0x52000000,0x52000000,0xD00000,0xB00000,0xB00000,0x3140000,0x1940000,0x27FC0000,0x52000000,0x52000000,0xE00000,0x1380000,0x49C40000,0x52000000, -0x1600000,0x840274,0x807800F3,0x5C7800F4,0x527800F3,0x76700092,0x5E70004F,0x54700067,0x56700092,0x526C004E,0x4C6C0092,0x746400F4,0x62640033,0x54680051,0x5A60004A,0x52640002,0x4C64004D,0x526000F4,0x4E5C0053,0x4A5C0069,0x466000F5,0xC40274,0x684C00F3,0x526400F4,0x5E480092,0x52540049,0x4C580090,0x5C3800F3,0x52400032,0x4C40004D,0x464C00F4,0x18C0274, -0x521800F3,0x4C140090,0x460800F4,0x40000278,0xCE640004,0xF47800F4,0xF67C014C,0x80640002,0x68640002,0x5A640002,0x52640006,0x50600006,0xCE4C0002,0x7E580000,0x545C0035,0x4C40004D,0x1180274,0x9000F3,0x74840032,0x5A840032,0x52840032,0x6A7C0049,0x5C780002,0x527C0005,0x54780049,0x5078000E,0x4C780049,0xD400F3,0x62640032,0x52740032,0x5A600049,0x52640001, -0x4C680049,0x1B000F3,0x523C0032,0x4C380048,0x460000F4,0xD400F3,0x62640032,0x52740032,0x5A600049,0x52640001,0x4C680049,0x1B000F3,0x523C0032,0x4C380048,0x460000F4,0x1B000F3,0x523C0032,0x4C380048,0x460000F4,0x460000F4,0xCE640003,0xF8800053,0xFC880043,0x80640001,0x68640001,0x5A640001,0x526C0001,0x525C0004,0xD4480001,0x7E580000,0x54580033,0x4C380048, -0x13000F3,0x7800F3,0x7800F3,0x7800F3,0x7800F3,0x6470004A,0x6470004A,0x6470004A,0x4E6C004A,0x4E6C004A,0x466C004A,0x62640033,0x62640033,0x62640033,0x50640002,0x50640002,0x4864000E,0x4A600033,0x4A600033,0x46600005,0x40600035,0x2B000F3,0x2B000F3,0x2B000F3,0x52540049,0x52540049,0x465C004A,0x50440032,0x50440032,0x464C0001,0x42500034,0x16800F3, -0x16800F3,0x462C0048,0x40200034,0x3C0000F4,0xCA640003,0xF0700053,0x7800F3,0x80640001,0x64640002,0x58640002,0x56640003,0x4E600001,0xBE500000,0x7E580000,0x50600033,0x464C0001,0xFC00F3,0x840032,0x840032,0x840032,0x840032,0x587C0001,0x587C0001,0x587C0001,0x4A7C0001,0x4A7C0001,0x46780001,0xC40032,0xC40032,0xC40032,0x4E680001,0x4E680001, 
-0x46700000,0x18C0032,0x18C0032,0x46500000,0x40000034,0xC40032,0xC40032,0xC40032,0x4E680001,0x4E680001,0x46700000,0x18C0032,0x18C0032,0x46500000,0x40000034,0x18C0032,0x18C0032,0x46500000,0x40000034,0x40000034,0xAC6C0000,0xC67C0001,0x840032,0x80640000,0x5E6C0001,0x56680000,0x506C0001,0x4E600000,0xBE500000,0x785C0000,0x1180032,0x46500000, -0x1180032,0x98004A,0x68900001,0x588C0001,0x528C0001,0xE40048,0x5C780001,0x52800001,0x1D00048,0x525C0000,0x4C000048,0xE40048,0x5C780001,0x52800001,0x1D00048,0x525C0000,0x4C000048,0x1D00048,0x525C0000,0x4C000048,0x4C000048,0xE40048,0x5C780001,0x52800001,0x1D00048,0x525C0000,0x4C000048,0x1D00048,0x525C0000,0x4C000048,0x4C000048,0x1D00048, -0x525C0000,0x4C000048,0x4C000048,0x4C000048,0xE25C0000,0xA40048,0xF0900001,0x84600000,0x68600001,0x5C5C0000,0x526C0000,0x52480000,0xD8440000,0x7E580000,0x54700000,0x4C000048,0x1480048,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x6C004A,0x52640001,0x52640001,0x52640001,0x52640001,0x52640001, -0x52640001,0x42600001,0x42600001,0x42600001,0x3C600001,0xA40048,0xA40048,0xA40048,0xA40048,0xA40048,0xA40048,0x464C0001,0x464C0001,0x464C0001,0x3C540001,0x14C0048,0x14C0048,0x14C0048,0x3C300000,0x36000048,0xDA640001,0x6C004A,0x6C004A,0x7C640001,0x66640001,0x5A640001,0x5A640001,0x4C640001,0xBA500000,0x7E580000,0x46600001,0x464C0001, -0xE80048,}; -static const uint32_t g_etc1_to_bc7_m6_table41[] = { -0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0x1580000, -0x1580000,0x1580000,0x1580000,0x38000000,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x4780000,0x4780000,0x4780000,0xA80000,0xF00000,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000, -0xCC0000,0x1A00000,0x1A00000,0x1A00000,0x44000000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0x1A00000,0x1A00000,0x1A00000,0x44000000,0x1A00000,0x1A00000,0x1A00000,0x44000000,0x44000000,0x940000,0x880001,0x880001,0xA00000,0xAC0000,0xBC0000,0xBC0000,0xE80000,0xA00000,0xAC0000,0x1240000,0x1A00000, -0x1240000,0x9C0001,0x9C0001,0x9C0001,0x9C0001,0x2E80000,0x2E80000,0x2E80000,0x1DC0000,0x1DC0000,0x4E000000,0x2E80000,0x2E80000,0x2E80000,0x1DC0000,0x1DC0000,0x4E000000,0x1DC0000,0x1DC0000,0x4E000000,0x4E000000,0x2E80000,0x2E80000,0x2E80000,0x1DC0000,0x1DC0000,0x4E000000,0x1DC0000,0x1DC0000,0x4E000000,0x4E000000,0x1DC0000, -0x1DC0000,0x4E000000,0x4E000000,0x4E000000,0xB80000,0xA80000,0x9C0001,0x2D40000,0x1080000,0x1500000,0x1800000,0x11F40000,0x2C40000,0x2E80000,0x1500000,0x4E000000,0x1500000,0xB40001,0x30C0000,0xBFC0000,0x5A000000,0x30C0000,0xBFC0000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x30C0000,0xBFC0000,0x5A000000,0xBFC0000,0x5A000000, -0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0x30C0000,0xBFC0000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0x5A000000,0xE40000,0x8C00000,0x8C00000,0x1300000,0x1BC0000,0x31FC0000,0x5A000000,0x5A000000,0xF80000,0x1540000,0x51D40000,0x5A000000, 
-0x1800000,0x940274,0x888800F3,0x648800F4,0x5A8800F3,0x7E800092,0x6680004F,0x5C800067,0x5E800092,0x5A7C004E,0x547C0092,0x7C7400F4,0x6A740033,0x5C780051,0x6270004A,0x5A740002,0x5474004D,0x5A7000F4,0x566C0053,0x526C0069,0x4E7000F5,0xDC0274,0x705C00F3,0x5A7400F4,0x66580092,0x5A640049,0x54680090,0x644800F3,0x5A500032,0x5450004D,0x4E5C00F4,0x1BC0274, -0x5A2800F3,0x54240090,0x4E1800F4,0x48000278,0xD6740004,0xFC8800F4,0xFE8C014C,0x88740002,0x70740002,0x62740002,0x5A740006,0x58700006,0xD65C0002,0x86680000,0x5C6C0035,0x5450004D,0x1380274,0xA000F3,0x7C940032,0x62940032,0x5A940032,0x728C0049,0x64880002,0x5A8C0005,0x5C880049,0x5888000E,0x54880049,0xEC00F3,0x6A740032,0x5A840032,0x62700049,0x5A740001, -0x54780049,0x1E400F3,0x5A4C0032,0x54480048,0x4E0000F4,0xEC00F3,0x6A740032,0x5A840032,0x62700049,0x5A740001,0x54780049,0x1E400F3,0x5A4C0032,0x54480048,0x4E0000F4,0x1E400F3,0x5A4C0032,0x54480048,0x4E0000F4,0x4E0000F4,0xD6740003,0xF0900059,0xF498004A,0x88740001,0x70740001,0x62740001,0x5A7C0001,0x5A6C0004,0xDC580001,0x86680000,0x5C680033,0x54480048, -0x15400F3,0x8800F3,0x8800F3,0x8800F3,0x8800F3,0x6C80004A,0x6C80004A,0x6C80004A,0x567C004A,0x567C004A,0x4E7C004A,0x6A740033,0x6A740033,0x6A740033,0x58740002,0x58740002,0x5074000E,0x52700033,0x52700033,0x4E700005,0x48700035,0xC800F3,0xC800F3,0xC800F3,0x5A640049,0x5A640049,0x4E6C004A,0x58540032,0x58540032,0x4E5C0001,0x4A600034,0x19800F3, -0x19800F3,0x4E3C0048,0x48300034,0x440000F4,0xD2740003,0xF8800053,0x8800F3,0x88740001,0x6C740002,0x60740002,0x5E740003,0x56700001,0xC6600000,0x86680000,0x58700033,0x4E5C0001,0x12000F3,0x940032,0x940032,0x940032,0x940032,0x608C0001,0x608C0001,0x608C0001,0x528C0001,0x528C0001,0x4E880001,0xDC0032,0xDC0032,0xDC0032,0x56780001,0x56780001, -0x4E800000,0x1BC0032,0x1BC0032,0x4E600000,0x48000034,0xDC0032,0xDC0032,0xDC0032,0x56780001,0x56780001,0x4E800000,0x1BC0032,0x1BC0032,0x4E600000,0x48000034,0x1BC0032,0x1BC0032,0x4E600000,0x48000034,0x48000034,0xB47C0000,0xCE8C0001,0x940032,0x88740000,0x667C0001,0x5E780000,0x587C0001,0x56700000,0xC6600000,0x806C0000,0x1380032,0x4E600000, -0x1380032,0xA8004A,0x70A00001,0x609C0001,0x5A9C0001,0xFC0048,0x64880001,0x5A900001,0x3F80048,0x5A6C0000,0x54000048,0xFC0048,0x64880001,0x5A900001,0x3F80048,0x5A6C0000,0x54000048,0x3F80048,0x5A6C0000,0x54000048,0x54000048,0xFC0048,0x64880001,0x5A900001,0x3F80048,0x5A6C0000,0x54000048,0x3F80048,0x5A6C0000,0x54000048,0x54000048,0x3F80048, -0x5A6C0000,0x54000048,0x54000048,0x54000048,0xEA6C0000,0x2B40048,0xF8A00001,0x8C700000,0x70700001,0x646C0000,0x5A7C0000,0x5A580000,0xE0540000,0x86680000,0x5C800000,0x54000048,0x1680048,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x7C004A,0x5A740001,0x5A740001,0x5A740001,0x5A740001,0x5A740001, -0x5A740001,0x4A700001,0x4A700001,0x4A700001,0x44700001,0xBC0048,0xBC0048,0xBC0048,0xBC0048,0xBC0048,0xBC0048,0x4E5C0001,0x4E5C0001,0x4E5C0001,0x44640001,0x17C0048,0x17C0048,0x17C0048,0x44400000,0x3E000048,0xE2740001,0x7C004A,0x7C004A,0x84740001,0x6E740001,0x62740001,0x62740001,0x54740001,0xC2600000,0x86680000,0x4E700001,0x4E5C0001, -0x10C0048,}; -static const uint32_t g_etc1_to_bc7_m6_table42[] = { -0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0x1880000, 
-0x1880000,0x1880000,0x1880000,0x40000000,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0xC880000,0xC880000,0xC880000,0xC00000,0x1140000,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000, -0xE40000,0x1D00000,0x1D00000,0x1D00000,0x4C000000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0x1D00000,0x1D00000,0x1D00000,0x4C000000,0x1D00000,0x1D00000,0x1D00000,0x4C000000,0x4C000000,0xA40000,0x980001,0x980001,0xB40000,0xC00000,0xD00000,0xD00000,0x3000000,0xB40000,0xC00000,0x1480000,0x1D00000, -0x1480000,0xAC0001,0xAC0001,0xAC0001,0xAC0001,0x3000000,0x3000000,0x3000000,0x5FC0000,0x5FC0000,0x56000000,0x3000000,0x3000000,0x3000000,0x5FC0000,0x5FC0000,0x56000000,0x5FC0000,0x5FC0000,0x56000000,0x56000000,0x3000000,0x3000000,0x3000000,0x5FC0000,0x5FC0000,0x56000000,0x5FC0000,0x5FC0000,0x56000000,0x56000000,0x5FC0000, -0x5FC0000,0x56000000,0x56000000,0x56000000,0x4C80000,0x4B80000,0xAC0001,0xEC0000,0x1240000,0x1700000,0x1A80000,0x1BF80000,0x2D80000,0x3000000,0x1700000,0x56000000,0x1700000,0xC40001,0x3240000,0x17FC0000,0x62000000,0x3240000,0x17FC0000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x3240000,0x17FC0000,0x62000000,0x17FC0000,0x62000000, -0x62000000,0x17FC0000,0x62000000,0x62000000,0x62000000,0x3240000,0x17FC0000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x62000000,0x62000000,0xF80000,0xD40000,0xD40000,0x14C0000,0x1E40000,0x3BFC0000,0x62000000,0x62000000,0x10C0000,0x1740000,0x59E40000,0x62000000, -0x1A40000,0xA40274,0x909800F3,0x6C9800F4,0x629800F3,0x86900092,0x6E90004F,0x64900067,0x66900092,0x628C004E,0x5C8C0092,0x848400F4,0x72840033,0x64880051,0x6A80004A,0x62840002,0x5C84004D,0x628000F4,0x5E7C0053,0x5A7C0069,0x568000F5,0xF40274,0x786C00F3,0x628400F4,0x6E680092,0x62740049,0x5C780090,0x6C5800F3,0x62600032,0x5C60004D,0x566C00F4,0x1F00274, -0x623800F3,0x5C340090,0x562800F4,0x50000278,0xDE840004,0xF498010A,0xF8A0015B,0x90840002,0x78840002,0x6A840002,0x62840006,0x60800006,0xDE6C0002,0x8E780000,0x647C0035,0x5C60004D,0x15C0274,0xB000F3,0x84A40032,0x6AA40032,0x62A40032,0x7A9C0049,0x6C980002,0x629C0005,0x64980049,0x6098000E,0x5C980049,0x10400F3,0x72840032,0x62940032,0x6A800049,0x62840001, -0x5C880049,0x7FC00F3,0x625C0032,0x5C580048,0x560000F4,0x10400F3,0x72840032,0x62940032,0x6A800049,0x62840001,0x5C880049,0x7FC00F3,0x625C0032,0x5C580048,0x560000F4,0x7FC00F3,0x625C0032,0x5C580048,0x560000F4,0x560000F4,0xDE840003,0xF8A00059,0xFCA8004A,0x90840001,0x78840001,0x6A840001,0x628C0001,0x627C0004,0xE4680001,0x8E780000,0x64780033,0x5C580048, -0x17400F3,0x9800F3,0x9800F3,0x9800F3,0x9800F3,0x7490004A,0x7490004A,0x7490004A,0x5E8C004A,0x5E8C004A,0x568C004A,0x72840033,0x72840033,0x72840033,0x60840002,0x60840002,0x5884000E,0x5A800033,0x5A800033,0x56800005,0x50800035,0xE000F3,0xE000F3,0xE000F3,0x62740049,0x62740049,0x567C004A,0x60640032,0x60640032,0x566C0001,0x52700034,0x1CC00F3, -0x1CC00F3,0x564C0048,0x50400034,0x4C0000F4,0xDA840003,0xF090005A,0x9800F3,0x90840001,0x74840002,0x68840002,0x66840003,0x5E800001,0xCE700000,0x8E780000,0x60800033,0x566C0001,0x14000F3,0xA40032,0xA40032,0xA40032,0xA40032,0x689C0001,0x689C0001,0x689C0001,0x5A9C0001,0x5A9C0001,0x56980001,0xF40032,0xF40032,0xF40032,0x5E880001,0x5E880001, 
-0x56900000,0x1F00032,0x1F00032,0x56700000,0x50000034,0xF40032,0xF40032,0xF40032,0x5E880001,0x5E880001,0x56900000,0x1F00032,0x1F00032,0x56700000,0x50000034,0x1F00032,0x1F00032,0x56700000,0x50000034,0x50000034,0xBC8C0000,0xD69C0001,0xA40032,0x90840000,0x6E8C0001,0x66880000,0x608C0001,0x5E800000,0xCE700000,0x887C0000,0x15C0032,0x56700000, -0x15C0032,0xB8004A,0x78B00001,0x68AC0001,0x62AC0001,0x1140048,0x6C980001,0x62A00001,0xFF80048,0x627C0000,0x5C000048,0x1140048,0x6C980001,0x62A00001,0xFF80048,0x627C0000,0x5C000048,0xFF80048,0x627C0000,0x5C000048,0x5C000048,0x1140048,0x6C980001,0x62A00001,0xFF80048,0x627C0000,0x5C000048,0xFF80048,0x627C0000,0x5C000048,0x5C000048,0xFF80048, -0x627C0000,0x5C000048,0x5C000048,0x5C000048,0xF27C0000,0xAC40048,0xF0B00002,0x94800000,0x78800001,0x6C7C0000,0x628C0000,0x62680000,0xE8640000,0x8E780000,0x64900000,0x5C000048,0x18C0048,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x8C004A,0x62840001,0x62840001,0x62840001,0x62840001,0x62840001, -0x62840001,0x52800001,0x52800001,0x52800001,0x4C800001,0x2D00048,0x2D00048,0x2D00048,0x2D00048,0x2D00048,0x2D00048,0x566C0001,0x566C0001,0x566C0001,0x4C740001,0x1AC0048,0x1AC0048,0x1AC0048,0x4C500000,0x46000048,0xEA840001,0x8C004A,0x8C004A,0x8C840001,0x76840001,0x6A840001,0x6A840001,0x5C840001,0xCA700000,0x8E780000,0x56800001,0x566C0001, -0x12C0048,}; -static const uint32_t g_etc1_to_bc7_m6_table43[] = { -0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0x1B80000, -0x1B80000,0x1B80000,0x1B80000,0x48000000,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x9C0000,0x9C0000,0x9C0000,0xD80000,0x1340000,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000, -0xFC0000,0x3F80000,0x3F80000,0x3F80000,0x54000000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0x3F80000,0x3F80000,0x3F80000,0x54000000,0x3F80000,0x3F80000,0x3F80000,0x54000000,0x54000000,0x2B40000,0xA80001,0xA80001,0x2C40000,0xD40000,0xE80000,0xE80000,0x11C0000,0x2C40000,0xD40000,0x1680000,0x3F80000, -0x1680000,0xBC0001,0xBC0001,0xBC0001,0xBC0001,0x3180000,0x3180000,0x3180000,0x11FC0000,0x11FC0000,0x5E000000,0x3180000,0x3180000,0x3180000,0x11FC0000,0x11FC0000,0x5E000000,0x11FC0000,0x11FC0000,0x5E000000,0x5E000000,0x3180000,0x3180000,0x3180000,0x11FC0000,0x11FC0000,0x5E000000,0x11FC0000,0x11FC0000,0x5E000000,0x5E000000,0x11FC0000, -0x11FC0000,0x5E000000,0x5E000000,0x5E000000,0xDC0000,0xCC80000,0xBC0001,0x3000000,0x1400000,0x1940000,0x1D00000,0x25FC0000,0x2EC0000,0x3180000,0x1940000,0x5E000000,0x1940000,0xD40001,0x33C0000,0x23FC0000,0x6A000000,0x33C0000,0x23FC0000,0x6A000000,0x23FC0000,0x6A000000,0x6A000000,0x33C0000,0x23FC0000,0x6A000000,0x23FC0000,0x6A000000, -0x6A000000,0x23FC0000,0x6A000000,0x6A000000,0x6A000000,0x33C0000,0x23FC0000,0x6A000000,0x23FC0000,0x6A000000,0x6A000000,0x23FC0000,0x6A000000,0x6A000000,0x6A000000,0x23FC0000,0x6A000000,0x6A000000,0x6A000000,0x6A000000,0x10C0000,0xE40000,0xE40000,0x1680000,0x7F80000,0x45FC0000,0x6A000000,0x6A000000,0x1240000,0x1900000,0x61F40000,0x6A000000, 
-0x1C80000,0xB40274,0x98A800F3,0x74A800F4,0x6AA800F3,0x8EA00092,0x76A0004F,0x6CA00067,0x6EA00092,0x6A9C004E,0x649C0092,0x8C9400F4,0x7A940033,0x6C980051,0x7290004A,0x6A940002,0x6494004D,0x6A9000F4,0x668C0053,0x628C0069,0x5E9000F5,0x10C0274,0x807C00F3,0x6A9400F4,0x76780092,0x6A840049,0x64880090,0x746800F3,0x6A700032,0x6470004D,0x5E7C00F4,0xBF80274, -0x6A4800F3,0x64440090,0x5E3800F4,0x58000278,0xE6940004,0xFCA8010A,0xFEAC015F,0x98940002,0x80940002,0x72940002,0x6A940006,0x68900006,0xE67C0002,0x96880000,0x6C8C0035,0x6470004D,0x17C0274,0xC000F3,0x8CB40032,0x72B40032,0x6AB40032,0x82AC0049,0x74A80002,0x6AAC0005,0x6CA80049,0x68A8000E,0x64A80049,0x11C00F3,0x7A940032,0x6AA40032,0x72900049,0x6A940001, -0x64980049,0x13FC00F3,0x6A6C0032,0x64680048,0x5E0000F4,0x11C00F3,0x7A940032,0x6AA40032,0x72900049,0x6A940001,0x64980049,0x13FC00F3,0x6A6C0032,0x64680048,0x5E0000F4,0x13FC00F3,0x6A6C0032,0x64680048,0x5E0000F4,0x5E0000F4,0xE6940003,0xF0B00063,0xF4B80053,0x98940001,0x80940001,0x72940001,0x6A9C0001,0x6A8C0004,0xEC780001,0x96880000,0x6C880033,0x64680048, -0x19800F3,0xA800F3,0xA800F3,0xA800F3,0xA800F3,0x7CA0004A,0x7CA0004A,0x7CA0004A,0x669C004A,0x669C004A,0x5E9C004A,0x7A940033,0x7A940033,0x7A940033,0x68940002,0x68940002,0x6094000E,0x62900033,0x62900033,0x5E900005,0x58900035,0xF800F3,0xF800F3,0xF800F3,0x6A840049,0x6A840049,0x5E8C004A,0x68740032,0x68740032,0x5E7C0001,0x5A800034,0x1FC00F3, -0x1FC00F3,0x5E5C0048,0x58500034,0x540000F4,0xE2940003,0xF8A0005A,0xA800F3,0x98940001,0x7C940002,0x70940002,0x6E940003,0x66900001,0xD6800000,0x96880000,0x68900033,0x5E7C0001,0x16400F3,0xB40032,0xB40032,0xB40032,0xB40032,0x70AC0001,0x70AC0001,0x70AC0001,0x62AC0001,0x62AC0001,0x5EA80001,0x10C0032,0x10C0032,0x10C0032,0x66980001,0x66980001, -0x5EA00000,0xBF80032,0xBF80032,0x5E800000,0x58000034,0x10C0032,0x10C0032,0x10C0032,0x66980001,0x66980001,0x5EA00000,0xBF80032,0xBF80032,0x5E800000,0x58000034,0xBF80032,0xBF80032,0x5E800000,0x58000034,0x58000034,0xC49C0000,0xDEAC0001,0xB40032,0x98940000,0x769C0001,0x6E980000,0x689C0001,0x66900000,0xD6800000,0x908C0000,0x17C0032,0x5E800000, -0x17C0032,0xC8004A,0x80C00001,0x70BC0001,0x6ABC0001,0x12C0048,0x74A80001,0x6AB00001,0x1BF80048,0x6A8C0000,0x64000048,0x12C0048,0x74A80001,0x6AB00001,0x1BF80048,0x6A8C0000,0x64000048,0x1BF80048,0x6A8C0000,0x64000048,0x64000048,0x12C0048,0x74A80001,0x6AB00001,0x1BF80048,0x6A8C0000,0x64000048,0x1BF80048,0x6A8C0000,0x64000048,0x64000048,0x1BF80048, -0x6A8C0000,0x64000048,0x64000048,0x64000048,0xFA8C0000,0xD80048,0xF8C00002,0x9C900000,0x80900001,0x748C0000,0x6A9C0000,0x6A780000,0xF0740000,0x96880000,0x6CA00000,0x64000048,0x1AC0048,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x9C004A,0x6A940001,0x6A940001,0x6A940001,0x6A940001,0x6A940001, -0x6A940001,0x5A900001,0x5A900001,0x5A900001,0x54900001,0x2E80048,0x2E80048,0x2E80048,0x2E80048,0x2E80048,0x2E80048,0x5E7C0001,0x5E7C0001,0x5E7C0001,0x54840001,0x1DC0048,0x1DC0048,0x1DC0048,0x54600000,0x4E000048,0xF2940001,0x9C004A,0x9C004A,0x94940001,0x7E940001,0x72940001,0x72940001,0x64940001,0xD2800000,0x96880000,0x5E900001,0x5E7C0001, -0x1500048,}; -static const uint32_t g_etc1_to_bc7_m6_table44[] = { -0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0x1F00000, 
-0x1F00000,0x1F00000,0x1F00000,0x50000001,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xEAC0000,0xEAC0000,0xEAC0000,0xF40000,0x15C0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000, -0x1180000,0x11F80000,0x11F80000,0x11F80000,0x5C000001,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x11F80000,0x11F80000,0x11F80000,0x5C000001,0x11F80000,0x11F80000,0x11F80000,0x5C000001,0x5C000001,0xC80000,0xBC0000,0xBC0000,0x4D80000,0xEC0000,0x1000000,0x1000000,0x13C0000,0x4D80000,0xEC0000,0x1900000,0x11F80000, -0x1900000,0xD00000,0xD00000,0xD00000,0xD00000,0x1340000,0x1340000,0x1340000,0x1FF80000,0x1FF80000,0x66000001,0x1340000,0x1340000,0x1340000,0x1FF80000,0x1FF80000,0x66000001,0x1FF80000,0x1FF80000,0x66000001,0x66000001,0x1340000,0x1340000,0x1340000,0x1FF80000,0x1FF80000,0x66000001,0x1FF80000,0x1FF80000,0x66000001,0x66000001,0x1FF80000, -0x1FF80000,0x66000001,0x66000001,0x66000001,0x2F00000,0x6DC0000,0xD00000,0x3180000,0x15C0000,0x1B80000,0x1FC0000,0x33F40000,0x1040000,0x1340000,0x1B80000,0x66000001,0x1B80000,0xE80000,0x1580000,0x31F80000,0x72000001,0x1580000,0x31F80000,0x72000001,0x31F80000,0x72000001,0x72000001,0x1580000,0x31F80000,0x72000001,0x31F80000,0x72000001, -0x72000001,0x31F80000,0x72000001,0x72000001,0x72000001,0x1580000,0x31F80000,0x72000001,0x31F80000,0x72000001,0x72000001,0x31F80000,0x72000001,0x72000001,0x72000001,0x31F80000,0x72000001,0x72000001,0x72000001,0x72000001,0x5200000,0xF80000,0xF80000,0x1840000,0x15FC0000,0x51F80000,0x72000001,0x72000001,0x13C0000,0x1B40000,0x6BE80000,0x72000001, -0x1EC0000,0xC40278,0xA0BC00F4,0x7EBC00F4,0x72BC00F5,0x9AB00090,0x82B0004D,0x76B40069,0x78B00090,0x72B0004D,0x6CB00092,0x96A400F3,0x84A40032,0x76AC0053,0x7AA40049,0x72A40002,0x6EA4004E,0x72A400F4,0x70A00051,0x6CA00067,0x68A400F3,0x3240274,0x889000F3,0x72A400F4,0x80880092,0x7494004A,0x6C9C0092,0x7E7800F3,0x72840033,0x6C84004F,0x688C00F4,0x17FC0274, -0x726000F4,0x6C580092,0x684400F3,0x62000274,0xF6A40005,0xF6BC0120,0xF8C0016C,0xA4A40000,0x8AA40001,0x7AA40002,0x74A80006,0x72A40006,0xEE8C0002,0x9C9C0002,0x74A00035,0x6C84004F,0x1A40274,0xD000F4,0x94C80034,0x7CC40034,0x72C40035,0x8EBC0048,0x7CBC0001,0x74BC0005,0x74BC004A,0x72B8000E,0x6CBC004A,0x13800F3,0x82A80032,0x74B40033,0x7AA40049,0x72A40002, -0x6CAC004A,0x21F800F3,0x72840033,0x6C7C004A,0x680000F3,0x13800F3,0x82A80032,0x74B40033,0x7AA40049,0x72A40002,0x6CAC004A,0x21F800F3,0x72840033,0x6C7C004A,0x680000F3,0x21F800F3,0x72840033,0x6C7C004A,0x680000F3,0x680000F3,0xEAA80003,0xFAC40060,0xFECC0054,0xA4A40000,0x8AA40001,0x7AA40002,0x74AC0001,0x729C0003,0xEE8C0001,0x9C9C0001,0x76980032,0x6C7C004A, -0x1BC00F3,0xBC00F4,0xBC00F4,0xBC00F4,0xBC00F4,0x88B00048,0x88B00048,0x88B00048,0x6EB00049,0x6EB00049,0x66B00049,0x84A40032,0x84A40032,0x84A40032,0x72A40001,0x72A40001,0x68A8000E,0x6AA40032,0x6AA40032,0x66A40005,0x62A40032,0x11400F3,0x11400F3,0x11400F3,0x74940049,0x74940049,0x66A00049,0x72840032,0x72840032,0x66900002,0x62940032,0xFF800F3, -0xFF800F3,0x66700049,0x625C0032,0x5C0000F3,0xEAA40004,0xF2B40060,0xBC00F4,0xA4A40000,0x86A40001,0x7AA40001,0x76A40004,0x6EA40001,0xE4900000,0x9C9C0001,0x72A00032,0x66900002,0x18C00F3,0xC40034,0xC40034,0xC40034,0xC40034,0x7ABC0000,0x7ABC0000,0x7ABC0000,0x6CBC0000,0x6CBC0000,0x66BC0001,0x3240032,0x3240032,0x3240032,0x6EAC0001,0x6EAC0001, 
-0x66B40001,0x17FC0032,0x17FC0032,0x66980001,0x62000032,0x3240032,0x3240032,0x3240032,0x6EAC0001,0x6EAC0001,0x66B40001,0x17FC0032,0x17FC0032,0x66980001,0x62000032,0x17FC0032,0x17FC0032,0x66980001,0x62000032,0x62000032,0xD2AC0000,0xF6BC0000,0xC40034,0xA4A40000,0x84A80000,0x78A80000,0x74AC0000,0x6EA40001,0xE4900000,0x9C9C0000,0x1A40032,0x66980001, -0x1A40032,0xDC0048,0x8CD00000,0x78D00001,0x72D00001,0x3440048,0x7CBC0001,0x72C40001,0x27FC0048,0x72A00001,0x6C00004A,0x3440048,0x7CBC0001,0x72C40001,0x27FC0048,0x72A00001,0x6C00004A,0x27FC0048,0x72A00001,0x6C00004A,0x6C00004A,0x3440048,0x7CBC0001,0x72C40001,0x27FC0048,0x72A00001,0x6C00004A,0x27FC0048,0x72A00001,0x6C00004A,0x6C00004A,0x27FC0048, -0x72A00001,0x6C00004A,0x6C00004A,0x6C00004A,0xF6A40001,0xCE80048,0xF2D40005,0xA4A40000,0x88A40001,0x7CA00000,0x72B00001,0x72900001,0xF6880000,0xA0980000,0x74B40001,0x6C00004A,0x1D40048,0xB00048,0xB00048,0xB00048,0xB00048,0xB00048,0xB00048,0xB00048,0xB00048,0xB00048,0xB00048,0x76A40000,0x76A40000,0x76A40000,0x76A40000,0x76A40000, -0x76A40000,0x62A40001,0x62A40001,0x62A40001,0x5CA40001,0x1040048,0x1040048,0x1040048,0x1040048,0x1040048,0x1040048,0x66900001,0x66900001,0x66900001,0x5C980001,0x7FC0048,0x7FC0048,0x7FC0048,0x5C740001,0x5600004A,0xFAA40001,0xB00048,0xB00048,0xA4A40000,0x8AA40000,0x7EA40000,0x7EA40000,0x6EA40000,0xE0900000,0xA4980000,0x6AA00000,0x66900001, -0x1740048,}; -static const uint32_t g_etc1_to_bc7_m6_table45[] = { -0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0xBF80000, -0xBF80000,0xBF80000,0xBF80000,0x58000001,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xC00000,0xC00000,0xC00000,0x10C0000,0x17C0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000, -0x1300000,0x1DF40000,0x1DF40000,0x1DF40000,0x64000001,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1DF40000,0x1DF40000,0x1DF40000,0x64000001,0x1DF40000,0x1DF40000,0x1DF40000,0x64000001,0x64000001,0x4D80000,0xCC0000,0xCC0000,0xEC0000,0x1000000,0x1140000,0x1140000,0x3540000,0xEC0000,0x1000000,0x1B00000,0x1DF40000, -0x1B00000,0xE00000,0xE00000,0xE00000,0xE00000,0x14C0000,0x14C0000,0x14C0000,0x2BF80000,0x2BF80000,0x6E000001,0x14C0000,0x14C0000,0x14C0000,0x2BF80000,0x2BF80000,0x6E000001,0x2BF80000,0x2BF80000,0x6E000001,0x6E000001,0x14C0000,0x14C0000,0x14C0000,0x2BF80000,0x2BF80000,0x6E000001,0x2BF80000,0x2BF80000,0x6E000001,0x6E000001,0x2BF80000, -0x2BF80000,0x6E000001,0x6E000001,0x6E000001,0x1040000,0xEEC0000,0xE00000,0x1300000,0x1780000,0x1DC0000,0xFFC0000,0x3DF80000,0x1180000,0x14C0000,0x1DC0000,0x6E000001,0x1DC0000,0xF80000,0x1700000,0x3DF80000,0x7A000001,0x1700000,0x3DF80000,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x1700000,0x3DF80000,0x7A000001,0x3DF80000,0x7A000001, -0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x1700000,0x3DF80000,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x7A000001,0x5340000,0x1080000,0x1080000,0x1A00000,0x23FC0000,0x5BF80000,0x7A000001,0x7A000001,0x1500000,0x1D00000,0x73F80000,0x7A000001, 
-0x9FC0000,0xD40278,0xA8CC00F4,0x86CC00F4,0x7ACC00F5,0xA2C00090,0x8AC0004D,0x7EC40069,0x80C00090,0x7AC0004D,0x74C00092,0x9EB400F3,0x8CB40032,0x7EBC0053,0x82B40049,0x7AB40002,0x76B4004E,0x7AB400F4,0x78B00051,0x74B00067,0x70B400F3,0x33C0274,0x90A000F3,0x7AB400F4,0x88980092,0x7CA4004A,0x74AC0092,0x868800F3,0x7A940033,0x7494004F,0x709C00F4,0x23FC0274, -0x7A7000F4,0x74680092,0x705400F3,0x6A000274,0xFEB40005,0xFECC0120,0xF0D00181,0xACB40000,0x92B40001,0x82B40002,0x7CB80006,0x7AB40006,0xF69C0002,0xA4AC0002,0x7CB00035,0x7494004F,0x1C80274,0xE000F4,0x9CD80034,0x84D40034,0x7AD40035,0x96CC0048,0x84CC0001,0x7CCC0005,0x7CCC004A,0x7AC8000E,0x74CC004A,0x15000F3,0x8AB80032,0x7CC40033,0x82B40049,0x7AB40002, -0x74BC004A,0x2DF800F3,0x7A940033,0x748C004A,0x700000F3,0x15000F3,0x8AB80032,0x7CC40033,0x82B40049,0x7AB40002,0x74BC004A,0x2DF800F3,0x7A940033,0x748C004A,0x700000F3,0x2DF800F3,0x7A940033,0x748C004A,0x700000F3,0x700000F3,0xF2B80003,0xF2D4006A,0xF6DC005D,0xACB40000,0x92B40001,0x82B40002,0x7CBC0001,0x7AAC0003,0xF69C0001,0xA4AC0001,0x7EA80032,0x748C004A, -0x1E000F3,0xCC00F4,0xCC00F4,0xCC00F4,0xCC00F4,0x90C00048,0x90C00048,0x90C00048,0x76C00049,0x76C00049,0x6EC00049,0x8CB40032,0x8CB40032,0x8CB40032,0x7AB40001,0x7AB40001,0x70B8000E,0x72B40032,0x72B40032,0x6EB40005,0x6AB40032,0x12C00F3,0x12C00F3,0x12C00F3,0x7CA40049,0x7CA40049,0x6EB00049,0x7A940032,0x7A940032,0x6EA00002,0x6AA40032,0x1BF800F3, -0x1BF800F3,0x6E800049,0x6A6C0032,0x640000F3,0xF2B40004,0xFAC40060,0xCC00F4,0xACB40000,0x8EB40001,0x82B40001,0x7EB40004,0x76B40001,0xECA00000,0xA4AC0001,0x7AB00032,0x6EA00002,0x1AC00F3,0xD40034,0xD40034,0xD40034,0xD40034,0x82CC0000,0x82CC0000,0x82CC0000,0x74CC0000,0x74CC0000,0x6ECC0001,0x33C0032,0x33C0032,0x33C0032,0x76BC0001,0x76BC0001, -0x6EC40001,0x23FC0032,0x23FC0032,0x6EA80001,0x6A000032,0x33C0032,0x33C0032,0x33C0032,0x76BC0001,0x76BC0001,0x6EC40001,0x23FC0032,0x23FC0032,0x6EA80001,0x6A000032,0x23FC0032,0x23FC0032,0x6EA80001,0x6A000032,0x6A000032,0xDABC0000,0xFECC0000,0xD40034,0xACB40000,0x8CB80000,0x80B80000,0x7CBC0000,0x76B40001,0xECA00000,0xA4AC0000,0x1C80032,0x6EA80001, -0x1C80032,0xEC0048,0x94E00000,0x80E00001,0x7AE00001,0x35C0048,0x84CC0001,0x7AD40001,0x33FC0048,0x7AB00001,0x7400004A,0x35C0048,0x84CC0001,0x7AD40001,0x33FC0048,0x7AB00001,0x7400004A,0x33FC0048,0x7AB00001,0x7400004A,0x7400004A,0x35C0048,0x84CC0001,0x7AD40001,0x33FC0048,0x7AB00001,0x7400004A,0x33FC0048,0x7AB00001,0x7400004A,0x7400004A,0x33FC0048, -0x7AB00001,0x7400004A,0x7400004A,0x7400004A,0xFEB40001,0xFC0048,0xFAE40005,0xACB40000,0x90B40001,0x84B00000,0x7AC00001,0x7AA00001,0xFE980000,0xA8A80000,0x7CC40001,0x7400004A,0x1F40048,0xC00048,0xC00048,0xC00048,0xC00048,0xC00048,0xC00048,0xC00048,0xC00048,0xC00048,0xC00048,0x7EB40000,0x7EB40000,0x7EB40000,0x7EB40000,0x7EB40000, -0x7EB40000,0x6AB40001,0x6AB40001,0x6AB40001,0x64B40001,0x11C0048,0x11C0048,0x11C0048,0x11C0048,0x11C0048,0x11C0048,0x6EA00001,0x6EA00001,0x6EA00001,0x64A80001,0x13FC0048,0x13FC0048,0x13FC0048,0x64840001,0x5E00004A,0xF2B40004,0xC00048,0xC00048,0xACB40000,0x92B40000,0x86B40000,0x86B40000,0x76B40000,0xE8A00000,0xACA80000,0x72B00000,0x6EA00001, -0x1980048,}; -static const uint32_t g_etc1_to_bc7_m6_table46[] = { -0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x17F80000, 
-0x17F80000,0x17F80000,0x17F80000,0x60000001,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xD00000,0xD00000,0xD00000,0x1240000,0x1A00000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0x3440000,0x3440000,0x3440000,0x3440000,0x3440000, -0x3440000,0x27FC0000,0x27FC0000,0x27FC0000,0x6C000001,0x3440000,0x3440000,0x3440000,0x3440000,0x3440000,0x3440000,0x27FC0000,0x27FC0000,0x27FC0000,0x6C000001,0x27FC0000,0x27FC0000,0x27FC0000,0x6C000001,0x6C000001,0xCE80000,0xDC0000,0xDC0000,0x1000000,0x1140000,0x12C0000,0x12C0000,0x1700000,0x1000000,0x1140000,0x1D40000,0x27FC0000, -0x1D40000,0xF00000,0xF00000,0xF00000,0xF00000,0x1640000,0x1640000,0x1640000,0x37F80000,0x37F80000,0x76000001,0x1640000,0x1640000,0x1640000,0x37F80000,0x37F80000,0x76000001,0x37F80000,0x37F80000,0x76000001,0x76000001,0x1640000,0x1640000,0x1640000,0x37F80000,0x37F80000,0x76000001,0x37F80000,0x37F80000,0x76000001,0x76000001,0x37F80000, -0x37F80000,0x76000001,0x76000001,0x76000001,0x1180000,0x1000000,0xF00000,0x3440000,0x1940000,0x1FC0000,0x1DF80000,0x47FC0000,0x12C0000,0x1640000,0x1FC0000,0x76000001,0x1FC0000,0x1080000,0x1880000,0x49F80000,0x82000001,0x1880000,0x49F80000,0x82000001,0x49F80000,0x82000001,0x82000001,0x1880000,0x49F80000,0x82000001,0x49F80000,0x82000001, -0x82000001,0x49F80000,0x82000001,0x82000001,0x82000001,0x1880000,0x49F80000,0x82000001,0x49F80000,0x82000001,0x82000001,0x49F80000,0x82000001,0x82000001,0x82000001,0x49F80000,0x82000001,0x82000001,0x82000001,0x82000001,0x14C0000,0x5180000,0x5180000,0x1BC0000,0x31FC0000,0x65F80000,0x82000001,0x82000001,0x1680000,0x1F00000,0x7DCC0000,0x82000001, -0x19FC0000,0xE40278,0xB0DC00F4,0x8EDC00F4,0x82DC00F5,0xAAD00090,0x92D0004D,0x86D40069,0x88D00090,0x82D0004D,0x7CD00092,0xA6C400F3,0x94C40032,0x86CC0053,0x8AC40049,0x82C40002,0x7EC4004E,0x82C400F4,0x80C00051,0x7CC00067,0x78C400F3,0x1540274,0x98B000F3,0x82C400F4,0x90A80092,0x84B4004A,0x7CBC0092,0x8E9800F3,0x82A40033,0x7CA4004F,0x78AC00F4,0x2FFC0274, -0x828000F4,0x7C780092,0x786400F3,0x72000274,0xFAC80007,0xF6DC013A,0xF8E00181,0xB4C40000,0x9AC40001,0x8AC40002,0x84C80006,0x82C40006,0xFEAC0002,0xACBC0002,0x84C00035,0x7CA4004F,0x1E80274,0xF000F4,0xA4E80034,0x8CE40034,0x82E40035,0x9EDC0048,0x8CDC0001,0x84DC0005,0x84DC004A,0x82D8000E,0x7CDC004A,0x16800F3,0x92C80032,0x84D40033,0x8AC40049,0x82C40002, -0x7CCC004A,0x39F800F3,0x82A40033,0x7C9C004A,0x780000F3,0x16800F3,0x92C80032,0x84D40033,0x8AC40049,0x82C40002,0x7CCC004A,0x39F800F3,0x82A40033,0x7C9C004A,0x780000F3,0x39F800F3,0x82A40033,0x7C9C004A,0x780000F3,0x780000F3,0xFAC80003,0xFAE4006A,0xFEEC005D,0xB4C40000,0x9AC40001,0x8AC40002,0x84CC0001,0x82BC0003,0xFEAC0001,0xACBC0001,0x86B80032,0x7C9C004A, -0x3FC00F3,0xDC00F4,0xDC00F4,0xDC00F4,0xDC00F4,0x98D00048,0x98D00048,0x98D00048,0x7ED00049,0x7ED00049,0x76D00049,0x94C40032,0x94C40032,0x94C40032,0x82C40001,0x82C40001,0x78C8000E,0x7AC40032,0x7AC40032,0x76C40005,0x72C40032,0x14400F3,0x14400F3,0x14400F3,0x84B40049,0x84B40049,0x76C00049,0x82A40032,0x82A40032,0x76B00002,0x72B40032,0x27F800F3, -0x27F800F3,0x76900049,0x727C0032,0x6C0000F3,0xFAC40004,0xF2D40069,0xDC00F4,0xB4C40000,0x96C40001,0x8AC40001,0x86C40004,0x7EC40001,0xF4B00000,0xACBC0001,0x82C00032,0x76B00002,0x1D000F3,0xE40034,0xE40034,0xE40034,0xE40034,0x8ADC0000,0x8ADC0000,0x8ADC0000,0x7CDC0000,0x7CDC0000,0x76DC0001,0x1540032,0x1540032,0x1540032,0x7ECC0001,0x7ECC0001, 
-0x76D40001,0x2FFC0032,0x2FFC0032,0x76B80001,0x72000032,0x1540032,0x1540032,0x1540032,0x7ECC0001,0x7ECC0001,0x76D40001,0x2FFC0032,0x2FFC0032,0x76B80001,0x72000032,0x2FFC0032,0x2FFC0032,0x76B80001,0x72000032,0x72000032,0xE2CC0000,0xF6DC0001,0xE40034,0xB4C40000,0x94C80000,0x88C80000,0x84CC0000,0x7EC40001,0xF4B00000,0xACBC0000,0x1E80032,0x76B80001, -0x1E80032,0xFC0048,0x9CF00000,0x88F00001,0x82F00001,0x3740048,0x8CDC0001,0x82E40001,0x3FFC0048,0x82C00001,0x7C00004A,0x3740048,0x8CDC0001,0x82E40001,0x3FFC0048,0x82C00001,0x7C00004A,0x3FFC0048,0x82C00001,0x7C00004A,0x7C00004A,0x3740048,0x8CDC0001,0x82E40001,0x3FFC0048,0x82C00001,0x7C00004A,0x3FFC0048,0x82C00001,0x7C00004A,0x7C00004A,0x3FFC0048, -0x82C00001,0x7C00004A,0x7C00004A,0x7C00004A,0xFAC80002,0x10C0048,0xF2F40008,0xB4C40000,0x98C40001,0x8CC00000,0x82D00001,0x82B00001,0xFEAC0001,0xB0B80000,0x84D40001,0x7C00004A,0xDFC0048,0xD00048,0xD00048,0xD00048,0xD00048,0xD00048,0xD00048,0xD00048,0xD00048,0xD00048,0xD00048,0x86C40000,0x86C40000,0x86C40000,0x86C40000,0x86C40000, -0x86C40000,0x72C40001,0x72C40001,0x72C40001,0x6CC40001,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x76B00001,0x76B00001,0x76B00001,0x6CB80001,0x1FF80048,0x1FF80048,0x1FF80048,0x6C940001,0x6600004A,0xFAC40004,0xD00048,0xD00048,0xB4C40000,0x9AC40000,0x8EC40000,0x8EC40000,0x7EC40000,0xF0B00000,0xB4B80000,0x7AC00000,0x76B00001, -0x1B80048,}; -static const uint32_t g_etc1_to_bc7_m6_table47[] = { -0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x21FC0000, -0x21FC0000,0x21FC0000,0x21FC0000,0x68000001,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x8E00000,0x8E00000,0x8E00000,0x13C0000,0x1C00000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000, -0x35C0000,0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0x74000001,0xFC0000,0xEC0000,0xEC0000,0x5100000,0x1280000,0x1400000,0x1400000,0x18C0000,0x5100000,0x1280000,0x1F40000,0x33FC0000, -0x1F40000,0x1000000,0x1000000,0x1000000,0x1000000,0x17C0000,0x17C0000,0x17C0000,0x43F80000,0x43F80000,0x7E000001,0x17C0000,0x17C0000,0x17C0000,0x43F80000,0x43F80000,0x7E000001,0x43F80000,0x43F80000,0x7E000001,0x7E000001,0x17C0000,0x17C0000,0x17C0000,0x43F80000,0x43F80000,0x7E000001,0x43F80000,0x43F80000,0x7E000001,0x7E000001,0x43F80000, -0x43F80000,0x7E000001,0x7E000001,0x7E000001,0x3280000,0x1100000,0x1000000,0x15C0000,0x3AC0000,0x11FC0000,0x29FC0000,0x53F80000,0x1400000,0x17C0000,0x11FC0000,0x7E000001,0x11FC0000,0x1180000,0x1A00000,0x55F80000,0x8A000001,0x1A00000,0x55F80000,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x1A00000,0x55F80000,0x8A000001,0x55F80000,0x8A000001, -0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x1A00000,0x55F80000,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x8A000001,0x1600000,0xD280000,0xD280000,0x3D40000,0x3FF80000,0x6FF80000,0x8A000001,0x8A000001,0x17C0000,0xBFC0000,0x85DC0000,0x8A000001, 
-0x27FC0000,0xF40278,0xB8EC00F4,0x96EC00F4,0x8AEC00F5,0xB2E00090,0x9AE0004D,0x8EE40069,0x90E00090,0x8AE0004D,0x84E00092,0xAED400F3,0x9CD40032,0x8EDC0053,0x92D40049,0x8AD40002,0x86D4004E,0x8AD400F4,0x88D00051,0x84D00067,0x80D400F3,0x16C0274,0xA0C000F3,0x8AD400F4,0x98B80092,0x8CC4004A,0x84CC0092,0x96A800F3,0x8AB40033,0x84B4004F,0x80BC00F4,0x3BFC0274, -0x8A9000F4,0x84880092,0x807400F3,0x7A000274,0xFED8000A,0xFEEC013A,0xF0F00198,0xBCD40000,0xA2D40001,0x92D40002,0x8CD80006,0x8AD40006,0xFCC00004,0xB4CC0002,0x8CD00035,0x84B4004F,0x7FC0274,0x10000F4,0xACF80034,0x94F40034,0x8AF40035,0xA6EC0048,0x94EC0001,0x8CEC0005,0x8CEC004A,0x8AE8000E,0x84EC004A,0x18000F3,0x9AD80032,0x8CE40033,0x92D40049,0x8AD40002, -0x84DC004A,0x45F800F3,0x8AB40033,0x84AC004A,0x800000F3,0x18000F3,0x9AD80032,0x8CE40033,0x92D40049,0x8AD40002,0x84DC004A,0x45F800F3,0x8AB40033,0x84AC004A,0x800000F3,0x45F800F3,0x8AB40033,0x84AC004A,0x800000F3,0x800000F3,0xF6DC0006,0xF4F80074,0xF6FC0068,0xBCD40000,0xA2D40001,0x92D40002,0x8CDC0001,0x8ACC0003,0xFCC00004,0xB4CC0001,0x8EC80032,0x84AC004A, -0x13FC00F3,0xEC00F4,0xEC00F4,0xEC00F4,0xEC00F4,0xA0E00048,0xA0E00048,0xA0E00048,0x86E00049,0x86E00049,0x7EE00049,0x9CD40032,0x9CD40032,0x9CD40032,0x8AD40001,0x8AD40001,0x80D8000E,0x82D40032,0x82D40032,0x7ED40005,0x7AD40032,0x15C00F3,0x15C00F3,0x15C00F3,0x8CC40049,0x8CC40049,0x7ED00049,0x8AB40032,0x8AB40032,0x7EC00002,0x7AC40032,0x33F800F3, -0x33F800F3,0x7EA00049,0x7A8C0032,0x740000F3,0xF6D80005,0xFAE40069,0xEC00F4,0xBCD40000,0x9ED40001,0x92D40001,0x8ED40004,0x86D40001,0xFCC00000,0xB4CC0001,0x8AD00032,0x7EC00002,0x1F000F3,0xF40034,0xF40034,0xF40034,0xF40034,0x92EC0000,0x92EC0000,0x92EC0000,0x84EC0000,0x84EC0000,0x7EEC0001,0x16C0032,0x16C0032,0x16C0032,0x86DC0001,0x86DC0001, -0x7EE40001,0x3BFC0032,0x3BFC0032,0x7EC80001,0x7A000032,0x16C0032,0x16C0032,0x16C0032,0x86DC0001,0x86DC0001,0x7EE40001,0x3BFC0032,0x3BFC0032,0x7EC80001,0x7A000032,0x3BFC0032,0x3BFC0032,0x7EC80001,0x7A000032,0x7A000032,0xEADC0000,0xFEEC0001,0xF40034,0xBCD40000,0x9CD80000,0x90D80000,0x8CDC0000,0x86D40001,0xFCC00000,0xB4CC0000,0x7FC0032,0x7EC80001, -0x7FC0032,0x10C0048,0xA5000000,0x91000001,0x8B000001,0x38C0048,0x94EC0001,0x8AF40001,0x4BFC0048,0x8AD00001,0x8400004A,0x38C0048,0x94EC0001,0x8AF40001,0x4BFC0048,0x8AD00001,0x8400004A,0x4BFC0048,0x8AD00001,0x8400004A,0x8400004A,0x38C0048,0x94EC0001,0x8AF40001,0x4BFC0048,0x8AD00001,0x8400004A,0x4BFC0048,0x8AD00001,0x8400004A,0x8400004A,0x4BFC0048, -0x8AD00001,0x8400004A,0x8400004A,0x8400004A,0xF2E00005,0x71C0048,0xFB040008,0xBCD40000,0xA0D40001,0x94D00000,0x8AE00001,0x8AC00001,0xF4C80002,0xB8C80000,0x8CE40001,0x8400004A,0x1DF80048,0xE00048,0xE00048,0xE00048,0xE00048,0xE00048,0xE00048,0xE00048,0xE00048,0xE00048,0xE00048,0x8ED40000,0x8ED40000,0x8ED40000,0x8ED40000,0x8ED40000, -0x8ED40000,0x7AD40001,0x7AD40001,0x7AD40001,0x74D40001,0x14C0048,0x14C0048,0x14C0048,0x14C0048,0x14C0048,0x14C0048,0x7EC00001,0x7EC00001,0x7EC00001,0x74C80001,0x2BF80048,0x2BF80048,0x2BF80048,0x74A40001,0x6E00004A,0xF4D80005,0xE00048,0xE00048,0xBCD40000,0xA2D40000,0x96D40000,0x96D40000,0x86D40000,0xF8C00000,0xBCC80000,0x82D00000,0x7EC00001, -0x1DC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table48[] = { -0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x2FFC0000, 
-0x2FFC0000,0x2FFC0000,0x2FFC0000,0x72000000,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0x2F40000,0x2F40000,0x2F40000,0x1540000,0x1E80000,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000, -0x1780000,0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0x7E000000,0xF0C0000,0xFC0001,0xFC0001,0x1280000,0x33C0000,0x1580000,0x1580000,0x1AC0000,0x1280000,0x33C0000,0xFFC0000,0x41FC0000, -0xFFC0000,0x1100001,0x1100001,0x1100001,0x1100001,0x1980000,0x1980000,0x1980000,0x51F80000,0x51F80000,0x88000000,0x1980000,0x1980000,0x1980000,0x51F80000,0x51F80000,0x88000000,0x51F80000,0x51F80000,0x88000000,0x88000000,0x1980000,0x1980000,0x1980000,0x51F80000,0x51F80000,0x88000000,0x51F80000,0x51F80000,0x88000000,0x88000000,0x51F80000, -0x51F80000,0x88000000,0x88000000,0x88000000,0x73C0000,0x1240000,0x1100001,0x1740000,0x1CC0000,0x21FC0000,0x39FC0000,0x5FF80000,0x1580000,0x1980000,0x21FC0000,0x88000000,0x21FC0000,0x1280001,0x1BC0000,0x61FC0000,0x94000000,0x1BC0000,0x61FC0000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x1BC0000,0x61FC0000,0x94000000,0x61FC0000,0x94000000, -0x94000000,0x61FC0000,0x94000000,0x94000000,0x94000000,0x1BC0000,0x61FC0000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x94000000,0x94000000,0x3740000,0x73C0000,0x73C0000,0x1F40000,0x4DFC0000,0x7BF40000,0x94000000,0x94000000,0x1940000,0x1DFC0000,0x8FD00000,0x94000000, -0x39FC0000,0x1080274,0xC2FC00F3,0x9EFC00F4,0x94FC00F3,0xB8F40092,0xA0F4004F,0x96F40067,0x98F40092,0x94F0004E,0x8EF00092,0xB6E800F4,0xA4E80033,0x96EC0051,0x9CE4004A,0x94E80002,0x8EE8004D,0x94E400F4,0x90E00053,0x8CE00069,0x88E400F5,0x1880274,0xAAD000F3,0x94E800F4,0xA0CC0092,0x94D80049,0x8EDC0090,0x9EBC00F3,0x94C40032,0x8EC4004D,0x88D000F4,0x49F80274, -0x949C00F3,0x8E980090,0x888C00F4,0x82000278,0xFEEC0012,0xF9000154,0xFB040194,0xC2E80002,0xAAE80002,0x9CE80002,0x94E80006,0x92E40006,0xFCD80009,0xC0DC0000,0x96E00035,0x8EC4004D,0x19FC0274,0x11400F3,0xB7080032,0x9D080032,0x95080032,0xAD000049,0x9EFC0002,0x95000005,0x96FC0049,0x92FC000E,0x8EFC0049,0x39800F3,0xA4E80032,0x94F80032,0x9CE40049,0x94E80001, -0x8EEC0049,0x51FC00F3,0x94C00032,0x8EBC0048,0x880000F4,0x39800F3,0xA4E80032,0x94F80032,0x9CE40049,0x94E80001,0x8EEC0049,0x51FC00F3,0x94C00032,0x8EBC0048,0x880000F4,0x51FC00F3,0x94C00032,0x8EBC0048,0x880000F4,0x880000F4,0xFAF00006,0xFD080073,0xFF0C006B,0xC2E80001,0xAAE80001,0x9CE80001,0x94F00001,0x94E00004,0xFCD80005,0xC0DC0000,0x96DC0033,0x8EBC0048, -0x23FC00F3,0xFC00F3,0xFC00F3,0xFC00F3,0xFC00F3,0xA6F4004A,0xA6F4004A,0xA6F4004A,0x90F0004A,0x90F0004A,0x88F0004A,0xA4E80033,0xA4E80033,0xA4E80033,0x92E80002,0x92E80002,0x8AE8000E,0x8CE40033,0x8CE40033,0x88E40005,0x82E40035,0x37400F3,0x37400F3,0x37400F3,0x94D80049,0x94D80049,0x88E0004A,0x92C80032,0x92C80032,0x88D00001,0x84D40034,0x3FFC00F3, -0x3FFC00F3,0x88B00048,0x82A40034,0x7E0000F4,0xFEE80006,0xF4F80073,0xFC00F3,0xC2E80001,0xA6E80002,0x9AE80002,0x98E80003,0x90E40001,0xFCD40001,0xC0DC0000,0x92E40033,0x88D00001,0xDFC00F3,0x1080032,0x1080032,0x1080032,0x1080032,0x9B000001,0x9B000001,0x9B000001,0x8D000001,0x8D000001,0x88FC0001,0x1880032,0x1880032,0x1880032,0x90EC0001,0x90EC0001, 
-0x88F40000,0x49F80032,0x49F80032,0x88D40000,0x82000034,0x1880032,0x1880032,0x1880032,0x90EC0001,0x90EC0001,0x88F40000,0x49F80032,0x49F80032,0x88D40000,0x82000034,0x49F80032,0x49F80032,0x88D40000,0x82000034,0x82000034,0xEEF00000,0xF9000002,0x1080032,0xC2E80000,0xA0F00001,0x98EC0000,0x92F00001,0x90E40000,0xF0DC0001,0xBAE00000,0x19FC0032,0x88D40000, -0x19FC0032,0x11C004A,0xAB140001,0x9B100001,0x95100001,0x1A80048,0x9EFC0001,0x95040001,0x59FC0048,0x94E00000,0x8E000048,0x1A80048,0x9EFC0001,0x95040001,0x59FC0048,0x94E00000,0x8E000048,0x59FC0048,0x94E00000,0x8E000048,0x8E000048,0x1A80048,0x9EFC0001,0x95040001,0x59FC0048,0x94E00000,0x8E000048,0x59FC0048,0x94E00000,0x8E000048,0x8E000048,0x59FC0048, -0x94E00000,0x8E000048,0x8E000048,0x8E000048,0xFAF00005,0x1300048,0xF518000D,0xC6E40000,0xAAE40001,0x9EE00000,0x94F00000,0x94CC0000,0xFCD80004,0xC0DC0000,0x96F40000,0x8E000048,0x2DFC0048,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0xF0004A,0x94E80001,0x94E80001,0x94E80001,0x94E80001,0x94E80001, -0x94E80001,0x84E40001,0x84E40001,0x84E40001,0x7EE40001,0x1680048,0x1680048,0x1680048,0x1680048,0x1680048,0x1680048,0x88D00001,0x88D00001,0x88D00001,0x7ED80001,0x39F80048,0x39F80048,0x39F80048,0x7EB40000,0x78000048,0xFCE80005,0xF0004A,0xF0004A,0xBEE80001,0xA8E80001,0x9CE80001,0x9CE80001,0x8EE80001,0xFCD40000,0xC0DC0000,0x88E40001,0x88D00001, -0x3FC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table49[] = { -0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000, -0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xB040000,0xB040000,0xB040000,0x16C0000,0x7FC0000,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000, -0x1900000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x86000000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x86000000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x86000000,0x86000000,0x1200000,0x10C0001,0x10C0001,0x5380000,0x5500000,0x1700000,0x1700000,0x1C40000,0x5380000,0x5500000,0x1FF80000,0x4DFC0000, -0x1FF80000,0x1200001,0x1200001,0x1200001,0x1200001,0x1B00000,0x1B00000,0x1B00000,0x5DF40000,0x5DF40000,0x90000000,0x1B00000,0x1B00000,0x1B00000,0x5DF40000,0x5DF40000,0x90000000,0x5DF40000,0x5DF40000,0x90000000,0x90000000,0x1B00000,0x1B00000,0x1B00000,0x5DF40000,0x5DF40000,0x90000000,0x5DF40000,0x5DF40000,0x90000000,0x90000000,0x5DF40000, -0x5DF40000,0x90000000,0x90000000,0x90000000,0x1500000,0x3340000,0x1200001,0x3880000,0x1E80000,0x31FC0000,0x47F80000,0x69FC0000,0x16C0000,0x1B00000,0x31FC0000,0x90000000,0x31FC0000,0x1380001,0x3D00000,0x6DFC0000,0x9C000000,0x3D00000,0x6DFC0000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x3D00000,0x6DFC0000,0x9C000000,0x6DFC0000,0x9C000000, -0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x3D00000,0x6DFC0000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x9C000000,0x3880000,0xF4C0000,0xF4C0000,0xFFC0000,0x5BFC0000,0x85F40000,0x9C000000,0x9C000000,0x1AC0000,0x2FFC0000,0x97E00000,0x9C000000, 
-0x47FC0000,0x1180274,0xCB0C00F3,0xA70C00F4,0x9D0C00F3,0xC1040092,0xA904004F,0x9F040067,0xA1040092,0x9D00004E,0x97000092,0xBEF800F4,0xACF80033,0x9EFC0051,0xA4F4004A,0x9CF80002,0x96F8004D,0x9CF400F4,0x98F00053,0x94F00069,0x90F400F5,0x1A00274,0xB2E000F3,0x9CF800F4,0xA8DC0092,0x9CE80049,0x96EC0090,0xA6CC00F3,0x9CD40032,0x96D4004D,0x90E000F4,0x55F80274, -0x9CAC00F3,0x96A80090,0x909C00F4,0x8A000278,0xFCFC0024,0xFF0C0164,0xF31401AB,0xCAF80002,0xB2F80002,0xA4F80002,0x9CF80006,0x9AF40006,0xFCEC0016,0xC8EC0000,0x9EF00035,0x96D4004D,0x27FC0274,0x12400F3,0xBF180032,0xA5180032,0x9D180032,0xB5100049,0xA70C0002,0x9D100005,0x9F0C0049,0x9B0C000E,0x970C0049,0x3B000F3,0xACF80032,0x9D080032,0xA4F40049,0x9CF80001, -0x96FC0049,0x5DFC00F3,0x9CD00032,0x96CC0048,0x900000F4,0x3B000F3,0xACF80032,0x9D080032,0xA4F40049,0x9CF80001,0x96FC0049,0x5DFC00F3,0x9CD00032,0x96CC0048,0x900000F4,0x5DFC00F3,0x9CD00032,0x96CC0048,0x900000F4,0x900000F4,0xFF00000B,0xF5180081,0xF9200076,0xCAF80001,0xB2F80001,0xA4F80001,0x9D000001,0x9CF00004,0xFCEC000D,0xC8EC0000,0x9EEC0033,0x96CC0048, -0x33FC00F3,0x10C00F3,0x10C00F3,0x10C00F3,0x10C00F3,0xAF04004A,0xAF04004A,0xAF04004A,0x9900004A,0x9900004A,0x9100004A,0xACF80033,0xACF80033,0xACF80033,0x9AF80002,0x9AF80002,0x92F8000E,0x94F40033,0x94F40033,0x90F40005,0x8AF40035,0x38C00F3,0x38C00F3,0x38C00F3,0x9CE80049,0x9CE80049,0x90F0004A,0x9AD80032,0x9AD80032,0x90E00001,0x8CE40034,0x4BFC00F3, -0x4BFC00F3,0x90C00048,0x8AB40034,0x860000F4,0xFAFC000B,0xFD080073,0x10C00F3,0xCAF80001,0xAEF80002,0xA2F80002,0xA0F80003,0x98F40001,0xF8E80005,0xC8EC0000,0x9AF40033,0x90E00001,0x1DF800F3,0x1180032,0x1180032,0x1180032,0x1180032,0xA3100001,0xA3100001,0xA3100001,0x95100001,0x95100001,0x910C0001,0x1A00032,0x1A00032,0x1A00032,0x98FC0001,0x98FC0001, -0x91040000,0x55F80032,0x55F80032,0x90E40000,0x8A000034,0x1A00032,0x1A00032,0x1A00032,0x98FC0001,0x98FC0001,0x91040000,0x55F80032,0x55F80032,0x90E40000,0x8A000034,0x55F80032,0x55F80032,0x90E40000,0x8A000034,0x8A000034,0xF7000000,0xF1100005,0x1180032,0xCAF80000,0xA9000001,0xA0FC0000,0x9B000001,0x98F40000,0xF8EC0001,0xC2F00000,0x27FC0032,0x90E40000, -0x27FC0032,0x12C004A,0xB3240001,0xA3200001,0x9D200001,0x1C00048,0xA70C0001,0x9D140001,0x65F80048,0x9CF00000,0x96000048,0x1C00048,0xA70C0001,0x9D140001,0x65F80048,0x9CF00000,0x96000048,0x65F80048,0x9CF00000,0x96000048,0x96000048,0x1C00048,0xA70C0001,0x9D140001,0x65F80048,0x9CF00000,0x96000048,0x65F80048,0x9CF00000,0x96000048,0x96000048,0x65F80048, -0x9CF00000,0x96000048,0x96000048,0x96000048,0xFB040008,0x9400048,0xFD28000D,0xCEF40000,0xB2F40001,0xA6F00000,0x9D000000,0x9CDC0000,0xFEF00005,0xC8EC0000,0x9F040000,0x96000048,0x3DF80048,0x100004A,0x100004A,0x100004A,0x100004A,0x100004A,0x100004A,0x100004A,0x100004A,0x100004A,0x100004A,0x9CF80001,0x9CF80001,0x9CF80001,0x9CF80001,0x9CF80001, -0x9CF80001,0x8CF40001,0x8CF40001,0x8CF40001,0x86F40001,0x1800048,0x1800048,0x1800048,0x1800048,0x1800048,0x1800048,0x90E00001,0x90E00001,0x90E00001,0x86E80001,0x45F80048,0x45F80048,0x45F80048,0x86C40000,0x80000048,0xF4F8000A,0x100004A,0x100004A,0xC6F80001,0xB0F80001,0xA4F80001,0xA4F80001,0x96F80001,0xF8E80001,0xC8EC0000,0x90F40001,0x90E00001, -0x13FC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table50[] = { -0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x47FC0000, 
-0x47FC0000,0x47FC0000,0x47FC0000,0x82000000,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1180000,0x1180000,0x1180000,0x1840000,0x17FC0000,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000, -0x1A80000,0x59FC0000,0x59FC0000,0x59FC0000,0x8E000000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x59FC0000,0x59FC0000,0x59FC0000,0x8E000000,0x59FC0000,0x59FC0000,0x59FC0000,0x8E000000,0x8E000000,0x1300000,0x11C0001,0x11C0001,0x14C0000,0x5640000,0x1840000,0x1840000,0x1E00000,0x14C0000,0x5640000,0x2DFC0000,0x59FC0000, -0x2DFC0000,0x1300001,0x1300001,0x1300001,0x1300001,0x3C40000,0x3C40000,0x3C40000,0x67FC0000,0x67FC0000,0x98000000,0x3C40000,0x3C40000,0x3C40000,0x67FC0000,0x67FC0000,0x98000000,0x67FC0000,0x67FC0000,0x98000000,0x98000000,0x3C40000,0x3C40000,0x3C40000,0x67FC0000,0x67FC0000,0x98000000,0x67FC0000,0x67FC0000,0x98000000,0x98000000,0x67FC0000, -0x67FC0000,0x98000000,0x98000000,0x98000000,0x1640000,0xB440000,0x1300001,0x1A00000,0x5FC0000,0x3FFC0000,0x55F80000,0x75F80000,0x1800000,0x3C40000,0x3FFC0000,0x98000000,0x3FFC0000,0x1480001,0x3E80000,0x79FC0000,0xA4000000,0x3E80000,0x79FC0000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0x3E80000,0x79FC0000,0xA4000000,0x79FC0000,0xA4000000, -0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0x3E80000,0x79FC0000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0xA4000000,0x39C0000,0x1600000,0x1600000,0x23FC0000,0x69F80000,0x8FF40000,0xA4000000,0xA4000000,0x1C00000,0x41FC0000,0x9FF00000,0xA4000000, -0x57FC0000,0x1280274,0xD31C00F3,0xAF1C00F4,0xA51C00F3,0xC9140092,0xB114004F,0xA7140067,0xA9140092,0xA510004E,0x9F100092,0xC70800F4,0xB5080033,0xA70C0051,0xAD04004A,0xA5080002,0x9F08004D,0xA50400F4,0xA1000053,0x9D000069,0x990400F5,0x1B80274,0xBAF000F3,0xA50800F4,0xB0EC0092,0xA4F80049,0x9EFC0090,0xAEDC00F3,0xA4E40032,0x9EE4004D,0x98F000F4,0x61F80274, -0xA4BC00F3,0x9EB80090,0x98AC00F4,0x92000278,0xFF0C0032,0xF9200172,0xFB2401AB,0xD3080002,0xBB080002,0xAD080002,0xA5080006,0xA3040006,0xFCFC0024,0xD0FC0000,0xA7000035,0x9EE4004D,0x37FC0274,0x13400F3,0xC7280032,0xAD280032,0xA5280032,0xBD200049,0xAF1C0002,0xA5200005,0xA71C0049,0xA31C000E,0x9F1C0049,0x1C800F3,0xB5080032,0xA5180032,0xAD040049,0xA5080001, -0x9F0C0049,0x69FC00F3,0xA4E00032,0x9EDC0048,0x980000F4,0x1C800F3,0xB5080032,0xA5180032,0xAD040049,0xA5080001,0x9F0C0049,0x69FC00F3,0xA4E00032,0x9EDC0048,0x980000F4,0x69FC00F3,0xA4E00032,0x9EDC0048,0x980000F4,0x980000F4,0xFF14000E,0xFD280081,0xFF2C007A,0xD3080001,0xBB080001,0xAD080001,0xA5100001,0xA5000004,0xFF000013,0xD0FC0000,0xA6FC0033,0x9EDC0048, -0x41FC00F3,0x11C00F3,0x11C00F3,0x11C00F3,0x11C00F3,0xB714004A,0xB714004A,0xB714004A,0xA110004A,0xA110004A,0x9910004A,0xB5080033,0xB5080033,0xB5080033,0xA3080002,0xA3080002,0x9B08000E,0x9D040033,0x9D040033,0x99040005,0x93040035,0x3A400F3,0x3A400F3,0x3A400F3,0xA4F80049,0xA4F80049,0x9900004A,0xA2E80032,0xA2E80032,0x98F00001,0x94F40034,0x57FC00F3, -0x57FC00F3,0x98D00048,0x92C40034,0x8E0000F4,0xFF0C000E,0xF518007E,0x11C00F3,0xD3080001,0xB7080002,0xAB080002,0xA9080003,0xA1040001,0xFEF80006,0xD0FC0000,0xA3040033,0x98F00001,0x2BFC00F3,0x1280032,0x1280032,0x1280032,0x1280032,0xAB200001,0xAB200001,0xAB200001,0x9D200001,0x9D200001,0x991C0001,0x1B80032,0x1B80032,0x1B80032,0xA10C0001,0xA10C0001, 
-0x99140000,0x61F80032,0x61F80032,0x98F40000,0x92000034,0x1B80032,0x1B80032,0x1B80032,0xA10C0001,0xA10C0001,0x99140000,0x61F80032,0x61F80032,0x98F40000,0x92000034,0x61F80032,0x61F80032,0x98F40000,0x92000034,0x92000034,0xFF100000,0xF9200005,0x1280032,0xD3080000,0xB1100001,0xA90C0000,0xA3100001,0xA1040000,0xF5000002,0xCB000000,0x37FC0032,0x98F40000, -0x37FC0032,0x13C004A,0xBB340001,0xAB300001,0xA5300001,0x1D80048,0xAF1C0001,0xA5240001,0x71F80048,0xA5000000,0x9E000048,0x1D80048,0xAF1C0001,0xA5240001,0x71F80048,0xA5000000,0x9E000048,0x71F80048,0xA5000000,0x9E000048,0x9E000048,0x1D80048,0xAF1C0001,0xA5240001,0x71F80048,0xA5000000,0x9E000048,0x71F80048,0xA5000000,0x9E000048,0x9E000048,0x71F80048, -0xA5000000,0x9E000048,0x9E000048,0x9E000048,0xFF14000A,0x1540048,0xF5380012,0xD7040000,0xBB040001,0xAF000000,0xA5100000,0xA4EC0000,0xF7080008,0xD0FC0000,0xA7140000,0x9E000048,0x4BFC0048,0x110004A,0x110004A,0x110004A,0x110004A,0x110004A,0x110004A,0x110004A,0x110004A,0x110004A,0x110004A,0xA5080001,0xA5080001,0xA5080001,0xA5080001,0xA5080001, -0xA5080001,0x95040001,0x95040001,0x95040001,0x8F040001,0x1980048,0x1980048,0x1980048,0x1980048,0x1980048,0x1980048,0x98F00001,0x98F00001,0x98F00001,0x8EF80001,0x51F80048,0x51F80048,0x51F80048,0x8ED40000,0x88000048,0xFD08000A,0x110004A,0x110004A,0xCF080001,0xB9080001,0xAD080001,0xAD080001,0x9F080001,0xF8F80002,0xD0FC0000,0x99040001,0x98F00001, -0x21FC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table51[] = { -0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x53FC0000, -0x53FC0000,0x53FC0000,0x53FC0000,0x8A000000,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1280000,0x1280000,0x1280000,0x19C0000,0x25FC0000,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000, -0x1C00000,0x65F80000,0x65F80000,0x65F80000,0x96000000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x65F80000,0x65F80000,0x65F80000,0x96000000,0x65F80000,0x65F80000,0x65F80000,0x96000000,0x96000000,0x9400000,0x12C0001,0x12C0001,0x1600000,0x5780000,0x3980000,0x3980000,0x1FC0000,0x1600000,0x5780000,0x3DF80000,0x65F80000, -0x3DF80000,0x1400001,0x1400001,0x1400001,0x1400001,0x3DC0000,0x3DC0000,0x3DC0000,0x73FC0000,0x73FC0000,0xA0000000,0x3DC0000,0x3DC0000,0x3DC0000,0x73FC0000,0x73FC0000,0xA0000000,0x73FC0000,0x73FC0000,0xA0000000,0xA0000000,0x3DC0000,0x3DC0000,0x3DC0000,0x73FC0000,0x73FC0000,0xA0000000,0x73FC0000,0x73FC0000,0xA0000000,0xA0000000,0x73FC0000, -0x73FC0000,0xA0000000,0xA0000000,0xA0000000,0x5740000,0x1580000,0x1400001,0x3B40000,0x19FC0000,0x4FFC0000,0x61FC0000,0x7FFC0000,0x1940000,0x3DC0000,0x4FFC0000,0xA0000000,0x4FFC0000,0x1580001,0x7FC0000,0x85FC0000,0xAC000000,0x7FC0000,0x85FC0000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0x7FC0000,0x85FC0000,0xAC000000,0x85FC0000,0xAC000000, -0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0x7FC0000,0x85FC0000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0xAC000000,0x3B00000,0x1700000,0x1700000,0x37FC0000,0x75FC0000,0x99F40000,0xAC000000,0xAC000000,0x3D40000,0x51FC0000,0xA9C40000,0xAC000000, 
-0x65FC0000,0x1380274,0xDB2C00F3,0xB72C00F4,0xAD2C00F3,0xD1240092,0xB924004F,0xAF240067,0xB1240092,0xAD20004E,0xA7200092,0xCF1800F4,0xBD180033,0xAF1C0051,0xB514004A,0xAD180002,0xA718004D,0xAD1400F4,0xA9100053,0xA5100069,0xA11400F5,0x1D00274,0xC30000F3,0xAD1800F4,0xB8FC0092,0xAD080049,0xA70C0090,0xB6EC00F3,0xACF40032,0xA6F4004D,0xA10000F4,0x6DF80274, -0xACCC00F3,0xA6C80090,0xA0BC00F4,0x9A000278,0xFF20003E,0xFF2C018A,0xF33401C4,0xDB180002,0xC3180002,0xB5180002,0xAD180006,0xAB140006,0xFF10002E,0xD90C0000,0xAF100035,0xA6F4004D,0x45FC0274,0x14400F3,0xCF380032,0xB5380032,0xAD380032,0xC5300049,0xB72C0002,0xAD300005,0xAF2C0049,0xAB2C000E,0xA72C0049,0x1E000F3,0xBD180032,0xAD280032,0xB5140049,0xAD180001, -0xA71C0049,0x75FC00F3,0xACF00032,0xA6EC0048,0xA00000F4,0x1E000F3,0xBD180032,0xAD280032,0xB5140049,0xAD180001,0xA71C0049,0x75FC00F3,0xACF00032,0xA6EC0048,0xA00000F4,0x75FC00F3,0xACF00032,0xA6EC0048,0xA00000F4,0xA00000F4,0xF928001A,0xF73C008B,0xF9400083,0xDB180001,0xC3180001,0xB5180001,0xAD200001,0xAD100004,0xFD140019,0xD90C0000,0xAF0C0033,0xA6EC0048, -0x51FC00F3,0x12C00F3,0x12C00F3,0x12C00F3,0x12C00F3,0xBF24004A,0xBF24004A,0xBF24004A,0xA920004A,0xA920004A,0xA120004A,0xBD180033,0xBD180033,0xBD180033,0xAB180002,0xAB180002,0xA318000E,0xA5140033,0xA5140033,0xA1140005,0x9B140035,0x3BC00F3,0x3BC00F3,0x3BC00F3,0xAD080049,0xAD080049,0xA110004A,0xAAF80032,0xAAF80032,0xA1000001,0x9D040034,0x63FC00F3, -0x63FC00F3,0xA0E00048,0x9AD40034,0x960000F4,0xFD1C0016,0xFD28007E,0x12C00F3,0xDB180001,0xBF180002,0xB3180002,0xB1180003,0xA9140001,0xFD0C000D,0xD90C0000,0xAB140033,0xA1000001,0x3BFC00F3,0x1380032,0x1380032,0x1380032,0x1380032,0xB3300001,0xB3300001,0xB3300001,0xA5300001,0xA5300001,0xA12C0001,0x1D00032,0x1D00032,0x1D00032,0xA91C0001,0xA91C0001, -0xA1240000,0x6DF80032,0x6DF80032,0xA1040000,0x9A000034,0x1D00032,0x1D00032,0x1D00032,0xA91C0001,0xA91C0001,0xA1240000,0x6DF80032,0x6DF80032,0xA1040000,0x9A000034,0x6DF80032,0x6DF80032,0xA1040000,0x9A000034,0x9A000034,0xFF200001,0xF130000A,0x1380032,0xDB180000,0xB9200001,0xB11C0000,0xAB200001,0xA9140000,0xFD100002,0xD3100000,0x45FC0032,0xA1040000, -0x45FC0032,0x14C004A,0xC3440001,0xB3400001,0xAD400001,0x1F00048,0xB72C0001,0xAD340001,0x7DF80048,0xAD100000,0xA6000048,0x1F00048,0xB72C0001,0xAD340001,0x7DF80048,0xAD100000,0xA6000048,0x7DF80048,0xAD100000,0xA6000048,0xA6000048,0x1F00048,0xB72C0001,0xAD340001,0x7DF80048,0xAD100000,0xA6000048,0x7DF80048,0xAD100000,0xA6000048,0xA6000048,0x7DF80048, -0xAD100000,0xA6000048,0xA6000048,0xA6000048,0xFB2C000D,0x1640048,0xFD480012,0xDF140000,0xC3140001,0xB7100000,0xAD200000,0xACFC0000,0xFF180008,0xD90C0000,0xAF240000,0xA6000048,0x5BFC0048,0x120004A,0x120004A,0x120004A,0x120004A,0x120004A,0x120004A,0x120004A,0x120004A,0x120004A,0x120004A,0xAD180001,0xAD180001,0xAD180001,0xAD180001,0xAD180001, -0xAD180001,0x9D140001,0x9D140001,0x9D140001,0x97140001,0x1B00048,0x1B00048,0x1B00048,0x1B00048,0x1B00048,0x1B00048,0xA1000001,0xA1000001,0xA1000001,0x97080001,0x5DF40048,0x5DF40048,0x5DF40048,0x96E40000,0x90000048,0xF71C000D,0x120004A,0x120004A,0xD7180001,0xC1180001,0xB5180001,0xB5180001,0xA7180001,0xFB080004,0xD90C0000,0xA1140001,0xA1000001, -0x31FC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table52[] = { -0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x61F80000, 
-0x61F80000,0x61F80000,0x61F80000,0x92000001,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x13C0000,0x13C0000,0x13C0000,0x1B80000,0x37FC0000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000, -0x1DC0000,0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x9E000001,0x3540000,0x1400000,0x1400000,0x1740000,0x1900000,0x1B40000,0x1B40000,0x17FC0000,0x1740000,0x1900000,0x4DFC0000,0x73F80000, -0x4DFC0000,0x1540000,0x1540000,0x1540000,0x1540000,0x1F80000,0x1F80000,0x1F80000,0x81FC0000,0x81FC0000,0xA8000001,0x1F80000,0x1F80000,0x1F80000,0x81FC0000,0x81FC0000,0xA8000001,0x81FC0000,0x81FC0000,0xA8000001,0xA8000001,0x1F80000,0x1F80000,0x1F80000,0x81FC0000,0x81FC0000,0xA8000001,0x81FC0000,0x81FC0000,0xA8000001,0xA8000001,0x81FC0000, -0x81FC0000,0xA8000001,0xA8000001,0xA8000001,0x18C0000,0xD680000,0x1540000,0x3CC0000,0x2FFC0000,0x5FFC0000,0x71FC0000,0x8BFC0000,0x3A80000,0x1F80000,0x5FFC0000,0xA8000001,0x5FFC0000,0x16C0000,0x23FC0000,0x93FC0000,0xB4000001,0x23FC0000,0x93FC0000,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0x23FC0000,0x93FC0000,0xB4000001,0x93FC0000,0xB4000001, -0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0x23FC0000,0x93FC0000,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0xB4000001,0x1C80000,0x1840000,0x1840000,0x4DFC0000,0x85FC0000,0xA5F00000,0xB4000001,0xB4000001,0x1F00000,0x65FC0000,0xB1F40000,0xB4000001, -0x77FC0000,0x1480278,0xE34000F4,0xC14000F4,0xB54000F5,0xDD340090,0xC534004D,0xB9380069,0xBB340090,0xB534004D,0xAF340092,0xD92800F3,0xC7280032,0xB9300053,0xBD280049,0xB5280002,0xB128004E,0xB52800F4,0xB3240051,0xAF240067,0xAB2800F3,0x3E80274,0xCB1400F3,0xB52800F4,0xC30C0092,0xB718004A,0xAF200092,0xC0FC00F3,0xB5080033,0xAF08004F,0xAB1000F4,0x79FC0274, -0xB4E400F4,0xAEDC0092,0xAAC800F3,0xA4000274,0xFF340059,0xFB440190,0xFD4801C4,0xE7280000,0xCD280001,0xBD280002,0xB72C0006,0xB5280006,0xFF24004F,0xDF200002,0xB7240035,0xAF08004F,0x57FC0274,0x15400F4,0xD74C0034,0xBF480034,0xB5480035,0xD1400048,0xBF400001,0xB7400005,0xB740004A,0xB53C000E,0xAF40004A,0x1FC00F3,0xC52C0032,0xB7380033,0xBD280049,0xB5280002, -0xAF30004A,0x83F800F3,0xB5080033,0xAF00004A,0xAA0000F3,0x1FC00F3,0xC52C0032,0xB7380033,0xBD280049,0xB5280002,0xAF30004A,0x83F800F3,0xB5080033,0xAF00004A,0xAA0000F3,0x83F800F3,0xB5080033,0xAF00004A,0xAA0000F3,0xAA0000F3,0xFF3C0024,0xFF4C008C,0xF1500095,0xE7280000,0xCD280001,0xBD280002,0xB7300001,0xB5200003,0xFB2C002A,0xDF200001,0xB91C0032,0xAF00004A, -0x61FC00F3,0x14000F4,0x14000F4,0x14000F4,0x14000F4,0xCB340048,0xCB340048,0xCB340048,0xB1340049,0xB1340049,0xA9340049,0xC7280032,0xC7280032,0xC7280032,0xB5280001,0xB5280001,0xAB2C000E,0xAD280032,0xAD280032,0xA9280005,0xA5280032,0x1D800F3,0x1D800F3,0x1D800F3,0xB7180049,0xB7180049,0xA9240049,0xB5080032,0xB5080032,0xA9140002,0xA5180032,0x71F800F3, -0x71F800F3,0xA8F40049,0xA4E00032,0x9E0000F3,0xF9300024,0xF73C008C,0x14000F4,0xE7280000,0xC9280001,0xBD280001,0xB9280004,0xB1280001,0xFF200012,0xDF200001,0xB5240032,0xA9140002,0x4BFC00F3,0x1480034,0x1480034,0x1480034,0x1480034,0xBD400000,0xBD400000,0xBD400000,0xAF400000,0xAF400000,0xA9400001,0x3E80032,0x3E80032,0x3E80032,0xB1300001,0xB1300001, 
-0xA9380001,0x79FC0032,0x79FC0032,0xA91C0001,0xA4000032,0x3E80032,0x3E80032,0x3E80032,0xB1300001,0xB1300001,0xA9380001,0x79FC0032,0x79FC0032,0xA91C0001,0xA4000032,0x79FC0032,0x79FC0032,0xA91C0001,0xA4000032,0xA4000032,0xFB340004,0xFB440008,0x1480034,0xE7280000,0xC72C0000,0xBB2C0000,0xB7300000,0xB1280001,0xF9280005,0xDF200000,0x57FC0032,0xA91C0001, -0x57FC0032,0x1600048,0xCF540000,0xBB540001,0xB5540001,0xFFC0048,0xBF400001,0xB5480001,0x8BF80048,0xB5240001,0xAE00004A,0xFFC0048,0xBF400001,0xB5480001,0x8BF80048,0xB5240001,0xAE00004A,0x8BF80048,0xB5240001,0xAE00004A,0xAE00004A,0xFFC0048,0xBF400001,0xB5480001,0x8BF80048,0xB5240001,0xAE00004A,0x8BF80048,0xB5240001,0xAE00004A,0xAE00004A,0x8BF80048, -0xB5240001,0xAE00004A,0xAE00004A,0xAE00004A,0xFD400012,0x1780048,0xF75C0019,0xE7280000,0xCB280001,0xBF240000,0xB5340001,0xB5140001,0xFF2C0011,0xE31C0000,0xB7380001,0xAE00004A,0x6BFC0048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0x1340048,0xB9280000,0xB9280000,0xB9280000,0xB9280000,0xB9280000, -0xB9280000,0xA5280001,0xA5280001,0xA5280001,0x9F280001,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0xA9140001,0xA9140001,0xA9140001,0x9F1C0001,0x69FC0048,0x69FC0048,0x69FC0048,0x9EF80001,0x9800004A,0xFF2C000D,0x1340048,0x1340048,0xE7280000,0xCD280000,0xC1280000,0xC1280000,0xB1280000,0xFD1C0005,0xE71C0000,0xAD240000,0xA9140001, -0x41FC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table53[] = { -0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x6DF80000, -0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x14C0000,0x14C0000,0x14C0000,0x1D00000,0x45FC0000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000, -0x1F40000,0x7FF80000,0x7FF80000,0x7FF80000,0xA6000001,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x7FF80000,0x7FF80000,0x7FF80000,0xA6000001,0x7FF80000,0x7FF80000,0x7FF80000,0xA6000001,0xA6000001,0xB640000,0x1500000,0x1500000,0x7840000,0x1A40000,0x1C80000,0x1C80000,0x2BFC0000,0x7840000,0x1A40000,0x5DF80000,0x7FF80000, -0x5DF80000,0x1640000,0x1640000,0x1640000,0x1640000,0x15FC0000,0x15FC0000,0x15FC0000,0x8DFC0000,0x8DFC0000,0xB0000001,0x15FC0000,0x15FC0000,0x15FC0000,0x8DFC0000,0x8DFC0000,0xB0000001,0x8DFC0000,0x8DFC0000,0xB0000001,0xB0000001,0x15FC0000,0x15FC0000,0x15FC0000,0x8DFC0000,0x8DFC0000,0xB0000001,0x8DFC0000,0x8DFC0000,0xB0000001,0xB0000001,0x8DFC0000, -0x8DFC0000,0xB0000001,0xB0000001,0xB0000001,0x59C0000,0x17C0000,0x1640000,0x1E40000,0x43FC0000,0x6FFC0000,0x7FF80000,0x97F80000,0x3BC0000,0x15FC0000,0x6FFC0000,0xB0000001,0x6FFC0000,0x17C0000,0x3BFC0000,0x9FF80000,0xBC000001,0x3BFC0000,0x9FF80000,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0x3BFC0000,0x9FF80000,0xBC000001,0x9FF80000,0xBC000001, -0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0x3BFC0000,0x9FF80000,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0xBC000001,0x1DC0000,0x3940000,0x3940000,0x61FC0000,0x93F80000,0xAFF00000,0xBC000001,0xBC000001,0xDFC0000,0x75FC0000,0xBBC80000,0xBC000001, 
-0x85FC0000,0x1580278,0xEB5000F4,0xC95000F4,0xBD5000F5,0xE5440090,0xCD44004D,0xC1480069,0xC3440090,0xBD44004D,0xB7440092,0xE13800F3,0xCF380032,0xC1400053,0xC5380049,0xBD380002,0xB938004E,0xBD3800F4,0xBB340051,0xB7340067,0xB33800F3,0x7FC0274,0xD32400F3,0xBD3800F4,0xCB1C0092,0xBF28004A,0xB7300092,0xC90C00F3,0xBD180033,0xB718004F,0xB32000F4,0x85FC0274, -0xBCF400F4,0xB6EC0092,0xB2D800F3,0xAC000274,0xFF44007E,0xF35401B2,0xF55801DD,0xEF380000,0xD5380001,0xC5380002,0xBF3C0006,0xBD380006,0xFF34006A,0xE7300002,0xBF340035,0xB718004F,0x65FC0274,0x16400F4,0xDF5C0034,0xC7580034,0xBD580035,0xD9500048,0xC7500001,0xBF500005,0xBF50004A,0xBD4C000E,0xB750004A,0x19FC00F3,0xCD3C0032,0xBF480033,0xC5380049,0xBD380002, -0xB740004A,0x8FF800F3,0xBD180033,0xB710004A,0xB20000F3,0x19FC00F3,0xCD3C0032,0xBF480033,0xC5380049,0xBD380002,0xB740004A,0x8FF800F3,0xBD180033,0xB710004A,0xB20000F3,0x8FF800F3,0xBD180033,0xB710004A,0xB20000F3,0xB20000F3,0xFD4C002D,0xF960009A,0xF9600095,0xEF380000,0xD5380001,0xC5380002,0xBF400001,0xBD300003,0xFD400033,0xE7300001,0xC12C0032,0xB710004A, -0x71FC00F3,0x15000F4,0x15000F4,0x15000F4,0x15000F4,0xD3440048,0xD3440048,0xD3440048,0xB9440049,0xB9440049,0xB1440049,0xCF380032,0xCF380032,0xCF380032,0xBD380001,0xBD380001,0xB33C000E,0xB5380032,0xB5380032,0xB1380005,0xAD380032,0x1F000F3,0x1F000F3,0x1F000F3,0xBF280049,0xBF280049,0xB1340049,0xBD180032,0xBD180032,0xB1240002,0xAD280032,0x7DF800F3, -0x7DF800F3,0xB1040049,0xACF00032,0xA60000F3,0xF940002D,0xFF4C008C,0x15000F4,0xEF380000,0xD1380001,0xC5380001,0xC1380004,0xB9380001,0xFF340019,0xE7300001,0xBD340032,0xB1240002,0x5BFC00F3,0x1580034,0x1580034,0x1580034,0x1580034,0xC5500000,0xC5500000,0xC5500000,0xB7500000,0xB7500000,0xB1500001,0x7FC0032,0x7FC0032,0x7FC0032,0xB9400001,0xB9400001, -0xB1480001,0x85FC0032,0x85FC0032,0xB12C0001,0xAC000032,0x7FC0032,0x7FC0032,0x7FC0032,0xB9400001,0xB9400001,0xB1480001,0x85FC0032,0x85FC0032,0xB12C0001,0xAC000032,0x85FC0032,0x85FC0032,0xB12C0001,0xAC000032,0xAC000032,0xF7480005,0xF354000D,0x1580034,0xEF380000,0xCF3C0000,0xC33C0000,0xBF400000,0xB9380001,0xF53C0008,0xE7300000,0x65FC0032,0xB12C0001, -0x65FC0032,0x1700048,0xD7640000,0xC3640001,0xBD640001,0x29FC0048,0xC7500001,0xBD580001,0x97F80048,0xBD340001,0xB600004A,0x29FC0048,0xC7500001,0xBD580001,0x97F80048,0xBD340001,0xB600004A,0x97F80048,0xBD340001,0xB600004A,0xB600004A,0x29FC0048,0xC7500001,0xBD580001,0x97F80048,0xBD340001,0xB600004A,0x97F80048,0xBD340001,0xB600004A,0xB600004A,0x97F80048, -0xBD340001,0xB600004A,0xB600004A,0xB600004A,0xFF500014,0x1880048,0xFF6C0019,0xEF380000,0xD3380001,0xC7340000,0xBD440001,0xBD240001,0xF9480012,0xEB2C0000,0xBF480001,0xB600004A,0x7BFC0048,0x1440048,0x1440048,0x1440048,0x1440048,0x1440048,0x1440048,0x1440048,0x1440048,0x1440048,0x1440048,0xC1380000,0xC1380000,0xC1380000,0xC1380000,0xC1380000, -0xC1380000,0xAD380001,0xAD380001,0xAD380001,0xA7380001,0x1E00048,0x1E00048,0x1E00048,0x1E00048,0x1E00048,0x1E00048,0xB1240001,0xB1240001,0xB1240001,0xA72C0001,0x75FC0048,0x75FC0048,0x75FC0048,0xA7080001,0xA000004A,0xF73C0014,0x1440048,0x1440048,0xEF380000,0xD5380000,0xC9380000,0xC9380000,0xB9380000,0xF9300008,0xEF2C0000,0xB5340000,0xB1240001, -0x51FC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table54[] = { -0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x79F80000, 
-0x79F80000,0x79F80000,0x79F80000,0xA2000001,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x75C0000,0x75C0000,0x75C0000,0x1E80000,0x55FC0000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000, -0xFFC0000,0x8BF80000,0x8BF80000,0x8BF80000,0xAE000001,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0x8BF80000,0x8BF80000,0x8BF80000,0xAE000001,0x8BF80000,0x8BF80000,0x8BF80000,0xAE000001,0xAE000001,0x1780000,0x1600000,0x1600000,0x3980000,0x1B80000,0x3DC0000,0x3DC0000,0x3DFC0000,0x3980000,0x1B80000,0x6BFC0000,0x8BF80000, -0x6BFC0000,0x1740000,0x1740000,0x1740000,0x1740000,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x99FC0000,0x99FC0000,0xB8000001,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x99FC0000,0x99FC0000,0xB8000001,0x99FC0000,0x99FC0000,0xB8000001,0xB8000001,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x99FC0000,0x99FC0000,0xB8000001,0x99FC0000,0x99FC0000,0xB8000001,0xB8000001,0x99FC0000, -0x99FC0000,0xB8000001,0xB8000001,0xB8000001,0x1B00000,0x18C0000,0x1740000,0x3F80000,0x57FC0000,0x7FF80000,0x8BFC0000,0xA1FC0000,0x5D00000,0x2FFC0000,0x7FF80000,0xB8000001,0x7FF80000,0x18C0000,0x53FC0000,0xABF80000,0xC4000001,0x53FC0000,0xABF80000,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0x53FC0000,0xABF80000,0xC4000001,0xABF80000,0xC4000001, -0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xC4000001,0x53FC0000,0xABF80000,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xC4000001,0xC4000001,0x1F00000,0xBA40000,0xBA40000,0x73FC0000,0x9FFC0000,0xB9F00000,0xC4000001,0xC4000001,0x2BFC0000,0x87FC0000,0xC3D80000,0xC4000001, -0x95FC0000,0x1680278,0xF36000F4,0xD16000F4,0xC56000F5,0xED540090,0xD554004D,0xC9580069,0xCB540090,0xC554004D,0xBF540092,0xE94800F3,0xD7480032,0xC9500053,0xCD480049,0xC5480002,0xC148004E,0xC54800F4,0xC3440051,0xBF440067,0xBB4800F3,0x1FFC0274,0xDB3400F3,0xC54800F4,0xD32C0092,0xC738004A,0xBF400092,0xD11C00F3,0xC5280033,0xBF28004F,0xBB3000F4,0x91FC0274, -0xC50400F4,0xBEFC0092,0xBAE800F3,0xB4000274,0xFF580096,0xFB6401B2,0xFD6801DD,0xF7480000,0xDD480001,0xCD480002,0xC74C0006,0xC5480006,0xFD4C008A,0xEF400002,0xC7440035,0xBF28004F,0x75FC0274,0x17400F4,0xE76C0034,0xCF680034,0xC5680035,0xE1600048,0xCF600001,0xC7600005,0xC760004A,0xC55C000E,0xBF60004A,0x31FC00F3,0xD54C0032,0xC7580033,0xCD480049,0xC5480002, -0xBF50004A,0x9BF800F3,0xC5280033,0xBF20004A,0xBA0000F3,0x31FC00F3,0xD54C0032,0xC7580033,0xCD480049,0xC5480002,0xBF50004A,0x9BF800F3,0xC5280033,0xBF20004A,0xBA0000F3,0x9BF800F3,0xC5280033,0xBF20004A,0xBA0000F3,0xBA0000F3,0xFD60003E,0xFF6C009E,0xF37400A4,0xF7480000,0xDD480001,0xCD480002,0xC7500001,0xC5400003,0xFB54003D,0xEF400001,0xC93C0032,0xBF20004A, -0x7FFC00F3,0x16000F4,0x16000F4,0x16000F4,0x16000F4,0xDB540048,0xDB540048,0xDB540048,0xC1540049,0xC1540049,0xB9540049,0xD7480032,0xD7480032,0xD7480032,0xC5480001,0xC5480001,0xBB4C000E,0xBD480032,0xBD480032,0xB9480005,0xB5480032,0xDFC00F3,0xDFC00F3,0xDFC00F3,0xC7380049,0xC7380049,0xB9440049,0xC5280032,0xC5280032,0xB9340002,0xB5380032,0x89F800F3, -0x89F800F3,0xB9140049,0xB5000032,0xAE0000F3,0xFD540035,0xF75C0099,0x16000F4,0xF7480000,0xD9480001,0xCD480001,0xC9480004,0xC1480001,0xFB480029,0xEF400001,0xC5440032,0xB9340002,0x69FC00F3,0x1680034,0x1680034,0x1680034,0x1680034,0xCD600000,0xCD600000,0xCD600000,0xBF600000,0xBF600000,0xB9600001,0x1FFC0032,0x1FFC0032,0x1FFC0032,0xC1500001,0xC1500001, 
-0xB9580001,0x91FC0032,0x91FC0032,0xB93C0001,0xB4000032,0x1FFC0032,0x1FFC0032,0x1FFC0032,0xC1500001,0xC1500001,0xB9580001,0x91FC0032,0x91FC0032,0xB93C0001,0xB4000032,0x91FC0032,0x91FC0032,0xB93C0001,0xB4000032,0xB4000032,0xFF580005,0xFB64000D,0x1680034,0xF7480000,0xD74C0000,0xCB4C0000,0xC7500000,0xC1480001,0xFD4C0008,0xEF400000,0x75FC0032,0xB93C0001, -0x75FC0032,0x1800048,0xDF740000,0xCB740001,0xC5740001,0x41FC0048,0xCF600001,0xC5680001,0xA1FC0048,0xC5440001,0xBE00004A,0x41FC0048,0xCF600001,0xC5680001,0xA1FC0048,0xC5440001,0xBE00004A,0xA1FC0048,0xC5440001,0xBE00004A,0xBE00004A,0x41FC0048,0xCF600001,0xC5680001,0xA1FC0048,0xC5440001,0xBE00004A,0xA1FC0048,0xC5440001,0xBE00004A,0xBE00004A,0xA1FC0048, -0xC5440001,0xBE00004A,0xBE00004A,0xBE00004A,0xF7680019,0x5980048,0xF77C0020,0xF7480000,0xDB480001,0xCF440000,0xC5540001,0xC5340001,0xFD580014,0xF33C0000,0xC7580001,0xBE00004A,0x89FC0048,0x1540048,0x1540048,0x1540048,0x1540048,0x1540048,0x1540048,0x1540048,0x1540048,0x1540048,0x1540048,0xC9480000,0xC9480000,0xC9480000,0xC9480000,0xC9480000, -0xC9480000,0xB5480001,0xB5480001,0xB5480001,0xAF480001,0x1F80048,0x1F80048,0x1F80048,0x1F80048,0x1F80048,0x1F80048,0xB9340001,0xB9340001,0xB9340001,0xAF3C0001,0x81FC0048,0x81FC0048,0x81FC0048,0xAF180001,0xA800004A,0xFF4C0014,0x1540048,0x1540048,0xF7480000,0xDD480000,0xD1480000,0xD1480000,0xC1480000,0xF544000D,0xF73C0000,0xBD440000,0xB9340001, -0x5FFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table55[] = { -0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x85F80000, -0x85F80000,0x85F80000,0x85F80000,0xAA000001,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0xF6C0000,0xF6C0000,0xF6C0000,0x3FC0000,0x63FC0000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000, -0x29FC0000,0x97F80000,0x97F80000,0x97F80000,0xB6000001,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x97F80000,0x97F80000,0x97F80000,0xB6000001,0x97F80000,0x97F80000,0x97F80000,0xB6000001,0xB6000001,0x1880000,0x1700000,0x1700000,0x1AC0000,0x3CC0000,0x1F40000,0x1F40000,0x51FC0000,0x1AC0000,0x3CC0000,0x7BFC0000,0x97F80000, -0x7BFC0000,0x1840000,0x1840000,0x1840000,0x1840000,0x47FC0000,0x47FC0000,0x47FC0000,0xA5F80000,0xA5F80000,0xC0000001,0x47FC0000,0x47FC0000,0x47FC0000,0xA5F80000,0xA5F80000,0xC0000001,0xA5F80000,0xA5F80000,0xC0000001,0xC0000001,0x47FC0000,0x47FC0000,0x47FC0000,0xA5F80000,0xA5F80000,0xC0000001,0xA5F80000,0xA5F80000,0xC0000001,0xC0000001,0xA5F80000, -0xA5F80000,0xC0000001,0xC0000001,0xC0000001,0x1C40000,0x79C0000,0x1840000,0x1BFC0000,0x6BFC0000,0x8DFC0000,0x99FC0000,0xADF80000,0x5E40000,0x47FC0000,0x8DFC0000,0xC0000001,0x8DFC0000,0x19C0000,0x6BFC0000,0xB7F80000,0xCC000001,0x6BFC0000,0xB7F80000,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0x6BFC0000,0xB7F80000,0xCC000001,0xB7F80000,0xCC000001, -0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0x6BFC0000,0xB7F80000,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0xCC000001,0x11FC0000,0x1B80000,0x1B80000,0x87FC0000,0xADFC0000,0xC3F00000,0xCC000001,0xCC000001,0x49FC0000,0x97FC0000,0xCBE80000,0xCC000001, 
-0xA3FC0000,0x1780278,0xFB7000F4,0xD97000F4,0xCD7000F5,0xF5640090,0xDD64004D,0xD1680069,0xD3640090,0xCD64004D,0xC7640092,0xF15800F3,0xDF580032,0xD1600053,0xD5580049,0xCD580002,0xC958004E,0xCD5800F4,0xCB540051,0xC7540067,0xC35800F3,0x37FC0274,0xE34400F3,0xCD5800F4,0xDB3C0092,0xCF48004A,0xC7500092,0xD92C00F3,0xCD380033,0xC738004F,0xC34000F4,0x9DFC0274, -0xCD1400F4,0xC70C0092,0xC2F800F3,0xBC000274,0xFF6C00C2,0xF37401D8,0xF57801F8,0xFF580000,0xE5580001,0xD5580002,0xCF5C0006,0xCD580006,0xFF5C00A6,0xF7500002,0xCF540035,0xC738004F,0x83FC0274,0x18400F4,0xEF7C0034,0xD7780034,0xCD780035,0xE9700048,0xD7700001,0xCF700005,0xCF70004A,0xCD6C000E,0xC770004A,0x49FC00F3,0xDD5C0032,0xCF680033,0xD5580049,0xCD580002, -0xC760004A,0xA7F800F3,0xCD380033,0xC730004A,0xC20000F3,0x49FC00F3,0xDD5C0032,0xCF680033,0xD5580049,0xCD580002,0xC760004A,0xA7F800F3,0xCD380033,0xC730004A,0xC20000F3,0xA7F800F3,0xCD380033,0xC730004A,0xC20000F3,0xC20000F3,0xFD740049,0xF98000A8,0xFB8400A4,0xFF580000,0xE5580001,0xD5580002,0xCF600001,0xCD500003,0xFB680055,0xF7500001,0xD14C0032,0xC730004A, -0x8FFC00F3,0x17000F4,0x17000F4,0x17000F4,0x17000F4,0xE3640048,0xE3640048,0xE3640048,0xC9640049,0xC9640049,0xC1640049,0xDF580032,0xDF580032,0xDF580032,0xCD580001,0xCD580001,0xC35C000E,0xC5580032,0xC5580032,0xC1580005,0xBD580032,0x25FC00F3,0x25FC00F3,0x25FC00F3,0xCF480049,0xCF480049,0xC1540049,0xCD380032,0xCD380032,0xC1440002,0xBD480032,0x95F800F3, -0x95F800F3,0xC1240049,0xBD100032,0xB60000F3,0xFD64003E,0xFF6C0099,0x17000F4,0xFF580000,0xE1580001,0xD5580001,0xD1580004,0xC9580001,0xFF580032,0xF7500001,0xCD540032,0xC1440002,0x79FC00F3,0x1780034,0x1780034,0x1780034,0x1780034,0xD5700000,0xD5700000,0xD5700000,0xC7700000,0xC7700000,0xC1700001,0x37FC0032,0x37FC0032,0x37FC0032,0xC9600001,0xC9600001, -0xC1680001,0x9DFC0032,0x9DFC0032,0xC14C0001,0xBC000032,0x37FC0032,0x37FC0032,0x37FC0032,0xC9600001,0xC9600001,0xC1680001,0x9DFC0032,0x9DFC0032,0xC14C0001,0xBC000032,0x9DFC0032,0x9DFC0032,0xC14C0001,0xBC000032,0xBC000032,0xFB6C0008,0xF3740014,0x1780034,0xFF580000,0xDF5C0000,0xD35C0000,0xCF600000,0xC9580001,0xF564000D,0xF7500000,0x83FC0032,0xC14C0001, -0x83FC0032,0x1900048,0xE7840000,0xD3840001,0xCD840001,0x59FC0048,0xD7700001,0xCD780001,0xADFC0048,0xCD540001,0xC600004A,0x59FC0048,0xD7700001,0xCD780001,0xADFC0048,0xCD540001,0xC600004A,0xADFC0048,0xCD540001,0xC600004A,0xC600004A,0x59FC0048,0xD7700001,0xCD780001,0xADFC0048,0xCD540001,0xC600004A,0xADFC0048,0xCD540001,0xC600004A,0xC600004A,0xADFC0048, -0xCD540001,0xC600004A,0xC600004A,0xC600004A,0xFF780019,0xDA80048,0xFF8C0020,0xFF580000,0xE3580001,0xD7540000,0xCD640001,0xCD440001,0xFF700019,0xFB4C0000,0xCF680001,0xC600004A,0x99FC0048,0x1640048,0x1640048,0x1640048,0x1640048,0x1640048,0x1640048,0x1640048,0x1640048,0x1640048,0x1640048,0xD1580000,0xD1580000,0xD1580000,0xD1580000,0xD1580000, -0xD1580000,0xBD580001,0xBD580001,0xBD580001,0xB7580001,0x15FC0048,0x15FC0048,0x15FC0048,0x15FC0048,0x15FC0048,0x15FC0048,0xC1440001,0xC1440001,0xC1440001,0xB74C0001,0x8DFC0048,0x8DFC0048,0x8DFC0048,0xB7280001,0xB000004A,0xF9600019,0x1640048,0x1640048,0xFF580000,0xE5580000,0xD9580000,0xD9580000,0xC9580000,0xFD54000D,0xFF4C0000,0xC5540000,0xC1440001, -0x6FFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table56[] = { -0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x91FC0000, 
-0x91FC0000,0x91FC0000,0x91FC0000,0xB4000000,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x9800000,0x9800000,0x9800000,0x1FFC0000,0x75FC0000,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000, -0x43FC0000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0xC0000000,0x19C0000,0x1800001,0x1800001,0x1C00000,0x1E40000,0x17FC0000,0x17FC0000,0x67FC0000,0x1C00000,0x1E40000,0x8BFC0000,0xA3FC0000, -0x8BFC0000,0x1940001,0x1940001,0x1940001,0x1940001,0x63FC0000,0x63FC0000,0x63FC0000,0xB3F80000,0xB3F80000,0xCA000000,0x63FC0000,0x63FC0000,0x63FC0000,0xB3F80000,0xB3F80000,0xCA000000,0xB3F80000,0xB3F80000,0xCA000000,0xCA000000,0x63FC0000,0x63FC0000,0x63FC0000,0xB3F80000,0xB3F80000,0xCA000000,0xB3F80000,0xB3F80000,0xCA000000,0xCA000000,0xB3F80000, -0xB3F80000,0xCA000000,0xCA000000,0xCA000000,0x1D80000,0x1B00000,0x1940001,0x3DFC0000,0x81FC0000,0x9FF80000,0xA9F80000,0xB9F80000,0x1FC0000,0x63FC0000,0x9FF80000,0xCA000000,0x9FF80000,0x1AC0001,0x87FC0000,0xC5F80000,0xD6000000,0x87FC0000,0xC5F80000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0x87FC0000,0xC5F80000,0xD6000000,0xC5F80000,0xD6000000, -0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0x87FC0000,0xC5F80000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0xD6000000,0x3DFC0000,0xDC80000,0xDC80000,0x9DFC0000,0xBDF80000,0xCDF80000,0xD6000000,0xD6000000,0x6BFC0000,0xABFC0000,0xD5DC0000,0xD6000000, -0xB5FC0000,0x18C0274,0xFF8000F7,0xE18000F4,0xD78000F3,0xFB780092,0xE378004F,0xD9780067,0xDB780092,0xD774004E,0xD1740092,0xF96C00F4,0xE76C0033,0xD9700051,0xDF68004A,0xD76C0002,0xD16C004D,0xD76800F4,0xD3640053,0xCF640069,0xCB6800F5,0x53FC0274,0xED5400F3,0xD76C00F4,0xE3500092,0xD75C0049,0xD1600090,0xE14000F3,0xD7480032,0xD148004D,0xCB5400F4,0xABF80274, -0xD72000F3,0xD11C0090,0xCB1000F4,0xC4000278,0xFF8000EA,0xFD8801D4,0xFD8801FC,0xFF70000E,0xED6C0002,0xDF6C0002,0xD76C0006,0xD5680006,0xFD7400D7,0xFF600006,0xD9640035,0xD148004D,0x95FC0274,0x19800F3,0xF98C0032,0xDF8C0032,0xD78C0032,0xEF840049,0xE1800002,0xD7840005,0xD9800049,0xD580000E,0xD1800049,0x65FC00F3,0xE76C0032,0xD77C0032,0xDF680049,0xD76C0001, -0xD1700049,0xB3FC00F3,0xD7440032,0xD1400048,0xCA0000F4,0x65FC00F3,0xE76C0032,0xD77C0032,0xDF680049,0xD76C0001,0xD1700049,0xB3FC00F3,0xD7440032,0xD1400048,0xCA0000F4,0xB3FC00F3,0xD7440032,0xD1400048,0xCA0000F4,0xCA0000F4,0xFD880063,0xF39400B9,0xF39400B6,0xFF700005,0xED6C0001,0xDF6C0001,0xD7740001,0xD7640004,0xFF780062,0xFF640002,0xD9600033,0xD1400048, -0x9FFC00F3,0x18000F3,0x18000F3,0x18000F3,0x18000F3,0xE978004A,0xE978004A,0xE978004A,0xD374004A,0xD374004A,0xCB74004A,0xE76C0033,0xE76C0033,0xE76C0033,0xD56C0002,0xD56C0002,0xCD6C000E,0xCF680033,0xCF680033,0xCB680005,0xC5680035,0x41FC00F3,0x41FC00F3,0x41FC00F3,0xD75C0049,0xD75C0049,0xCB64004A,0xD54C0032,0xD54C0032,0xCB540001,0xC7580034,0xA1FC00F3, -0xA1FC00F3,0xCB340048,0xC5280034,0xC00000F4,0xFF740053,0xF77C00AB,0x18000F3,0xFD6C0006,0xE96C0002,0xDD6C0002,0xDB6C0003,0xD3680001,0xFF6C0049,0xFF600002,0xD5680033,0xCB540001,0x89FC00F3,0x18C0032,0x18C0032,0x18C0032,0x18C0032,0xDD840001,0xDD840001,0xDD840001,0xCF840001,0xCF840001,0xCB800001,0x53FC0032,0x53FC0032,0x53FC0032,0xD3700001,0xD3700001, 
-0xCB780000,0xABF80032,0xABF80032,0xCB580000,0xC4000034,0x53FC0032,0x53FC0032,0x53FC0032,0xD3700001,0xD3700001,0xCB780000,0xABF80032,0xABF80032,0xCB580000,0xC4000034,0xABF80032,0xABF80032,0xCB580000,0xC4000034,0xC4000034,0xF780000D,0xFD880012,0x18C0032,0xFB700001,0xE3740001,0xDB700000,0xD5740001,0xD3680000,0xFD74000D,0xFD640000,0x95FC0032,0xCB580000, -0x95FC0032,0x1A0004A,0xED980001,0xDD940001,0xD7940001,0x75FC0048,0xE1800001,0xD7880001,0xBBFC0048,0xD7640000,0xD0000048,0x75FC0048,0xE1800001,0xD7880001,0xBBFC0048,0xD7640000,0xD0000048,0xBBFC0048,0xD7640000,0xD0000048,0xD0000048,0x75FC0048,0xE1800001,0xD7880001,0xBBFC0048,0xD7640000,0xD0000048,0xBBFC0048,0xD7640000,0xD0000048,0xD0000048,0xBBFC0048, -0xD7640000,0xD0000048,0xD0000048,0xD0000048,0xFD900020,0x7BC0048,0xF9A00029,0xFF740002,0xED680001,0xE1640000,0xD7740000,0xD7500000,0xFB880020,0xFF640001,0xD9780000,0xD0000048,0xA9FC0048,0x174004A,0x174004A,0x174004A,0x174004A,0x174004A,0x174004A,0x174004A,0x174004A,0x174004A,0x174004A,0xD76C0001,0xD76C0001,0xD76C0001,0xD76C0001,0xD76C0001, -0xD76C0001,0xC7680001,0xC7680001,0xC7680001,0xC1680001,0x31FC0048,0x31FC0048,0x31FC0048,0x31FC0048,0x31FC0048,0x31FC0048,0xCB540001,0xCB540001,0xCB540001,0xC15C0001,0x9BF80048,0x9BF80048,0x9BF80048,0xC1380000,0xBA000048,0xF1700022,0x174004A,0x174004A,0xFB6C0002,0xEB6C0001,0xDF6C0001,0xDF6C0001,0xD16C0001,0xF9680012,0xFD600001,0xCB680001,0xCB540001, -0x7FFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table57[] = { -0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000, -0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1940000,0x1940000,0x1940000,0x37FC0000,0x83FC0000,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000, -0x5BFC0000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xC8000000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xC8000000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xC8000000,0xC8000000,0x1AC0000,0x1900001,0x1900001,0x1D40000,0x1F80000,0x35FC0000,0x35FC0000,0x7BFC0000,0x1D40000,0x1F80000,0x9BFC0000,0xAFFC0000, -0x9BFC0000,0x1A40001,0x1A40001,0x1A40001,0x1A40001,0x7BFC0000,0x7BFC0000,0x7BFC0000,0xBFF80000,0xBFF80000,0xD2000000,0x7BFC0000,0x7BFC0000,0x7BFC0000,0xBFF80000,0xBFF80000,0xD2000000,0xBFF80000,0xBFF80000,0xD2000000,0xD2000000,0x7BFC0000,0x7BFC0000,0x7BFC0000,0xBFF80000,0xBFF80000,0xD2000000,0xBFF80000,0xBFF80000,0xD2000000,0xD2000000,0xBFF80000, -0xBFF80000,0xD2000000,0xD2000000,0xD2000000,0x7E80000,0x9C00000,0x1A40001,0x5BFC0000,0x93FC0000,0xADFC0000,0xB5FC0000,0xC3FC0000,0x29FC0000,0x7BFC0000,0xADFC0000,0xD2000000,0xADFC0000,0x1BC0001,0x9FFC0000,0xD1F80000,0xDE000000,0x9FFC0000,0xD1F80000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0x9FFC0000,0xD1F80000,0xDE000000,0xD1F80000,0xDE000000, -0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0x9FFC0000,0xD1F80000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0xDE000000,0x63FC0000,0x1DC0000,0x1DC0000,0xB1FC0000,0xC9FC0000,0xD7F80000,0xDE000000,0xDE000000,0x89FC0000,0xBBFC0000,0xDDEC0000,0xDE000000, 
-0xC3FC0000,0x19C0274,0xFD94010B,0xE99000F4,0xDF9000F3,0xFD88009A,0xEB88004F,0xE1880067,0xE3880092,0xDF84004E,0xD9840092,0xFD7C00F5,0xEF7C0033,0xE1800051,0xE778004A,0xDF7C0002,0xD97C004D,0xDF7800F4,0xDB740053,0xD7740069,0xD37800F5,0x6BFC0274,0xF56400F3,0xDF7C00F4,0xEB600092,0xDF6C0049,0xD9700090,0xE95000F3,0xDF580032,0xD958004D,0xD36400F4,0xB7F80274, -0xDF3000F3,0xD92C0090,0xD32000F4,0xCC000278,0xFF8C0114,0xF59801FA,0xF79C0217,0xFF800026,0xF57C0002,0xE77C0002,0xDF7C0006,0xDD780006,0xFF880104,0xFF78001B,0xE1740035,0xD958004D,0xA3FC0274,0x1A800F3,0xFF9C0033,0xE79C0032,0xDF9C0032,0xF7940049,0xE9900002,0xDF940005,0xE1900049,0xDD90000E,0xD9900049,0x7DFC00F3,0xEF7C0032,0xDF8C0032,0xE7780049,0xDF7C0001, -0xD9800049,0xBFFC00F3,0xDF540032,0xD9500048,0xD20000F4,0x7DFC00F3,0xEF7C0032,0xDF8C0032,0xE7780049,0xDF7C0001,0xD9800049,0xBFFC00F3,0xDF540032,0xD9500048,0xD20000F4,0xBFFC00F3,0xDF540032,0xD9500048,0xD20000F4,0xD20000F4,0xFB980075,0xFBA400B9,0xFBA400B6,0xFF880013,0xF57C0001,0xE77C0001,0xDF840001,0xDF740004,0xFF900071,0xFD7C000D,0xE1700033,0xD9500048, -0xAFFC00F3,0x19000F3,0x19000F3,0x19000F3,0x19000F3,0xF188004A,0xF188004A,0xF188004A,0xDB84004A,0xDB84004A,0xD384004A,0xEF7C0033,0xEF7C0033,0xEF7C0033,0xDD7C0002,0xDD7C0002,0xD57C000E,0xD7780033,0xD7780033,0xD3780005,0xCD780035,0x59FC00F3,0x59FC00F3,0x59FC00F3,0xDF6C0049,0xDF6C0049,0xD374004A,0xDD5C0032,0xDD5C0032,0xD3640001,0xCF680034,0xADFC00F3, -0xADFC00F3,0xD3440048,0xCD380034,0xC80000F4,0xFF840062,0xFF8C00AB,0x19000F3,0xFF80000D,0xF17C0002,0xE57C0002,0xE37C0003,0xDB780001,0xFF800055,0xFF740006,0xDD780033,0xD3640001,0x99FC00F3,0x19C0032,0x19C0032,0x19C0032,0x19C0032,0xE5940001,0xE5940001,0xE5940001,0xD7940001,0xD7940001,0xD3900001,0x6BFC0032,0x6BFC0032,0x6BFC0032,0xDB800001,0xDB800001, -0xD3880000,0xB7F80032,0xB7F80032,0xD3680000,0xCC000034,0x6BFC0032,0x6BFC0032,0x6BFC0032,0xDB800001,0xDB800001,0xD3880000,0xB7F80032,0xB7F80032,0xD3680000,0xCC000034,0xB7F80032,0xB7F80032,0xD3680000,0xCC000034,0xCC000034,0xFF90000D,0xF5980019,0x19C0032,0xFD840002,0xEB840001,0xE3800000,0xDD840001,0xDB780000,0xFD880012,0xFB7C0002,0xA3FC0032,0xD3680000, -0xA3FC0032,0x1B0004A,0xF5A80001,0xE5A40001,0xDFA40001,0x8DFC0048,0xE9900001,0xDF980001,0xC7FC0048,0xDF740000,0xD8000048,0x8DFC0048,0xE9900001,0xDF980001,0xC7FC0048,0xDF740000,0xD8000048,0xC7FC0048,0xDF740000,0xD8000048,0xD8000048,0x8DFC0048,0xE9900001,0xDF980001,0xC7FC0048,0xDF740000,0xD8000048,0xC7FC0048,0xDF740000,0xD8000048,0xD8000048,0xC7FC0048, -0xDF740000,0xD8000048,0xD8000048,0xD8000048,0xF7A40029,0xFCC0048,0xFFAC002D,0xFF8C0008,0xF5780001,0xE9740000,0xDF840000,0xDF600000,0xFF980022,0xFD800005,0xE1880000,0xD8000048,0xB9FC0048,0x184004A,0x184004A,0x184004A,0x184004A,0x184004A,0x184004A,0x184004A,0x184004A,0x184004A,0x184004A,0xDF7C0001,0xDF7C0001,0xDF7C0001,0xDF7C0001,0xDF7C0001, -0xDF7C0001,0xCF780001,0xCF780001,0xCF780001,0xC9780001,0x49FC0048,0x49FC0048,0x49FC0048,0x49FC0048,0x49FC0048,0x49FC0048,0xD3640001,0xD3640001,0xD3640001,0xC96C0001,0xA7F80048,0xA7F80048,0xA7F80048,0xC9480000,0xC2000048,0xF9800022,0x184004A,0x184004A,0xFB7C0005,0xF37C0001,0xE77C0001,0xE77C0001,0xD97C0001,0xF57C0019,0xFD740002,0xD3780001,0xD3640001, -0x8FFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table58[] = { -0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0xA9FC0000, 
-0xA9FC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1A40000,0x1A40000,0x1A40000,0x4FFC0000,0x93FC0000,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000, -0x75FC0000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xD0000000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xD0000000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xD0000000,0xD0000000,0x7BC0000,0x1A00001,0x1A00001,0x5E40000,0x1FFC0000,0x53FC0000,0x53FC0000,0x8FFC0000,0x5E40000,0x1FFC0000,0xA9FC0000,0xBBFC0000, -0xA9FC0000,0x1B40001,0x1B40001,0x1B40001,0x1B40001,0x93FC0000,0x93FC0000,0x93FC0000,0xCBF80000,0xCBF80000,0xDA000000,0x93FC0000,0x93FC0000,0x93FC0000,0xCBF80000,0xCBF80000,0xDA000000,0xCBF80000,0xCBF80000,0xDA000000,0xDA000000,0x93FC0000,0x93FC0000,0x93FC0000,0xCBF80000,0xCBF80000,0xDA000000,0xCBF80000,0xCBF80000,0xDA000000,0xDA000000,0xCBF80000, -0xCBF80000,0xDA000000,0xDA000000,0xDA000000,0x3FC0000,0x1D40000,0x1B40001,0x79FC0000,0xA7FC0000,0xBDF80000,0xC3FC0000,0xCFF80000,0x51FC0000,0x93FC0000,0xBDF80000,0xDA000000,0xBDF80000,0x1CC0001,0xB7FC0000,0xDDF40000,0xE6000000,0xB7FC0000,0xDDF40000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xB7FC0000,0xDDF40000,0xE6000000,0xDDF40000,0xE6000000, -0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0xB7FC0000,0xDDF40000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0xE6000000,0x8BFC0000,0x1EC0000,0x1EC0000,0xC5FC0000,0xD7FC0000,0xE1FC0000,0xE6000000,0xE6000000,0xA7FC0000,0xCDFC0000,0xE5FC0000,0xE6000000, -0xD3FC0000,0x1AC0274,0xFFA4011F,0xF1A000F4,0xE7A000F3,0xFF9800B2,0xF398004F,0xE9980067,0xEB980092,0xE794004E,0xE1940092,0xFF900104,0xF78C0033,0xE9900051,0xEF88004A,0xE78C0002,0xE18C004D,0xE78800F4,0xE3840053,0xDF840069,0xDB8800F5,0x83FC0274,0xFD7400F3,0xE78C00F4,0xF3700092,0xE77C0049,0xE1800090,0xF16000F3,0xE7680032,0xE168004D,0xDB7400F4,0xC3F80274, -0xE74000F3,0xE13C0090,0xDB3000F4,0xD4000278,0xFFA00136,0xFDA801FA,0xFFAC0217,0xFF940053,0xFD8C0002,0xEF8C0002,0xE78C0006,0xE5880006,0xFF98012A,0xFF8C0042,0xE9840035,0xE168004D,0xB3FC0274,0x1B800F3,0xFFB0003E,0xEFAC0032,0xE7AC0032,0xFFA40049,0xF1A00002,0xE7A40005,0xE9A00049,0xE5A0000E,0xE1A00049,0x95FC00F3,0xF78C0032,0xE79C0032,0xEF880049,0xE78C0001, -0xE1900049,0xCBFC00F3,0xE7640032,0xE1600048,0xDA0000F4,0x95FC00F3,0xF78C0032,0xE79C0032,0xEF880049,0xE78C0001,0xE1900049,0xCBFC00F3,0xE7640032,0xE1600048,0xDA0000F4,0xCBFC00F3,0xE7640032,0xE1600048,0xDA0000F4,0xDA0000F4,0xFBAC0082,0xF3B400CB,0xF3B400CB,0xFF980029,0xFD8C0001,0xEF8C0001,0xE7940001,0xE7840004,0xFFA40082,0xFF90001E,0xE9800033,0xE1600048, -0xBFF800F3,0x1A000F3,0x1A000F3,0x1A000F3,0x1A000F3,0xF998004A,0xF998004A,0xF998004A,0xE394004A,0xE394004A,0xDB94004A,0xF78C0033,0xF78C0033,0xF78C0033,0xE58C0002,0xE58C0002,0xDD8C000E,0xDF880033,0xDF880033,0xDB880005,0xD5880035,0x71FC00F3,0x71FC00F3,0x71FC00F3,0xE77C0049,0xE77C0049,0xDB84004A,0xE56C0032,0xE56C0032,0xDB740001,0xD7780034,0xB9FC00F3, -0xB9FC00F3,0xDB540048,0xD5480034,0xD00000F4,0xFB980075,0xF9A000BA,0x1A000F3,0xFF90001A,0xF98C0002,0xED8C0002,0xEB8C0003,0xE3880001,0xFF940064,0xFD8C0019,0xE5880033,0xDB740001,0xA7FC00F3,0x1AC0032,0x1AC0032,0x1AC0032,0x1AC0032,0xEDA40001,0xEDA40001,0xEDA40001,0xDFA40001,0xDFA40001,0xDBA00001,0x83FC0032,0x83FC0032,0x83FC0032,0xE3900001,0xE3900001, 
-0xDB980000,0xC3F80032,0xC3F80032,0xDB780000,0xD4000034,0x83FC0032,0x83FC0032,0x83FC0032,0xE3900001,0xE3900001,0xDB980000,0xC3F80032,0xC3F80032,0xDB780000,0xD4000034,0xC3F80032,0xC3F80032,0xDB780000,0xD4000034,0xD4000034,0xFFA00014,0xFDA80019,0x1AC0032,0xFF980005,0xF3940001,0xEB900000,0xE5940001,0xE3880000,0xF5A00019,0xFB900005,0xB3FC0032,0xDB780000, -0xB3FC0032,0x1C0004A,0xFDB80001,0xEDB40001,0xE7B40001,0xA5FC0048,0xF1A00001,0xE7A80001,0xD3FC0048,0xE7840000,0xE0000048,0xA5FC0048,0xF1A00001,0xE7A80001,0xD3FC0048,0xE7840000,0xE0000048,0xD3FC0048,0xE7840000,0xE0000048,0xE0000048,0xA5FC0048,0xF1A00001,0xE7A80001,0xD3FC0048,0xE7840000,0xE0000048,0xD3FC0048,0xE7840000,0xE0000048,0xE0000048,0xD3FC0048, -0xE7840000,0xE0000048,0xE0000048,0xE0000048,0xFFB40029,0x1E00048,0xF9C00032,0xFFA00011,0xFD880001,0xF1840000,0xE7940000,0xE7700000,0xFFAC002D,0xFD9C000D,0xE9980000,0xE0000048,0xC7FC0048,0x194004A,0x194004A,0x194004A,0x194004A,0x194004A,0x194004A,0x194004A,0x194004A,0x194004A,0x194004A,0xE78C0001,0xE78C0001,0xE78C0001,0xE78C0001,0xE78C0001, -0xE78C0001,0xD7880001,0xD7880001,0xD7880001,0xD1880001,0x63FC0048,0x63FC0048,0x63FC0048,0x63FC0048,0x63FC0048,0x63FC0048,0xDB740001,0xDB740001,0xDB740001,0xD17C0001,0xB3F80048,0xB3F80048,0xB3F80048,0xD1580000,0xCA000048,0xF3940029,0x194004A,0x194004A,0xFD8C000A,0xFB8C0001,0xEF8C0001,0xEF8C0001,0xE18C0001,0xFD8C0019,0xF9880008,0xDB880001,0xDB740001, -0x9FF80048,}; -static const uint32_t g_etc1_to_bc7_m6_table59[] = { -0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0xB5FC0000, -0xB5FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x3B40000,0x3B40000,0x3B40000,0x69FC0000,0xA1FC0000,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x1B00001,0x8DFC0000,0x8DFC0000,0x8DFC0000,0x8DFC0000,0x8DFC0000, -0x8DFC0000,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xD8000000,0x8DFC0000,0x8DFC0000,0x8DFC0000,0x8DFC0000,0x8DFC0000,0x8DFC0000,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xD8000000,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xD8000000,0xD8000000,0xFCC0000,0x1B00001,0x1B00001,0x1F80000,0x47FC0000,0x71FC0000,0x71FC0000,0xA3FC0000,0x1F80000,0x47FC0000,0xB9FC0000,0xC7FC0000, -0xB9FC0000,0x1C40001,0x1C40001,0x1C40001,0x1C40001,0xABFC0000,0xABFC0000,0xABFC0000,0xD7F80000,0xD7F80000,0xE2000000,0xABFC0000,0xABFC0000,0xABFC0000,0xD7F80000,0xD7F80000,0xE2000000,0xD7F80000,0xD7F80000,0xE2000000,0xE2000000,0xABFC0000,0xABFC0000,0xABFC0000,0xD7F80000,0xD7F80000,0xE2000000,0xD7F80000,0xD7F80000,0xE2000000,0xE2000000,0xD7F80000, -0xD7F80000,0xE2000000,0xE2000000,0xE2000000,0x3BFC0000,0x1E40000,0x1C40001,0x97FC0000,0xBBFC0000,0xCBFC0000,0xD1FC0000,0xD9FC0000,0x77FC0000,0xABFC0000,0xCBFC0000,0xE2000000,0xCBFC0000,0x1DC0001,0xCFFC0000,0xE7FC0000,0xEE000000,0xCFFC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xCFFC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xEE000000, -0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0xCFFC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0xEE000000,0xB3FC0000,0x7FC0000,0x7FC0000,0xD9FC0000,0xE5FC0000,0xEBFC0000,0xEE000000,0xEE000000,0xC5FC0000,0xDDFC0000,0xEFD00000,0xEE000000, 
-0xE1FC0000,0x1BC0274,0xFFB4014C,0xF9B000F4,0xEFB000F3,0xFFB000E2,0xFBA8004F,0xF1A80067,0xF3A80092,0xEFA4004E,0xE9A40092,0xFFA40121,0xFF9C0033,0xF1A00051,0xF798004A,0xEF9C0002,0xE99C004D,0xEF9800F4,0xEB940053,0xE7940069,0xE39800F5,0x9BFC0274,0xFD9000FD,0xEF9C00F4,0xFB800092,0xEF8C0049,0xE9900090,0xF97000F3,0xEF780032,0xE978004D,0xE38400F4,0xCFF80274, -0xEF5000F3,0xE94C0090,0xE34000F4,0xDC000278,0xFFB4016B,0xF5B80224,0xF7BC0234,0xFFAC009E,0xFF9C0013,0xF79C0002,0xEF9C0006,0xED980006,0xFDB0016B,0xFFA00086,0xF1940035,0xE978004D,0xC1FC0274,0x1C800F3,0xFDC00053,0xF7BC0032,0xEFBC0032,0xFDB80059,0xF9B00002,0xEFB40005,0xF1B00049,0xEDB0000E,0xE9B00049,0xAFFC00F3,0xFF9C0032,0xEFAC0032,0xF7980049,0xEF9C0001, -0xE9A00049,0xD7FC00F3,0xEF740032,0xE9700048,0xE20000F4,0xAFFC00F3,0xFF9C0032,0xEFAC0032,0xF7980049,0xEF9C0001,0xE9A00049,0xD7FC00F3,0xEF740032,0xE9700048,0xE20000F4,0xD7FC00F3,0xEF740032,0xE9700048,0xE20000F4,0xE20000F4,0xFFBC0095,0xFBC400CB,0xFBC400CB,0xFFB00042,0xFFA40009,0xF79C0001,0xEFA40001,0xEF940004,0xFFB4009E,0xFFAC003D,0xF1900033,0xE9700048, -0xCDFC00F3,0x1B000F3,0x1B000F3,0x1B000F3,0x1B000F3,0xFFA8004B,0xFFA8004B,0xFFA8004B,0xEBA4004A,0xEBA4004A,0xE3A4004A,0xFF9C0033,0xFF9C0033,0xFF9C0033,0xED9C0002,0xED9C0002,0xE59C000E,0xE7980033,0xE7980033,0xE3980005,0xDD980035,0x89FC00F3,0x89FC00F3,0x89FC00F3,0xEF8C0049,0xEF8C0049,0xE394004A,0xED7C0032,0xED7C0032,0xE3840001,0xDF880034,0xC5FC00F3, -0xC5FC00F3,0xE3640048,0xDD580034,0xD80000F4,0xFFAC0082,0xFFAC00BE,0x1B000F3,0xFFA40033,0xFF9C0003,0xF59C0002,0xF39C0003,0xEB980001,0xFFA00079,0xFF9C002A,0xED980033,0xE3840001,0xB7FC00F3,0x1BC0032,0x1BC0032,0x1BC0032,0x1BC0032,0xF5B40001,0xF5B40001,0xF5B40001,0xE7B40001,0xE7B40001,0xE3B00001,0x9BFC0032,0x9BFC0032,0x9BFC0032,0xEBA00001,0xEBA00001, -0xE3A80000,0xCFF80032,0xCFF80032,0xE3880000,0xDC000034,0x9BFC0032,0x9BFC0032,0x9BFC0032,0xEBA00001,0xEBA00001,0xE3A80000,0xCFF80032,0xCFF80032,0xE3880000,0xDC000034,0xCFF80032,0xCFF80032,0xE3880000,0xDC000034,0xDC000034,0xFBB40019,0xF5B80022,0x1BC0032,0xFBAC000D,0xFBA40001,0xF3A00000,0xEDA40001,0xEB980000,0xFDB00019,0xFFA40008,0xC1FC0032,0xE3880000, -0xC1FC0032,0x1D0004A,0xFFC80005,0xF5C40001,0xEFC40001,0xBDFC0048,0xF9B00001,0xEFB80001,0xDFF80048,0xEF940000,0xE8000048,0xBDFC0048,0xF9B00001,0xEFB80001,0xDFF80048,0xEF940000,0xE8000048,0xDFF80048,0xEF940000,0xE8000048,0xE8000048,0xBDFC0048,0xF9B00001,0xEFB80001,0xDFF80048,0xEF940000,0xE8000048,0xDFF80048,0xEF940000,0xE8000048,0xE8000048,0xDFF80048, -0xEF940000,0xE8000048,0xE8000048,0xE8000048,0xFBC80034,0x1F00048,0xFFCC003A,0xFDBC0019,0xFFA40005,0xF9940000,0xEFA40000,0xEF800000,0xF9C80032,0xFBB80019,0xF1A80000,0xE8000048,0xD7FC0048,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0x1A4004A,0xEF9C0001,0xEF9C0001,0xEF9C0001,0xEF9C0001,0xEF9C0001, -0xEF9C0001,0xDF980001,0xDF980001,0xDF980001,0xD9980001,0x7BFC0048,0x7BFC0048,0x7BFC0048,0x7BFC0048,0x7BFC0048,0x7BFC0048,0xE3840001,0xE3840001,0xE3840001,0xD98C0001,0xBFF80048,0xBFF80048,0xBFF80048,0xD9680000,0xD2000048,0xFBA40029,0x1A4004A,0x1A4004A,0xFFA0000D,0xFD9C0002,0xF79C0001,0xF79C0001,0xE99C0001,0xFD9C0020,0xFB98000D,0xE3980001,0xE3840001, -0xADFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table60[] = { -0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0xC3F80000, 
-0xC3F80000,0xC3F80000,0xC3F80000,0xD4000001,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1C80000,0x1C80000,0x1C80000,0x83FC0000,0xB3FC0000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000, -0xA9FC0000,0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0xE0000001,0x9E00000,0x1C40000,0x1C40000,0x35FC0000,0x73FC0000,0x93FC0000,0x93FC0000,0xB9FC0000,0x35FC0000,0x73FC0000,0xC9FC0000,0xD5F80000, -0xC9FC0000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xE3FC0000,0xE3FC0000,0xEA000001,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xE3FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xE3FC0000,0xEA000001,0xEA000001,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xE3FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xE3FC0000,0xEA000001,0xEA000001,0xE3FC0000, -0xE3FC0000,0xEA000001,0xEA000001,0xEA000001,0x7BFC0000,0x1F80000,0x1D80000,0xB9FC0000,0xD1FC0000,0xDDF80000,0xDFFC0000,0xE5FC0000,0xA3FC0000,0xC7FC0000,0xDDF80000,0xEA000001,0xDDF80000,0x1F00000,0xEBFC0000,0xF5FC0000,0xF6000001,0xEBFC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xEBFC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF6000001, -0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xEBFC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xF6000001,0xDFFC0000,0x97FC0000,0x97FC0000,0xEFFC0000,0xF5F80000,0xF7F40000,0xF6000001,0xF6000001,0xE7FC0000,0xF1FC0000,0xF9C40000,0xF6000001, -0xF3FC0000,0x1CC0278,0xFFC80181,0xFFC400FD,0xF7C400F5,0xFFC40120,0xFFBC0069,0xFBBC0069,0xFDB80090,0xF7B8004D,0xF1B80092,0xFFBC015B,0xFFB4005D,0xFBB40053,0xFFAC0049,0xF7AC0002,0xF3AC004E,0xF7AC00F4,0xF5A80051,0xF1A80067,0xEDAC00F3,0xB7FC0274,0xFFAC011F,0xF7AC00F4,0xFF9C00AA,0xF99C004A,0xF1A40092,0xFF8800F5,0xF78C0033,0xF18C004F,0xED9400F4,0xDDF40274, -0xF76800F4,0xF1600092,0xED4C00F3,0xE6000274,0xFFC401B8,0xFFCC0220,0xFFCC0234,0xFFC000F1,0xFFB40059,0xFFAC0002,0xF9B00006,0xF7AC0006,0xFFC401A8,0xFFB800EA,0xF9A80035,0xF18C004F,0xD3FC0274,0x1D800F4,0xFFD00074,0xFFCC0035,0xF7CC0035,0xFFD00074,0xFFC40009,0xF9C40005,0xF9C4004A,0xF7C0000E,0xF1C4004A,0xC9FC00F3,0xFFB80042,0xF9BC0033,0xFFAC0049,0xF7AC0002, -0xF1B4004A,0xE5F800F3,0xF78C0033,0xF184004A,0xEC0000F3,0xC9FC00F3,0xFFB80042,0xF9BC0033,0xFFAC0049,0xF7AC0002,0xF1B4004A,0xE5F800F3,0xF78C0033,0xF184004A,0xEC0000F3,0xE5F800F3,0xF78C0033,0xF184004A,0xEC0000F3,0xEC0000F3,0xFFD000AB,0xF5D800DE,0xF5D800DD,0xFFC80076,0xFFBC002E,0xFFAC0002,0xF9B40001,0xF7A40003,0xFDD000B5,0xFFC80063,0xFBA00032,0xF184004A, -0xDFF800F3,0x1C400F4,0x1C400F4,0x1C400F4,0x1C400F4,0xFFBC0059,0xFFBC0059,0xFFBC0059,0xF3B80049,0xF3B80049,0xEBB80049,0xFFB0003E,0xFFB0003E,0xFFB0003E,0xF7AC0001,0xF7AC0001,0xEDB0000E,0xEFAC0032,0xEFAC0032,0xEBAC0005,0xE7AC0032,0xA5FC00F3,0xA5FC00F3,0xA5FC00F3,0xF99C0049,0xF99C0049,0xEBA80049,0xF78C0032,0xF78C0032,0xEB980002,0xE79C0032,0xD3FC00F3, -0xD3FC00F3,0xEB780049,0xE7640032,0xE00000F3,0xFFBC0095,0xF9C000CC,0x1C400F4,0xFDB80056,0xFFB00018,0xFFAC0001,0xFBAC0004,0xF3AC0001,0xFFB40091,0xFFB00045,0xF7A80032,0xEB980002,0xC7FC00F3,0x1CC0034,0x1CC0034,0x1CC0034,0x1CC0034,0xFFC40000,0xFFC40000,0xFFC40000,0xF1C40000,0xF1C40000,0xEBC40001,0xB7FC0032,0xB7FC0032,0xB7FC0032,0xF3B40001,0xF3B40001, 
-0xEBBC0001,0xDDF40032,0xDDF40032,0xEBA00001,0xE6000032,0xB7FC0032,0xB7FC0032,0xB7FC0032,0xF3B40001,0xF3B40001,0xEBBC0001,0xDDF40032,0xDDF40032,0xEBA00001,0xE6000032,0xDDF40032,0xDDF40032,0xEBA00001,0xE6000032,0xE6000032,0xFDC80020,0xFFCC0020,0x1CC0034,0xFDC00014,0xFDBC0005,0xFDB00000,0xF9B40000,0xF3AC0001,0xFFC40020,0xFBC00012,0xD3FC0032,0xEBA00001, -0xD3FC0032,0x1E40048,0xFFDC0014,0xFDD80001,0xF7D80001,0xD9FC0048,0xFFC80005,0xF7CC0001,0xEDF80048,0xF7A80001,0xF000004A,0xD9FC0048,0xFFC80005,0xF7CC0001,0xEDF80048,0xF7A80001,0xF000004A,0xEDF80048,0xF7A80001,0xF000004A,0xF000004A,0xD9FC0048,0xFFC80005,0xF7CC0001,0xEDF80048,0xF7A80001,0xF000004A,0xEDF80048,0xF7A80001,0xF000004A,0xF000004A,0xEDF80048, -0xF7A80001,0xF000004A,0xF000004A,0xF000004A,0xF5E4003D,0x37FC0048,0xFBE4003D,0xFFD40029,0xFDCC0019,0xFFAC0001,0xF7B80001,0xF7980001,0xFFDC0032,0xFFCC0028,0xF9BC0001,0xF000004A,0xE7FC0048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0x1B80048,0xFBAC0000,0xFBAC0000,0xFBAC0000,0xFBAC0000,0xFBAC0000, -0xFBAC0000,0xE7AC0001,0xE7AC0001,0xE7AC0001,0xE1AC0001,0x95FC0048,0x95FC0048,0x95FC0048,0x95FC0048,0x95FC0048,0x95FC0048,0xEB980001,0xEB980001,0xEB980001,0xE1A00001,0xCBFC0048,0xCBFC0048,0xCBFC0048,0xE17C0001,0xDA00004A,0xF3B40034,0x1B80048,0x1B80048,0xFBB40019,0xFDB00008,0xFFAC0001,0xFFAC0001,0xF3AC0000,0xF9B00029,0xFBAC0014,0xEFA80000,0xEB980001, -0xBFF80048,}; -static const uint32_t g_etc1_to_bc7_m6_table61[] = { -0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000, -0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x5D80000,0x5D80000,0x5D80000,0x9BFC0000,0xC1FC0000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000, -0xC1FC0000,0xE1F80000,0xE1F80000,0xE1F80000,0xE8000001,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xE1F80000,0xE1F80000,0xE1F80000,0xE8000001,0xE1F80000,0xE1F80000,0xE1F80000,0xE8000001,0xE8000001,0x1F40000,0x1D40000,0x1D40000,0x6DFC0000,0x9BFC0000,0xB1FC0000,0xB1FC0000,0xCDFC0000,0x6DFC0000,0x9BFC0000,0xD9FC0000,0xE1F80000, -0xD9FC0000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0xDFFC0000,0xDFFC0000,0xDFFC0000,0xEFFC0000,0xEFFC0000,0xF2000001,0xDFFC0000,0xDFFC0000,0xDFFC0000,0xEFFC0000,0xEFFC0000,0xF2000001,0xEFFC0000,0xEFFC0000,0xF2000001,0xF2000001,0xDFFC0000,0xDFFC0000,0xDFFC0000,0xEFFC0000,0xEFFC0000,0xF2000001,0xEFFC0000,0xEFFC0000,0xF2000001,0xF2000001,0xEFFC0000, -0xEFFC0000,0xF2000001,0xF2000001,0xF2000001,0xB5FC0000,0x57FC0000,0x1E80000,0xD7FC0000,0xE5FC0000,0xEBFC0000,0xEDFC0000,0xF1F80000,0xCBFC0000,0xDFFC0000,0xEBFC0000,0xF2000001,0xEBFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x1DC0255,0xFFDC01AD,0xFFD40125,0xFFD400F4,0xFFD00159,0xFFCC00B4,0xFFCC0074,0xFFCC0099,0xFFC8004C,0xF9C80085,0xFFD00179,0xFFC800AF,0xFFC40054,0xFFC00069,0xFFBC0001,0xF9C00045,0xFFBC00DD,0xFBBC004A,0xF9B8005A,0xF5BC00DE,0xCFFC0253,0xFFC00158,0xFFBC00F3,0xFFB800D3,0xFFAC004E,0xF9B40085,0xFFAC0106,0xFF9C0032,0xF99C0042,0xF5A400DF,0xE7FC0253, -0xFF7800F3,0xF9700085,0xF55C00DE,0xEE000253,0xFFD801C7,0xF7DC0229,0xF7DC0234,0xFFD40139,0xFFCC00B5,0xFFC0003C,0xFFC0000C,0xFFBC0005,0xFFD401B7,0xFFD00122,0xFFB80036,0xF99C0042,0xE1FC0253,0x1E800DD,0xFDE40095,0xFFE0004D,0xFFDC0034,0xFFDC0089,0xFFD8002C,0xFFD80008,0xFFD40041,0xFDD0000C,0xF9D4003D,0xDFFC00DD,0xFFD80068,0xFFCC0034,0xFFCC0059,0xFFBC0001, -0xF9C4003D,0xEFFC00DD,0xFF9C0032,0xF994003D,0xF40000DE,0xDFFC00DD,0xFFD80068,0xFFCC0034,0xFFCC0059,0xFFBC0001,0xF9C4003D,0xEFFC00DD,0xFF9C0032,0xF994003D,0xF40000DE,0xEFFC00DD,0xFF9C0032,0xF994003D,0xF40000DE,0xF40000DE,0xFFE400B5,0xFDE800C9,0xFDE800C8,0xFFDC0089,0xFFD40062,0xFFC8001E,0xFFC40001,0xFFB40002,0xFFE000BA,0xFFDC0082,0xFFB80036,0xF994003D, -0xEBFC00DD,0x1D400F4,0x1D400F4,0x1D400F4,0x1D400F4,0xFDCC0074,0xFDCC0074,0xFDCC0074,0xFBC80049,0xFBC80049,0xF3C80049,0xFFC40054,0xFFC40054,0xFFC40054,0xFFBC0001,0xFFBC0001,0xF5C0000E,0xF7BC0032,0xF7BC0032,0xF3BC0005,0xEFBC0032,0xBDFC00F3,0xBDFC00F3,0xBDFC00F3,0xFDB4004E,0xFDB4004E,0xF3B80049,0xFF9C0032,0xFF9C0032,0xF3A80002,0xEFAC0032,0xDFF800F3, -0xDFF800F3,0xF3880049,0xEF740032,0xE80000F3,0xFFCC00A8,0xFFCC00E0,0x1D400F4,0xFFCC0071,0xFFC40038,0xFFC00018,0xFFC0000C,0xFBBC0001,0xFDCC00A3,0xFFC40068,0xFFB80032,0xF3A80002,0xD7FC00F3,0x1DC0034,0x1DC0034,0x1DC0034,0x1DC0034,0xFDD80008,0xFDD80008,0xFDD80008,0xF9D40000,0xF9D40000,0xF3D40001,0xCFFC0032,0xCFFC0032,0xCFFC0032,0xFBC40001,0xFBC40001, -0xF3CC0001,0xE7FC0032,0xE7FC0032,0xF3B00001,0xEE000032,0xCFFC0032,0xCFFC0032,0xCFFC0032,0xFBC40001,0xFBC40001,0xF3CC0001,0xE7FC0032,0xE7FC0032,0xF3B00001,0xEE000032,0xE7FC0032,0xE7FC0032,0xF3B00001,0xEE000032,0xEE000032,0xFFD80022,0xF7DC0029,0x1DC0034,0xFDD80019,0xFDD0000D,0xFFC80005,0xFFC40001,0xFBBC0001,0xF1DC0029,0xFBD40019,0xE1FC0032,0xF3B00001, -0xE1FC0032,0x1F4003D,0xFFEC0028,0xFFEC000D,0xFFE80000,0xEFFC003D,0xFFE40014,0xFFDC0000,0xF7F8003D,0xFFB80000,0xF800003D,0xEFFC003D,0xFFE40014,0xFFDC0000,0xF7F8003D,0xFFB80000,0xF800003D,0xF7F8003D,0xFFB80000,0xF800003D,0xF800003D,0xEFFC003D,0xFFE40014,0xFFDC0000,0xF7F8003D,0xFFB80000,0xF800003D,0xF7F8003D,0xFFB80000,0xF800003D,0xF800003D,0xF7F8003D, -0xFFB80000,0xF800003D,0xF800003D,0xF800003D,0xFBF00034,0xA7FC003D,0xF3F4003D,0xFFEC0029,0xFFE80020,0xFFD40011,0xFFC80000,0xFFA80000,0xFFF00032,0xFFEC0032,0xFFD00001,0xF800003D,0xF5FC003D,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0x1C80048,0xFFBC0001,0xFFBC0001,0xFFBC0001,0xFFBC0001,0xFFBC0001, -0xFFBC0001,0xEFBC0001,0xEFBC0001,0xEFBC0001,0xE9BC0001,0xAFFC0048,0xAFFC0048,0xAFFC0048,0xAFFC0048,0xAFFC0048,0xAFFC0048,0xF3A80001,0xF3A80001,0xF3A80001,0xE9B00001,0xD7FC0048,0xD7FC0048,0xD7FC0048,0xE98C0001,0xE200004A,0xFBC40034,0x1C80048,0x1C80048,0xFBC40020,0xFDC00014,0xFDC00008,0xFDC00008,0xFBBC0000,0xF5C40032,0xFFBC001D,0xF7B80000,0xF3A80001, -0xCDFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table62[] = { -0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xDBF80000, 
-0xDBF80000,0xDBF80000,0xDBF80000,0xE4000001,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0xDE80000,0xDE80000,0xDE80000,0xB5FC0000,0xD1FC0000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000, -0xD9FC0000,0xEDF80000,0xEDF80000,0xEDF80000,0xF0000001,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xEDF80000,0xEDF80000,0xEDF80000,0xF0000001,0xEDF80000,0xEDF80000,0xEDF80000,0xF0000001,0xF0000001,0x37FC0000,0x1E40000,0x1E40000,0xA7FC0000,0xC1FC0000,0xCFFC0000,0xCFFC0000,0xE1FC0000,0xA7FC0000,0xC1FC0000,0xE7FC0000,0xEDF80000, -0xE7FC0000,0x1F80000,0x1F80000,0x1F80000,0x1F80000,0xF7FC0000,0xF7FC0000,0xF7FC0000,0xFBFC0000,0xFBFC0000,0xFA000001,0xF7FC0000,0xF7FC0000,0xF7FC0000,0xFBFC0000,0xFBFC0000,0xFA000001,0xFBFC0000,0xFBFC0000,0xFA000001,0xFA000001,0xF7FC0000,0xF7FC0000,0xF7FC0000,0xFBFC0000,0xFBFC0000,0xFA000001,0xFBFC0000,0xFBFC0000,0xFA000001,0xFA000001,0xFBFC0000, -0xFBFC0000,0xFA000001,0xFA000001,0xFA000001,0xEDFC0000,0xD7FC0000,0x1F80000,0xF5FC0000,0xF9FC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xF3FC0000,0xF7FC0000,0xFBFC0000,0xFA000001,0xFBFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1E80181,0xFFE80139,0xFFE40104,0xFFE400F4,0xFFE000FD,0xFFE000B2,0xFFDC0090,0xFFD80075,0xFFD80051,0xFDD8004D,0xFFDC00FD,0xFFDC00A8,0xFFDC0084,0xFFD80051,0xFFD00021,0xFFD00011,0xFFCC0072,0xFFCC0032,0xFDCC000E,0xF9CC005E,0xE3FC017F,0xFFD80118,0xFFD400F3,0xFFCC00A9,0xFFCC0069,0xFDC8004D,0xFFC000AE,0xFFB80045,0xFDB40006,0xF9B8005E,0xF1F8017F, -0xFFA800F3,0xFD90004D,0xF97C005E,0xF400017F,0xFFE40135,0xFDE8015D,0xFDE8016C,0xFFE000F5,0xFFDC00A3,0xFFD40055,0xFFD00038,0xFFD0000E,0xFFE40142,0xFFDC00E3,0xFFD0004E,0xFDB40006,0xEDFC017F,0x1F4005D,0xFDF0004D,0xFFF00038,0xFFEC0034,0xFFEC003D,0xFFEC0022,0xFFE80014,0xFFE40011,0xFFE40001,0xFDE40005,0xF1FC005D,0xFFEC0043,0xFFE40034,0xFFE40021,0xFFD80009, -0xFDD80005,0xF9F8005D,0xFFCC0032,0xFDB40005,0xF800005E,0xF1FC005D,0xFFEC0043,0xFFE40034,0xFFE40021,0xFFD80009,0xFDD80005,0xF9F8005D,0xFFCC0032,0xFDB40005,0xF800005E,0xF9F8005D,0xFFCC0032,0xFDB40005,0xF800005E,0xF800005E,0xFDF00051,0xF3F4005D,0xF3F4005D,0xFFEC0042,0xFFE80030,0xFFE4001B,0xFFE0000D,0xFFD80008,0xFBF00051,0xFFE80044,0xFFDC0033,0xFDB40005, -0xF7FC005D,0x1E400F4,0x1E400F4,0x1E400F4,0x1E400F4,0xFFDC0090,0xFFDC0090,0xFFDC0090,0xFFD80051,0xFFD80051,0xFBD80049,0xFFDC0084,0xFFDC0084,0xFFDC0084,0xFFD00021,0xFFD00021,0xFDD0000E,0xFFCC0032,0xFFCC0032,0xFBCC0005,0xF7CC0032,0xD5FC00F3,0xD5FC00F3,0xD5FC00F3,0xFFCC0069,0xFFCC0069,0xFBC80049,0xFFB80045,0xFFB80045,0xFBB80002,0xF7BC0032,0xEBF800F3, -0xEBF800F3,0xFB980049,0xF7840032,0xF00000F3,0xFBE000C9,0xF9E000E1,0x1E400F4,0xFFDC0095,0xFFD80068,0xFFD40045,0xFFD00038,0xFFD0000E,0xFDE000CA,0xFFDC0092,0xFFD0004D,0xFBB80002,0xE5FC00F3,0x1EC0034,0x1EC0034,0x1EC0034,0x1EC0034,0xFFE80014,0xFFE80014,0xFFE80014,0xFFE40001,0xFFE40001,0xFBE40001,0xE9FC0032,0xE9FC0032,0xE9FC0032,0xFFD80009,0xFFD80009, 
-0xFBDC0001,0xF3FC0032,0xF3FC0032,0xFBC00001,0xF6000032,0xE9FC0032,0xE9FC0032,0xE9FC0032,0xFFD80009,0xFFD80009,0xFBDC0001,0xF3FC0032,0xF3FC0032,0xFBC00001,0xF6000032,0xF3FC0032,0xF3FC0032,0xFBC00001,0xF6000032,0xF6000032,0xFBEC0029,0xFFEC0029,0x1EC0034,0xF9EC0029,0xFDE80020,0xFFE40012,0xFFE0000D,0xFFD80008,0xF9EC0029,0xFFE80020,0xF1FC0032,0xFBC00001, -0xF1FC0032,0x1FC0005,0xFFF80004,0xFFF80001,0xFFF80000,0xFBFC0005,0xFFF80002,0xFFF40000,0xFDF80005,0xFFEC0000,0xFC000005,0xFBFC0005,0xFFF80002,0xFFF40000,0xFDF80005,0xFFEC0000,0xFC000005,0xFDF80005,0xFFEC0000,0xFC000005,0xFC000005,0xFBFC0005,0xFFF80002,0xFFF40000,0xFDF80005,0xFFEC0000,0xFC000005,0xFDF80005,0xFFEC0000,0xFC000005,0xFC000005,0xFDF80005, -0xFFEC0000,0xFC000005,0xFC000005,0xFC000005,0xFFF80004,0xE7FC0005,0xF7FC0005,0xFBFC0005,0xFFF80002,0xFFF00001,0xFFF00000,0xFFE40000,0xF9FC0005,0xFBFC0005,0xFFF00000,0xFC000005,0xFDF80005,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0x1D80048,0xFFD00008,0xFFD00008,0xFFD00008,0xFFD00008,0xFFD00008, -0xFFD00008,0xF7CC0001,0xF7CC0001,0xF7CC0001,0xF1CC0001,0xC7FC0048,0xC7FC0048,0xC7FC0048,0xC7FC0048,0xC7FC0048,0xC7FC0048,0xFBB80001,0xFBB80001,0xFBB80001,0xF1C00001,0xE3FC0048,0xE3FC0048,0xE3FC0048,0xF19C0001,0xEA00004A,0xF5D8003D,0x1D80048,0x1D80048,0xFDD40029,0xFFD0001D,0xFDD00014,0xFDD00014,0xFFCC0004,0xFDD40032,0xFFD00022,0xFFC80000,0xFBB80001, -0xDDF80048,}; -static const uint32_t g_etc1_to_bc7_m6_table63[] = { -0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xE7F80000, -0xE7F80000,0xE7F80000,0xE7F80000,0xEC000001,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1FC0000,0x1FC0000,0x1FC0000,0xCDFC0000,0xDFFC0000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0xF1FC0000,0xF1FC0000,0xF1FC0000,0xF1FC0000,0xF1FC0000, -0xF1FC0000,0xF9F80000,0xF9F80000,0xF9F80000,0xF8000001,0xF1FC0000,0xF1FC0000,0xF1FC0000,0xF1FC0000,0xF1FC0000,0xF1FC0000,0xF9F80000,0xF9F80000,0xF9F80000,0xF8000001,0xF9F80000,0xF9F80000,0xF9F80000,0xF8000001,0xF8000001,0xB7FC0000,0x1F40000,0x1F40000,0xDFFC0000,0xE9FC0000,0xEFFC0000,0xEFFC0000,0xF3FC0000,0xDFFC0000,0xE9FC0000,0xF7FC0000,0xF9F80000, -0xF7FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x1F400C2,0xFDF000B2,0xFFF0009D,0xFFF00099,0xFFEC0098,0xFFEC007D,0xFFEC0074,0xFFEC0062,0xFFE80058,0xFFE80048,0xFFEC0089,0xFFE80069,0xFFE80059,0xFFE40040,0xFFE40030,0xFFE0001D,0xFFE40031,0xFFE0001D,0xFFDC0001,0xFDDC0011,0xEFFC00C2,0xFFEC00A7,0xFFE80099,0xFFE40070,0xFFE40060,0xFFDC0048,0xFFD80059,0xFFD80035,0xFFCC0009,0xFDCC0011,0xF7F800C2, -0xFFD40099,0xFFB80048,0xFD9C0011,0xF80000C2,0xFDF000B0,0xF3F400C2,0xF3F400C2,0xFFF0008E,0xFFEC006D,0xFFE8004A,0xFFE8003D,0xFFE40022,0xFFF000A2,0xFFF0008E,0xFFE0003F,0xFFCC0009,0xF5FC00C2,0x1FC0012,0xFFF80011,0xFFF8000E,0xFFF8000D,0xFFF8000C,0xFFF80009,0xFFF80008,0xFFF80006,0xFFF40004,0xFFF40000,0xFBFC0011,0xFFF8000E,0xFFF4000D,0xFFF00008,0xFFF00004, -0xFFF00000,0xFDF80011,0xFFEC000D,0xFFE00000,0xFC000011,0xFBFC0011,0xFFF8000E,0xFFF4000D,0xFFF00008,0xFFF00004,0xFFF00000,0xFDF80011,0xFFEC000D,0xFFE00000,0xFC000011,0xFDF80011,0xFFEC000D,0xFFE00000,0xFC000011,0xFC000011,0xFFF80011,0xF7FC0012,0xF7FC0012,0xFFF8000C,0xFFF4000E,0xFFF40006,0xFFF40005,0xFFEC0004,0xFFF8000C,0xFBFC0011,0xFFF0000D,0xFFE00000, -0xFDF80011,0x1F00099,0x1F00099,0x1F00099,0x1F00099,0xFFEC0074,0xFFEC0074,0xFFEC0074,0xFFE80058,0xFFE80058,0xFFE80048,0xFFE80059,0xFFE80059,0xFFE80059,0xFFE40030,0xFFE40030,0xFFE0001D,0xFFE0001D,0xFFE0001D,0xFFDC0001,0xFDDC000D,0xEBFC0099,0xEBFC0099,0xEBFC0099,0xFFE40060,0xFFE40060,0xFFDC0048,0xFFD80035,0xFFD80035,0xFFCC0009,0xFDCC000D,0xF5FC0099, -0xF5FC0099,0xFFB80048,0xFD9C000D,0xF600009A,0xFFEC0089,0xFFEC0090,0x1F00099,0xFFE80074,0xFFEC005D,0xFFE80046,0xFFE8003D,0xFFE40022,0xFBEC0089,0xFFE80072,0xFFE0003E,0xFFCC0009,0xF3FC0099,0x1F8000D,0x1F8000D,0x1F8000D,0x1F8000D,0xFFF80008,0xFFF80008,0xFFF80008,0xFFF40004,0xFFF40004,0xFFF40000,0xF7FC000D,0xF7FC000D,0xF7FC000D,0xFFF00004,0xFFF00004, -0xFFF00000,0xFBFC000D,0xFBFC000D,0xFFE00000,0xFC00000D,0xF7FC000D,0xF7FC000D,0xF7FC000D,0xFFF00004,0xFFF00004,0xFFF00000,0xFBFC000D,0xFBFC000D,0xFFE00000,0xFC00000D,0xFBFC000D,0xFBFC000D,0xFFE00000,0xFC00000D,0xFC00000D,0xF5FC000D,0xF5F8000D,0x1F8000D,0xFFF80008,0xFFF4000A,0xFFF40005,0xFFF40005,0xFFEC0004,0xFFF80008,0xFFF4000A,0xFBFC000D,0xFFE00000, -0xFBFC000D,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0x1E80048,0xFFE0001D,0xFFE0001D,0xFFE0001D,0xFFE0001D,0xFFE0001D, -0xFFE0001D,0xFFDC0001,0xFFDC0001,0xFFDC0001,0xF9DC0001,0xDFFC0048,0xDFFC0048,0xDFFC0048,0xDFFC0048,0xDFFC0048,0xDFFC0048,0xFFCC0009,0xFFCC0009,0xFFCC0009,0xF9D00001,0xEFFC0048,0xEFFC0048,0xEFFC0048,0xF9AC0001,0xF200004A,0xFDE8003D,0x1E80048,0x1E80048,0xFDE40034,0xFFE40029,0xFFE00028,0xFFE00028,0xFFE00014,0xF9E8003D,0xFFE40032,0xFFDC000A,0xFFCC0009, -0xEBFC0048,}; -static const uint32_t g_etc1_to_bc7_m6_table64[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, 
-0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x100001,0x100001,0x100001,0x100001,0x2180000,0x2180000,0x2180000,0x300000,0x300000,0x8000000,0x2180000,0x2180000,0x2180000,0x300000,0x300000,0x8000000,0x300000,0x300000,0x8000000,0x8000000,0x2180000,0x2180000,0x2180000,0x300000,0x300000,0x8000000,0x300000,0x300000,0x8000000,0x8000000,0x300000, -0x300000,0x8000000,0x8000000,0x8000000,0x140000,0x140000,0x100001,0x180000,0x1C0000,0x240000,0x280000,0x3C0000,0x2140000,0x2180000,0x240000,0x8000000,0x240000,0x380001,0x540000,0xAC0000,0x1C000000,0x540000,0xAC0000,0x1C000000,0xAC0000,0x1C000000,0x1C000000,0x540000,0xAC0000,0x1C000000,0xAC0000,0x1C000000, -0x1C000000,0xAC0000,0x1C000000,0x1C000000,0x1C000000,0x540000,0xAC0000,0x1C000000,0xAC0000,0x1C000000,0x1C000000,0xAC0000,0x1C000000,0x1C000000,0x1C000000,0xAC0000,0x1C000000,0x1C000000,0x1C000000,0x1C000000,0x480000,0x63C0000,0x63C0000,0x600000,0x8C0000,0x1140000,0x1C000000,0x1C000000,0x24C0000,0x6C0000,0xFD00000,0x1C000000, -0x780000,0x140232,0x4E080039,0x28080039,0x1C040039,0x380000C8,0x28000011,0x1C000001,0x1C0000C8,0x16000048,0x120000C8,0x260001B9,0x22000096,0x18000051,0x18000110,0x16000088,0x120000EC,0x120001B9,0x1000010D,0x10000149,0xC0001B9,0x1C0232,0x1E000109,0x18000091,0x18000150,0x140000BB,0x12000110,0x120001DD,0x10000131,0xE000165,0xC0001C9,0x300232, -0x10000186,0xA0001AA,0xA0001F2,0x8000232,0x7400007B,0xFA0400D2,0xFE0C0082,0x3400007B,0x2200008A,0x1A00007B,0x16000061,0x140000B5,0x460000F1,0x280000BD,0x160000CB,0xE000165,0x240232,0x1801BA,0x4E080029,0x28080029,0x1C080029,0x380000C8,0x28000011,0x1C000001,0x1C0000C8,0x16000048,0x120000C8,0x22401B9,0x22000096,0x18000051,0x18000110,0x16000088, -0x120000EC,0x4C01B9,0x1000010D,0x10000149,0xC0001B9,0x22401B9,0x22000096,0x18000051,0x18000110,0x16000088,0x120000EC,0x4C01B9,0x1000010D,0x10000149,0xC0001B9,0x4C01B9,0x1000010D,0x10000149,0xC0001B9,0xC0001B9,0x7400007B,0xFA0400CE,0xFE0C005E,0x3400007B,0x2200008A,0x1A00007B,0x16000061,0x140000B5,0x500000DB,0x280000B4,0x160000CA,0x10000149, -0x3401B9,0x40039,0x40039,0x40039,0x40039,0x1A000000,0x1A000000,0x1A000000,0xC000000,0xC000000,0x8000000,0xA000029,0xA000029,0xA000029,0xC000010,0xC000010,0x6000008,0x6000029,0x6000029,0x4000019,0x4000029,0x80036,0x80036,0x80036,0x6000018,0x6000018,0x600000C,0x600002D,0x600002D,0x400001D,0x400002D,0xC0036, -0xC0036,0x4000022,0x2000031,0x2000036,0x3600000A,0x88000000,0x40039,0x1C000011,0x1000000D,0xC00000D,0xC00000A,0xA000011,0x1600001A,0x10000013,0x6000029,0x400001D,0xC0036,0x80029,0x80029,0x80029,0x80029,0x1A000000,0x1A000000,0x1A000000,0xC000000,0xC000000,0x8000000,0xC0029,0xC0029,0xC0029,0xC000010,0xC000010, -0x6000008,0x140029,0x140029,0x4000019,0x4000029,0xC0029,0xC0029,0xC0029,0xC000010,0xC000010,0x6000008,0x140029,0x140029,0x4000019,0x4000029,0x140029,0x140029,0x4000019,0x4000029,0x4000029,0x3600000A,0x88000000,0x80029,0x1C000011,0x1000000D,0xC00000D,0xC00000A,0xA000011,0x16000019,0x10000012,0x100029,0x4000019, -0x100029,0x2400CA,0x42140001,0x26100001,0x1C100001,0x3800C8,0x28000011,0x1C000001,0x7000C8,0x16000048,0x120000C8,0x3800C8,0x28000011,0x1C000001,0x7000C8,0x16000048,0x120000C8,0x7000C8,0x16000048,0x120000C8,0x120000C8,0x3800C8,0x28000011,0x1C000001,0x7000C8,0x16000048,0x120000C8,0x7000C8,0x16000048,0x120000C8,0x120000C8,0x7000C8, 
-0x16000048,0x120000C8,0x120000C8,0x120000C8,0x7400004A,0x2800C8,0xF418000D,0x3C000041,0x2A000049,0x1E000041,0x18000034,0x18000061,0x50000062,0x34000050,0x1C000028,0x120000C8,0x5000C8,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table65[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x200001,0x200001,0x200001,0x200001,0x2300000,0x2300000,0x2300000,0x640000,0x640000,0x10000000,0x2300000,0x2300000,0x2300000,0x640000,0x640000,0x10000000,0x640000,0x640000,0x10000000,0x10000000,0x2300000,0x2300000,0x2300000,0x640000,0x640000,0x10000000,0x640000,0x640000,0x10000000,0x10000000,0x640000, -0x640000,0x10000000,0x10000000,0x10000000,0x280000,0x240000,0x200001,0x22C0000,0x380000,0x480000,0x500000,0x780000,0x2280000,0x2300000,0x480000,0x10000000,0x480000,0x480001,0x6C0000,0xDC0000,0x24000000,0x6C0000,0xDC0000,0x24000000,0xDC0000,0x24000000,0x24000000,0x6C0000,0xDC0000,0x24000000,0xDC0000,0x24000000, -0x24000000,0xDC0000,0x24000000,0x24000000,0x24000000,0x6C0000,0xDC0000,0x24000000,0xDC0000,0x24000000,0x24000000,0xDC0000,0x24000000,0x24000000,0x24000000,0xDC0000,0x24000000,0x24000000,0x24000000,0x24000000,0x5C0000,0xE4C0000,0xE4C0000,0x7C0000,0xB40000,0x1600000,0x24000000,0x24000000,0x640000,0x880000,0x17E00000,0x24000000, -0x9C0000,0x1C03A2,0x620C00C1,0x320C00C2,0x240C00C1,0x500000C8,0x34000001,0x26000014,0x260000CA,0x20000029,0x1A0000C8,0x360002AE,0x2E0000FE,0x220000A1,0x22000153,0x1E000098,0x18000110,0x1A0002AE,0x160001A9,0x160001C9,0x100002B1,0x2803A2,0x280001B3,0x2200011A,0x220001CC,0x1E0000FC,0x18000150,0x180002F1,0x160001E9,0x14000205,0x100002D5,0x4C03A2, -0x1600028A,0x10000282,0x1000032A,0xC0003A2,0x96000085,0xFE0C0142,0xF2140189,0x48000099,0x360000A2,0x26000098,0x1E000065,0x1C0000E0,0x6400015C,0x3C0000E9,0x1E000145,0x14000205,0x3403A2,0x2402AE,0x5E100091,0x32100091,0x24100091,0x500000C8,0x34000001,0x26040011,0x260000CA,0x20000029,0x1A0000C8,0x3402AE,0x2E0000FE,0x220000A1,0x22000153,0x1E000098, -0x18000110,0x6802AE,0x160001A9,0x160001C9,0x100002B1,0x3402AE,0x2E0000FE,0x220000A1,0x22000153,0x1E000098,0x18000110,0x6802AE,0x160001A9,0x160001C9,0x100002B1,0x6802AE,0x160001A9,0x160001C9,0x100002B1,0x100002B1,0x96000085,0xFE0C011E,0xF418010D,0x48000099,0x360000A2,0x26000098,0x1E000065,0x1C0000E0,0x64000138,0x3C0000D9,0x1E000141,0x160001C9, -0x4C02AE,0xC00C1,0xC00C1,0xC00C1,0xC00C1,0x32000000,0x32000000,0x32000000,0x18000000,0x18000000,0x10000000,0x16000091,0x16000091,0x16000091,0x12000034,0x12000034,0xE00001D,0xC000091,0xC000091,0xA000055,0x8000091,0x1000C1,0x1000C1,0x1000C1,0x12000058,0x12000058,0xC000030,0xC0000A1,0xC0000A1,0xA000065,0x6000099,0x2000C1, -0x2000C1,0xA000086,0x60000AE,0x40000C2,0x6000002D,0xF8000001,0xC00C1,0x3200003A,0x1E000034,0x1A00003A,0x1600002D,0xE000048,0x3400005E,0x26000054,0xE000092,0xA000065,0x1800C1,0x100091,0x100091,0x100091,0x100091,0x32000000,0x32000000,0x32000000,0x18000000,0x18000000,0x10000000,0x180091,0x180091,0x180091,0x12000034,0x12000034, 
-0xE00001D,0x2C0091,0x2C0091,0xA000055,0x8000091,0x180091,0x180091,0x180091,0x12000034,0x12000034,0xE00001D,0x2C0091,0x2C0091,0xA000055,0x8000091,0x2C0091,0x2C0091,0xA000055,0x8000091,0x8000091,0x6000002D,0xF8000001,0x100091,0x3200003A,0x1E000034,0x1A00003A,0x1600002D,0xE000048,0x34000055,0x26000050,0x200091,0xA000055, -0x200091,0x3400CA,0x4A240001,0x2E200001,0x24200001,0x5000C8,0x34000001,0x240C0001,0xA000C8,0x20000029,0x1A0000C8,0x5000C8,0x34000001,0x240C0001,0xA000C8,0x20000029,0x1A0000C8,0xA000C8,0x20000029,0x1A0000C8,0x1A0000C8,0x5000C8,0x34000001,0x240C0001,0xA000C8,0x20000029,0x1A0000C8,0xA000C8,0x20000029,0x1A0000C8,0x1A0000C8,0xA000C8, -0x20000029,0x1A0000C8,0x1A0000C8,0x1A0000C8,0xA400002D,0x43800C8,0xFC28000D,0x52000022,0x36000029,0x28000029,0x22000014,0x20000041,0x72000048,0x46000034,0x2400000D,0x1A0000C8,0x7000C8,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table66[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0x140000,0x140000,0x140000,0x140000,0x140000, -0x140000,0x240000,0x240000,0x240000,0x6000000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x240000,0x240000,0x240000,0x6000000,0x240000,0x240000,0x240000,0x6000000,0x6000000,0xE0C0000,0xC0001,0xC0001,0x100000,0x100000,0x2100000,0x2100000,0x2140000,0x100000,0x100000,0x1C0000,0x240000, -0x1C0000,0x300001,0x300001,0x300001,0x300001,0x480000,0x480000,0x480000,0x940000,0x940000,0x18000000,0x480000,0x480000,0x480000,0x940000,0x940000,0x18000000,0x940000,0x940000,0x18000000,0x18000000,0x480000,0x480000,0x480000,0x940000,0x940000,0x18000000,0x940000,0x940000,0x18000000,0x18000000,0x940000, -0x940000,0x18000000,0x18000000,0x18000000,0x4380000,0x2340000,0x300001,0x440000,0x540000,0x680000,0x780000,0xB40000,0x23C0000,0x480000,0x680000,0x18000000,0x680000,0x580001,0x840000,0x10C0000,0x2C000000,0x840000,0x10C0000,0x2C000000,0x10C0000,0x2C000000,0x2C000000,0x840000,0x10C0000,0x2C000000,0x10C0000,0x2C000000, -0x2C000000,0x10C0000,0x2C000000,0x2C000000,0x2C000000,0x840000,0x10C0000,0x2C000000,0x10C0000,0x2C000000,0x2C000000,0x10C0000,0x2C000000,0x2C000000,0x2C000000,0x10C0000,0x2C000000,0x2C000000,0x2C000000,0x2C000000,0x700000,0x600000,0x600000,0x2940000,0xDC0000,0x1B00000,0x2C000000,0x2C000000,0x2780000,0xA80000,0x1FF00000,0x2C000000, -0xBC0000,0x2804C3,0x7414014E,0x3C14014F,0x2C14014E,0x600800E1,0x4008001A,0x2C0C0049,0x300800E3,0x28040036,0x220800E1,0x4E0002D3,0x3A0000CD,0x2C0000A4,0x2E00011A,0x2800003D,0x220000D8,0x260002D3,0x2000017A,0x1C0001A0,0x180002D4,0x3804C1,0x34000216,0x2800016D,0x280001F7,0x280000E6,0x22000151,0x22000354,0x200001F3,0x1C000204,0x18000314,0x7004C1, -0x1C00031D,0x1A000305,0x160003C9,0x120004C1,0xC2000036,0xF41801F1,0xF8200266,0x6600003E,0x40000049,0x34000043,0x2A000015,0x24000084,0x82000121,0x5000009F,0x26000118,0x1C000204,0x5004C1,0x3402D3,0x662000A2,0x3A2000A2,0x2C2000A2,0x581000C9,0x3C100002,0x2E100015,0x300C00C9,0x2A080026,0x220C00C9,0x4C02D3,0x3A0000CD,0x2C0400A2,0x2E00011A,0x2800003D, 
-0x220000D8,0x9802D3,0x2000017A,0x1C0001A0,0x180002D4,0x4C02D3,0x3A0000CD,0x2C0400A2,0x2E00011A,0x2800003D,0x220000D8,0x9802D3,0x2000017A,0x1C0001A0,0x180002D4,0x9802D3,0x2000017A,0x1C0001A0,0x180002D4,0x180002D4,0xC2000036,0xF8200139,0xFC280126,0x6600003E,0x40000049,0x34000043,0x2A000015,0x24000084,0x900000DD,0x50000086,0x26000114,0x1C0001A0, -0x6C02D3,0x14014E,0x14014E,0x14014E,0x14014E,0x42080019,0x42080019,0x42080019,0x22080019,0x22080019,0x18080019,0x300000A2,0x300000A2,0x300000A2,0x22000011,0x22000011,0x18000001,0x160000A4,0x160000A4,0x14000041,0xE0000A4,0x20014D,0x20014D,0x20014D,0x1E00007D,0x1E00007D,0x14000041,0x120000D8,0x120000D8,0x1200006C,0xE0000BD,0x3C014D, -0x3C014D,0x100000C9,0xA000105,0xA00014D,0xA000000D,0xFE0C002E,0x14014E,0x50000019,0x32000019,0x2A000014,0x2600000D,0x20000025,0x56000063,0x3C000042,0x1E0000A6,0x1200006C,0x2C014D,0x2000A2,0x2000A2,0x2000A2,0x2000A2,0x3A100001,0x3A100001,0x3A100001,0x20100001,0x20100001,0x180C0001,0x3000A2,0x3000A2,0x3000A2,0x22000011,0x22000011, -0x18000001,0x5C00A2,0x5C00A2,0x14000041,0xE0000A4,0x3000A2,0x3000A2,0x3000A2,0x22000011,0x22000011,0x18000001,0x5C00A2,0x5C00A2,0x14000041,0xE0000A4,0x5C00A2,0x5C00A2,0x14000041,0xE0000A4,0xE0000A4,0xA000000D,0xF0100005,0x2000A2,0x50000019,0x32000019,0x2A000014,0x2600000D,0x20000025,0x64000041,0x3C000032,0x4000A2,0x14000041, -0x4000A2,0x4400CA,0x52340001,0x36300001,0x2C300001,0x6800C8,0x3C100001,0x2C1C0001,0xD000C8,0x2A000014,0x220000C8,0x6800C8,0x3C100001,0x2C1C0001,0xD000C8,0x2A000014,0x220000C8,0xD000C8,0x2A000014,0x220000C8,0x220000C8,0x6800C8,0x3C100001,0x2C1C0001,0xD000C8,0x2A000014,0x220000C8,0xD000C8,0x2A000014,0x220000C8,0x220000C8,0xD000C8, -0x2A000014,0x220000C8,0x220000C8,0x220000C8,0xC4040019,0xC4800C8,0xF4380012,0x6600000D,0x42040019,0x36000011,0x2C000004,0x28000029,0x94000032,0x58000019,0x2E000001,0x220000C8,0x9400C8,0x80019,0x80019,0x80019,0x80019,0x80019,0x80019,0x80019,0x80019,0x80019,0x80019,0x14000000,0x14000000,0x14000000,0x14000000,0x14000000, -0x14000000,0xA000000,0xA000000,0xA000000,0x6000000,0x80019,0x80019,0x80019,0x80019,0x80019,0x80019,0x6000008,0x6000008,0x6000008,0x6000004,0xC0019,0xC0019,0xC0019,0x400000D,0x2000019,0x68000000,0x80019,0x80019,0x2E000000,0x20000000,0x18000000,0x18000000,0x10000000,0x20000008,0x20000004,0xC000001,0x6000008, -0xC0019,}; -static const uint32_t g_etc1_to_bc7_m6_table67[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000, -0x2C0000,0x580000,0x580000,0x580000,0xE000000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x580000,0x580000,0x580000,0xE000000,0x580000,0x580000,0x580000,0xE000000,0xE000000,0x200000,0x1C0001,0x1C0001,0x6200000,0x240000,0x280000,0x280000,0x300000,0x6200000,0x240000,0x3C0000,0x580000, -0x3C0000,0x400001,0x400001,0x400001,0x400001,0x600000,0x600000,0x600000,0xC40000,0xC40000,0x20000000,0x600000,0x600000,0x600000,0xC40000,0xC40000,0x20000000,0xC40000,0xC40000,0x20000000,0x20000000,0x600000,0x600000,0x600000,0xC40000,0xC40000,0x20000000,0xC40000,0xC40000,0x20000000,0x20000000,0xC40000, 
-0xC40000,0x20000000,0x20000000,0x20000000,0x4C0000,0xA440000,0x400001,0x580000,0x26C0000,0x8C0000,0xA00000,0xF00000,0x4500000,0x600000,0x8C0000,0x20000000,0x8C0000,0x680001,0x9C0000,0x13C0000,0x34000000,0x9C0000,0x13C0000,0x34000000,0x13C0000,0x34000000,0x34000000,0x9C0000,0x13C0000,0x34000000,0x13C0000,0x34000000, -0x34000000,0x13C0000,0x34000000,0x34000000,0x34000000,0x9C0000,0x13C0000,0x34000000,0x13C0000,0x34000000,0x34000000,0x13C0000,0x34000000,0x34000000,0x34000000,0x13C0000,0x34000000,0x34000000,0x34000000,0x34000000,0x840000,0x700000,0x700000,0xB00000,0x1000000,0x3F00000,0x34000000,0x34000000,0x900000,0xC40000,0x29C40000,0x34000000, -0xE00000,0x340627,0x80200222,0x46200222,0x34200222,0x72100139,0x4C100076,0x361400C1,0x3A10013B,0x320C0082,0x2A100139,0x660002D3,0x4C0000A5,0x360800C8,0x3C0000E9,0x32000009,0x2A0000C9,0x320002D3,0x2C00012A,0x26000161,0x200002D4,0x480625,0x400002BE,0x34000225,0x3400024F,0x2E000106,0x2800017D,0x2E0003BC,0x280001FB,0x24000204,0x1E00034C,0x940625, -0x26000412,0x200003A5,0x1C00047D,0x18000625,0xF8000009,0xFA2402DD,0xFE2C038E,0x7E00000B,0x5600000E,0x40000009,0x36000004,0x30000032,0xA40000F6,0x66000062,0x340000D4,0x24000204,0x680625,0x4402D3,0x6E3000A2,0x423000A2,0x343000A2,0x602000C9,0x44200002,0x36200015,0x381C00C9,0x32180026,0x2A1C00C9,0x6402D3,0x4C0000A5,0x341400A2,0x3C0000E9,0x32000009, -0x2A0400C8,0xCC02D3,0x2C00012A,0x26000161,0x200002D4,0x6402D3,0x4C0000A5,0x341400A2,0x3C0000E9,0x32000009,0x2A0400C8,0xCC02D3,0x2C00012A,0x26000161,0x200002D4,0xCC02D3,0x2C00012A,0x26000161,0x200002D4,0x200002D4,0xF8000009,0xFE2C0141,0xF438013B,0x7E00000B,0x5600000E,0x40000009,0x34040002,0x30000032,0xAC000086,0x68000035,0x340000CB,0x26000161, -0x9002D3,0x200222,0x200222,0x200222,0x200222,0x52100071,0x52100071,0x52100071,0x2C100071,0x2C100071,0x20100071,0x480000A2,0x480000A2,0x480000A2,0x2E000001,0x2E000001,0x2204000C,0x220000A2,0x220000A2,0x1C000020,0x160000A4,0x300222,0x300222,0x300222,0x280000C6,0x280000C6,0x1E000081,0x1E000118,0x1E000118,0x1A00007E,0x160000E4,0x5C0222, -0x5C0222,0x16000145,0x14000178,0xE000225,0xE8000000,0xF21400BD,0x200222,0x7A000004,0x50000001,0x3C000002,0x36000000,0x2A000005,0x8200006D,0x56000032,0x2C0000AB,0x1A00007E,0x400222,0x3000A2,0x3000A2,0x3000A2,0x3000A2,0x42200001,0x42200001,0x42200001,0x28200001,0x28200001,0x201C0001,0x24400A2,0x24400A2,0x24400A2,0x2E000001,0x2E000001, -0x200C0000,0x8C00A2,0x8C00A2,0x1C000020,0x160000A4,0x24400A2,0x24400A2,0x24400A2,0x2E000001,0x2E000001,0x200C0000,0x8C00A2,0x8C00A2,0x1C000020,0x160000A4,0x8C00A2,0x8C00A2,0x1C000020,0x160000A4,0x160000A4,0xE8000000,0xF8200005,0x3000A2,0x7A000004,0x50000001,0x3C000002,0x36000000,0x2A000005,0x96000028,0x5A000012,0x6400A2,0x1C000020, -0x6400A2,0x5400CA,0x5A440001,0x3E400001,0x34400001,0x8000C8,0x44200001,0x342C0001,0x10000C8,0x32000005,0x2A0000C8,0x8000C8,0x44200001,0x342C0001,0x10000C8,0x32000005,0x2A0000C8,0x10000C8,0x32000005,0x2A0000C8,0x2A0000C8,0x8000C8,0x44200001,0x342C0001,0x10000C8,0x32000005,0x2A0000C8,0x10000C8,0x32000005,0x2A0000C8,0x2A0000C8,0x10000C8, -0x32000005,0x2A0000C8,0x2A0000C8,0x2A0000C8,0xF8000008,0x5C00C8,0xFC480012,0x7E000002,0x56000005,0x3E040005,0x34080000,0x32000014,0xB6000020,0x6E00000A,0x36100001,0x2A0000C8,0xB400C8,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x2C000000,0x2C000000,0x2C000000,0x2C000000,0x2C000000, 
-0x2C000000,0x16000000,0x16000000,0x16000000,0xE000000,0x140071,0x140071,0x140071,0x140071,0x140071,0x140071,0x12000028,0x12000028,0x12000028,0xC000014,0x240071,0x240071,0x240071,0xA000041,0x6000071,0xE8000000,0x100071,0x100071,0x68000000,0x48000000,0x36000000,0x36000000,0x24000000,0x52000022,0x42000011,0x1C000004,0x12000028, -0x1C0071,}; -static const uint32_t g_etc1_to_bc7_m6_table68[] = { -0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0x140000, -0x140000,0x140000,0x140000,0x2000001,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0xC0000,0x100000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000, -0x2440000,0x8C0000,0x8C0000,0x8C0000,0x16000001,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x8C0000,0x8C0000,0x8C0000,0x16000001,0x8C0000,0x8C0000,0x8C0000,0x16000001,0x16000001,0x340000,0x300000,0x300000,0x380000,0x3C0000,0x400000,0x400000,0x500000,0x380000,0x3C0000,0x640000,0x8C0000, -0x640000,0x540000,0x540000,0x540000,0x540000,0x7C0000,0x7C0000,0x7C0000,0xFC0000,0xFC0000,0x28000001,0x7C0000,0x7C0000,0x7C0000,0xFC0000,0xFC0000,0x28000001,0xFC0000,0xFC0000,0x28000001,0x28000001,0x7C0000,0x7C0000,0x7C0000,0xFC0000,0xFC0000,0x28000001,0xFC0000,0xFC0000,0x28000001,0x28000001,0xFC0000, -0xFC0000,0x28000001,0x28000001,0x28000001,0x2600000,0x4580000,0x540000,0x2700000,0x8C0000,0xB00000,0xCC0000,0x1340000,0x680000,0x7C0000,0xB00000,0x28000001,0xB00000,0x7C0000,0xB80000,0x1740000,0x3C000001,0xB80000,0x1740000,0x3C000001,0x1740000,0x3C000001,0x3C000001,0xB80000,0x1740000,0x3C000001,0x1740000,0x3C000001, -0x3C000001,0x1740000,0x3C000001,0x3C000001,0x3C000001,0xB80000,0x1740000,0x3C000001,0x1740000,0x3C000001,0x3C000001,0x1740000,0x3C000001,0x3C000001,0x3C000001,0x1740000,0x3C000001,0x3C000001,0x3C000001,0x3C000001,0x9C0000,0x840000,0x840000,0xD00000,0x12C0000,0xDF80000,0x3C000001,0x3C000001,0xA80000,0xE80000,0x31F40000,0x3C000001, -0x1080000,0x400738,0x8A3002D4,0x503002D4,0x3C3002D5,0x801C0190,0x581C00D5,0x3E24013A,0x461C0190,0x3A1C00D5,0x321C0192,0x7A0802D3,0x5A0800A2,0x421400FF,0x4A0800D1,0x3C080002,0x320C00D7,0x3C0802D4,0x380000FD,0x3000013B,0x2A0802D3,0x600734,0x5200031F,0x3C0802D4,0x46000252,0x3A000107,0x320001A4,0x3A0003C9,0x340001AE,0x2E0001B3,0x2800032C,0xC40734, -0x2C000479,0x2C0003F2,0x260004DF,0x20000734,0xFC100021,0xF23403D0,0xF63C049C,0x92080000,0x64080001,0x4A080002,0x3E0C0012,0x3A080011,0xD000008E,0x7E000018,0x3E0000AB,0x2E0001B3,0x8C0734,0x5402D4,0x764400A4,0x4A4400A4,0x3C4000A5,0x6C3000C8,0x4E300001,0x3E340015,0x403000C8,0x3A2C0026,0x323000CA,0x8002D3,0x580C00A2,0x3C2800A3,0x4C0000C9,0x3C080002, -0x341400CA,0x10002D3,0x380000ED,0x3000012B,0x2A0002D3,0x8002D3,0x580C00A2,0x3C2800A3,0x4C0000C9,0x3C080002,0x341400CA,0x10002D3,0x380000ED,0x3000012B,0x2A0002D3,0x10002D3,0x380000ED,0x3000012B,0x2A0002D3,0x2A0002D3,0xFE140009,0xFA44014C,0xFE4C0138,0x92080000,0x64080001,0x4A080002,0x3E140002,0x3A00000A,0xD000003D,0x7E000008,0x3E0000AB,0x3000012B, 
-0xB402D3,0x3002D4,0x3002D4,0x3002D4,0x3002D4,0x621C00C8,0x621C00C8,0x621C00C8,0x361C00C8,0x361C00C8,0x281C00C9,0x5A0800A2,0x5A0800A2,0x5A0800A2,0x3C080001,0x3C080001,0x2C0C0026,0x2E0800A2,0x2E0800A2,0x28040015,0x200800A2,0x4402D3,0x4402D3,0x4402D3,0x340000FE,0x340000FE,0x2A0000C9,0x2E00011D,0x2E00011D,0x2400005A,0x1E0000CE,0x8802D3, -0x8802D3,0x20000199,0x1C0001A3,0x160002D3,0xFE0C0009,0xFA240139,0x3002D4,0x92080000,0x60080001,0x4A080001,0x440C0006,0x36080002,0xB400004B,0x76000012,0x3E0000A2,0x2400005A,0x6002D3,0x4000A4,0x4000A4,0x4000A4,0x4000A4,0x4E300000,0x4E300000,0x4E300000,0x32300000,0x32300000,0x28300001,0x6000A2,0x6000A2,0x6000A2,0x38100001,0x38100001, -0x28200001,0xC400A2,0xC400A2,0x2600000D,0x200000A2,0x6000A2,0x6000A2,0x6000A2,0x38100001,0x38100001,0x28200001,0xC400A2,0xC400A2,0x2600000D,0x200000A2,0xC400A2,0xC400A2,0x2600000D,0x200000A2,0x200000A2,0xEA140000,0xF2340008,0x4000A4,0x92080000,0x5E0C0000,0x480C0000,0x3E140001,0x38040000,0xC200000D,0x7C000002,0x8C00A2,0x2600000D, -0x8C00A2,0x6800C8,0x66540000,0x46540001,0x3C540001,0x29800C8,0x4E300001,0x3C400001,0x13800C8,0x3C040001,0x320000CA,0x29800C8,0x4E300001,0x3C400001,0x13800C8,0x3C040001,0x320000CA,0x13800C8,0x3C040001,0x320000CA,0x320000CA,0x29800C8,0x4E300001,0x3C400001,0x13800C8,0x3C040001,0x320000CA,0x13800C8,0x3C040001,0x320000CA,0x320000CA,0x13800C8, -0x3C040001,0x320000CA,0x320000CA,0x320000CA,0xFE140008,0xE6C00C8,0xF65C0019,0x8E0C0000,0x64040001,0x4C040000,0x3C1C0001,0x3A00000A,0xD6040012,0x7E040002,0x40200000,0x320000CA,0xDC00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x40080000,0x40080000,0x40080000,0x40080000,0x40080000, -0x40080000,0x20080001,0x20080001,0x20080001,0x16080001,0x2800C8,0x2800C8,0x2800C8,0x2800C8,0x2800C8,0x2800C8,0x1E00002D,0x1E00002D,0x1E00002D,0x16000011,0x5000C8,0x5000C8,0x5000C8,0x10000062,0xC0000CA,0xFE0C0008,0x1C00C8,0x1C00C8,0x8E080000,0x64080000,0x4E080000,0x4E080000,0x34080000,0x92000022,0x74000009,0x28040001,0x1E00002D, -0x3800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table69[] = { -0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x240000,0x240000,0x240000,0x240000,0x240000,0x240000,0x240000,0x240000,0x240000,0x240000,0x440000, -0x440000,0x440000,0x440000,0xA000001,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x4180000,0x4180000,0x4180000,0x240000,0x300000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000, -0x25C0000,0xBC0000,0xBC0000,0xBC0000,0x1E000001,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0xBC0000,0xBC0000,0xBC0000,0x1E000001,0xBC0000,0xBC0000,0xBC0000,0x1E000001,0x1E000001,0x440000,0x400000,0x400000,0x4480000,0x500000,0x2540000,0x2540000,0x6C0000,0x4480000,0x500000,0x880000,0xBC0000, -0x880000,0x640000,0x640000,0x640000,0x640000,0x940000,0x940000,0x940000,0x12C0000,0x12C0000,0x30000001,0x940000,0x940000,0x940000,0x12C0000,0x12C0000,0x30000001,0x12C0000,0x12C0000,0x30000001,0x30000001,0x940000,0x940000,0x940000,0x12C0000,0x12C0000,0x30000001,0x12C0000,0x12C0000,0x30000001,0x30000001,0x12C0000, 
-0x12C0000,0x30000001,0x30000001,0x30000001,0x740000,0xC680000,0x640000,0x880000,0xA80000,0xD40000,0xF40000,0x1700000,0x7C0000,0x940000,0xD40000,0x30000001,0xD40000,0x8C0000,0xD00000,0x1A40000,0x44000001,0xD00000,0x1A40000,0x44000001,0x1A40000,0x44000001,0x44000001,0xD00000,0x1A40000,0x44000001,0x1A40000,0x44000001, -0x44000001,0x1A40000,0x44000001,0x44000001,0x44000001,0xD00000,0x1A40000,0x44000001,0x1A40000,0x44000001,0x44000001,0x1A40000,0x44000001,0x44000001,0x44000001,0x1A40000,0x44000001,0x44000001,0x44000001,0x44000001,0xB00000,0x2940000,0x2940000,0xEC0000,0x1540000,0x17F80000,0x44000001,0x44000001,0x2BC0000,0x1040000,0x3BC80000,0x44000001, -0x1280000,0x500738,0x924002D4,0x584002D4,0x444002D5,0x882C0190,0x602C00D5,0x4634013A,0x4E2C0190,0x422C00D5,0x3A2C0192,0x821802D3,0x621800A2,0x4A2400FF,0x521800D1,0x44180002,0x3A1C00D7,0x441802D4,0x401000FD,0x3810013B,0x321802D3,0x780734,0x620002DD,0x441802D4,0x520001E2,0x460000CF,0x3A080192,0x46000361,0x4000011E,0x3800012F,0x300002EB,0xF40734, -0x38000401,0x32000352,0x2C000477,0x28000734,0xFE20002A,0xFA4403D0,0xFE4C049C,0x9A180000,0x6C180001,0x52180002,0x461C0012,0x42180011,0xF6000022,0x8E080002,0x480C00A9,0x3800012F,0xAC0734,0x6402D4,0x7E5400A4,0x525400A4,0x445000A5,0x744000C8,0x56400001,0x46440015,0x484000C8,0x423C0026,0x3A4000CA,0x9802D3,0x601C00A2,0x443800A3,0x541000C9,0x44180002, -0x3C2400CA,0x13002D3,0x420000CB,0x380000FE,0x320002D3,0x9802D3,0x601C00A2,0x443800A3,0x541000C9,0x44180002,0x3C2400CA,0x13002D3,0x420000CB,0x380000FE,0x320002D3,0x13002D3,0x420000CB,0x380000FE,0x320002D3,0x320002D3,0xF8280011,0xF254015E,0xF65C014D,0x9A180000,0x6C180001,0x52180002,0x46240002,0x440C0005,0xFA000013,0x8E080001,0x4A0400A2,0x380000FE, -0xD802D3,0x4002D4,0x4002D4,0x4002D4,0x4002D4,0x6A2C00C8,0x6A2C00C8,0x6A2C00C8,0x3E2C00C8,0x3E2C00C8,0x302C00C9,0x621800A2,0x621800A2,0x621800A2,0x44180001,0x44180001,0x341C0026,0x361800A2,0x361800A2,0x30140015,0x281800A2,0x5C02D3,0x5C02D3,0x5C02D3,0x460000CE,0x460000CE,0x321000C9,0x3A0000D5,0x3A0000D5,0x2E000012,0x280000A3,0xB802D3, -0xB802D3,0x2A00015B,0x2400016B,0x1E0002D3,0xFC1C0011,0xF234014C,0x4002D4,0x9A180000,0x68180001,0x52180001,0x4C1C0006,0x3E180002,0xE400000D,0x8E080001,0x461000A2,0x2E000012,0x8002D3,0x5000A4,0x5000A4,0x5000A4,0x5000A4,0x56400000,0x56400000,0x56400000,0x3A400000,0x3A400000,0x30400001,0x7800A2,0x7800A2,0x7800A2,0x40200001,0x40200001, -0x30300001,0xF400A2,0xF400A2,0x30000002,0x280000A2,0x7800A2,0x7800A2,0x7800A2,0x40200001,0x40200001,0x30300001,0xF400A2,0xF400A2,0x30000002,0x280000A2,0xF400A2,0xF400A2,0x30000002,0x280000A2,0x280000A2,0xF2240000,0xFA440008,0x5000A4,0x9A180000,0x661C0000,0x501C0000,0x46240001,0x40140000,0xF4000004,0x8E080000,0xAC00A2,0x30000002, -0xAC00A2,0x7800C8,0x6E640000,0x4E640001,0x44640001,0x2B000C8,0x56400001,0x44500001,0x16800C8,0x44140001,0x3A0000CA,0x2B000C8,0x56400001,0x44500001,0x16800C8,0x44140001,0x3A0000CA,0x16800C8,0x44140001,0x3A0000CA,0x3A0000CA,0x2B000C8,0x56400001,0x44500001,0x16800C8,0x44140001,0x3A0000CA,0x16800C8,0x44140001,0x3A0000CA,0x3A0000CA,0x16800C8, -0x44140001,0x3A0000CA,0x3A0000CA,0x3A0000CA,0xF828000D,0x8000C8,0xFE6C0019,0x961C0000,0x6C140001,0x54140000,0x442C0001,0x44000002,0xF8040008,0x90080000,0x48300000,0x3A0000CA,0xFC00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x2C00C8,0x48180000,0x48180000,0x48180000,0x48180000,0x48180000, 
-0x48180000,0x28180001,0x28180001,0x28180001,0x1E180001,0x4000C8,0x4000C8,0x4000C8,0x4000C8,0x4000C8,0x4000C8,0x2E000009,0x2E000009,0x2E000009,0x1E040001,0x8000C8,0x8000C8,0x8000C8,0x1C00003A,0x140000CA,0xF61C000D,0x2C00C8,0x2C00C8,0x96180000,0x6C180000,0x56180000,0x56180000,0x3C180000,0xE0000004,0x86080001,0x30140001,0x2E000009, -0x5C00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table70[] = { -0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x740000, -0x740000,0x740000,0x740000,0x12000001,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0xC280000,0xC280000,0xC280000,0x3C0000,0x540000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000, -0x2740000,0xF00000,0xF00000,0xF00000,0x26000001,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0xF00000,0xF00000,0xF00000,0x26000001,0xF00000,0xF00000,0xF00000,0x26000001,0x26000001,0x2540000,0x500000,0x500000,0x5C0000,0x640000,0x6C0000,0x6C0000,0x840000,0x5C0000,0x640000,0xA80000,0xF00000, -0xA80000,0x740000,0x740000,0x740000,0x740000,0xAC0000,0xAC0000,0xAC0000,0x15C0000,0x15C0000,0x38000001,0xAC0000,0xAC0000,0xAC0000,0x15C0000,0x15C0000,0x38000001,0x15C0000,0x15C0000,0x38000001,0x38000001,0xAC0000,0xAC0000,0xAC0000,0x15C0000,0x15C0000,0x38000001,0x15C0000,0x15C0000,0x38000001,0x38000001,0x15C0000, -0x15C0000,0x38000001,0x38000001,0x38000001,0x6840000,0x7C0000,0x740000,0x9C0000,0x2C00000,0xF40000,0x11C0000,0x1AC0000,0x900000,0xAC0000,0xF40000,0x38000001,0xF40000,0x9C0000,0xE80000,0x1D80000,0x4C000001,0xE80000,0x1D80000,0x4C000001,0x1D80000,0x4C000001,0x4C000001,0xE80000,0x1D80000,0x4C000001,0x1D80000,0x4C000001, -0x4C000001,0x1D80000,0x4C000001,0x4C000001,0x4C000001,0xE80000,0x1D80000,0x4C000001,0x1D80000,0x4C000001,0x4C000001,0x1D80000,0x4C000001,0x4C000001,0x4C000001,0x1D80000,0x4C000001,0x4C000001,0x4C000001,0x4C000001,0xC40000,0xAA40000,0xAA40000,0x1040000,0x17C0000,0x21FC0000,0x4C000001,0x4C000001,0xD40000,0x1240000,0x43D80000,0x4C000001, -0x14C0000,0x600738,0x9A5002D4,0x605002D4,0x4C5002D5,0x903C0190,0x683C00D5,0x4E44013A,0x563C0190,0x4A3C00D5,0x423C0192,0x8A2802D3,0x6A2800A2,0x523400FF,0x5A2800D1,0x4C280002,0x422C00D7,0x4C2802D4,0x482000FD,0x4020013B,0x3A2802D3,0x900734,0x720402D3,0x4C2802D4,0x6200019A,0x4E1000CF,0x42180192,0x52000319,0x4A0000C6,0x420000E7,0x3A0002D4,0x1240734, -0x440003A9,0x3E0002E2,0x3600041C,0x30000734,0xFE34003D,0xF2540402,0xF65C04C1,0xA2280000,0x74280001,0x5A280002,0x4E2C0012,0x4A280011,0xFE100022,0x96180002,0x501C00A9,0x420000E7,0xD00734,0x7402D4,0x866400A4,0x5A6400A4,0x4C6000A5,0x7C5000C8,0x5E500001,0x4E540015,0x505000C8,0x4A4C0026,0x425000CA,0xB002D3,0x682C00A2,0x4C4800A3,0x5C2000C9,0x4C280002, -0x443400CA,0x16402D3,0x4A0000AD,0x420000E3,0x3A0002D3,0xB002D3,0x682C00A2,0x4C4800A3,0x5C2000C9,0x4C280002,0x443400CA,0x16402D3,0x4A0000AD,0x420000E3,0x3A0002D3,0x16402D3,0x4A0000AD,0x420000E3,0x3A0002D3,0x3A0002D3,0xFC380018,0xFA64015E,0xFE6C014D,0xA2280000,0x74280001,0x5A280002,0x4E340002,0x4C1C0005,0xFE100019,0x96180001,0x521400A2,0x420000E3, 
-0xF802D3,0x5002D4,0x5002D4,0x5002D4,0x5002D4,0x723C00C8,0x723C00C8,0x723C00C8,0x463C00C8,0x463C00C8,0x383C00C9,0x6A2800A2,0x6A2800A2,0x6A2800A2,0x4C280001,0x4C280001,0x3C2C0026,0x3E2800A2,0x3E2800A2,0x38240015,0x302800A2,0x7402D3,0x7402D3,0x7402D3,0x520800C9,0x520800C9,0x3A2000C9,0x460000AD,0x460000AD,0x38040002,0x300C00A2,0xE802D3, -0xE802D3,0x32000119,0x2C000126,0x260002D3,0xFE2C0016,0xFA44014C,0x5002D4,0xA2280000,0x70280001,0x5A280001,0x542C0006,0x46280002,0xFA0C0005,0x96180001,0x4E2000A2,0x38040002,0xA402D3,0x6000A4,0x6000A4,0x6000A4,0x6000A4,0x5E500000,0x5E500000,0x5E500000,0x42500000,0x42500000,0x38500001,0x9000A2,0x9000A2,0x9000A2,0x48300001,0x48300001, -0x38400001,0x12400A2,0x12400A2,0x38080001,0x300000A2,0x9000A2,0x9000A2,0x9000A2,0x48300001,0x48300001,0x38400001,0x12400A2,0x12400A2,0x38080001,0x300000A2,0x12400A2,0x12400A2,0x38080001,0x300000A2,0x300000A2,0xFA340000,0xF254000D,0x6000A4,0xA2280000,0x6E2C0000,0x582C0000,0x4E340001,0x48240000,0xFE0C0002,0x96180000,0xD000A2,0x38080001, -0xD000A2,0x8800C8,0x76740000,0x56740001,0x4C740001,0xC800C8,0x5E500001,0x4C600001,0x19800C8,0x4C240001,0x420000CA,0xC800C8,0x5E500001,0x4C600001,0x19800C8,0x4C240001,0x420000CA,0x19800C8,0x4C240001,0x420000CA,0x420000CA,0xC800C8,0x5E500001,0x4C600001,0x19800C8,0x4C240001,0x420000CA,0x19800C8,0x4C240001,0x420000CA,0x420000CA,0x19800C8, -0x4C240001,0x420000CA,0x420000CA,0x420000CA,0xFA3C0012,0x9000C8,0xF67C0020,0x9E2C0000,0x74240001,0x5C240000,0x4C3C0001,0x4C080001,0xFC14000A,0x98180000,0x50400000,0x420000CA,0x12000C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x3C00C8,0x50280000,0x50280000,0x50280000,0x50280000,0x50280000, -0x50280000,0x30280001,0x30280001,0x30280001,0x26280001,0x5800C8,0x5800C8,0x5800C8,0x5800C8,0x5800C8,0x5800C8,0x38040001,0x38040001,0x38040001,0x26140001,0xB000C8,0xB000C8,0xB000C8,0x24000022,0x1C0000CA,0xFE2C000D,0x3C00C8,0x3C00C8,0x9E280000,0x74280000,0x5E280000,0x5E280000,0x44280000,0xF40C0001,0x8E180001,0x38240001,0x38040001, -0x7C00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table71[] = { -0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0xA40000, -0xA40000,0xA40000,0xA40000,0x1A000001,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x3C0000,0x3C0000,0x3C0000,0x2500000,0x740000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000, -0x28C0000,0x1200000,0x1200000,0x1200000,0x2E000001,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x1200000,0x1200000,0x1200000,0x2E000001,0x1200000,0x1200000,0x1200000,0x2E000001,0x2E000001,0xA640000,0x600000,0x600000,0x700000,0x780000,0x2800000,0x2800000,0xA00000,0x700000,0x780000,0xCC0000,0x1200000, -0xCC0000,0x840000,0x840000,0x840000,0x840000,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x40000001,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x40000001,0x18C0000,0x18C0000,0x40000001,0x40000001,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x40000001,0x18C0000,0x18C0000,0x40000001,0x40000001,0x18C0000, 
-0x18C0000,0x40000001,0x40000001,0x40000001,0x2980000,0x8C0000,0x840000,0xB40000,0xDC0000,0x1180000,0x1400000,0x1E80000,0xA40000,0xC40000,0x1180000,0x40000001,0x1180000,0xAC0000,0x1000000,0x5F80000,0x54000001,0x1000000,0x5F80000,0x54000001,0x5F80000,0x54000001,0x54000001,0x1000000,0x5F80000,0x54000001,0x5F80000,0x54000001, -0x54000001,0x5F80000,0x54000001,0x54000001,0x54000001,0x1000000,0x5F80000,0x54000001,0x5F80000,0x54000001,0x54000001,0x5F80000,0x54000001,0x54000001,0x54000001,0x5F80000,0x54000001,0x54000001,0x54000001,0x54000001,0xD80000,0xB80000,0xB80000,0x1200000,0x1A40000,0x2BFC0000,0x54000001,0x54000001,0x2E80000,0x3400000,0x4BE80000,0x54000001, -0x16C0000,0x700738,0xA26002D4,0x686002D4,0x546002D5,0x984C0190,0x704C00D5,0x5654013A,0x5E4C0190,0x524C00D5,0x4A4C0192,0x923802D3,0x723800A2,0x5A4400FF,0x623800D1,0x54380002,0x4A3C00D7,0x543802D4,0x503000FD,0x4830013B,0x423802D3,0xA80734,0x7A1402D3,0x543802D4,0x6C080192,0x562000CF,0x4A280192,0x620002E3,0x540000A3,0x4A0000D7,0x421002D4,0x1580734, -0x4A000361,0x44000272,0x3E0003BF,0x38000734,0xFE440056,0xFA640402,0xFE6C04C1,0xAA380000,0x7C380001,0x62380002,0x563C0012,0x52380011,0xFE24003B,0x9E280002,0x582C00A9,0x4A0000D7,0xF00734,0x8402D4,0x8E7400A4,0x627400A4,0x547000A5,0x846000C8,0x66600001,0x56640015,0x586000C8,0x525C0026,0x4A6000CA,0x2C402D3,0x703C00A2,0x545800A3,0x643000C9,0x54380002, -0x4C4400CA,0x19402D3,0x540000A3,0x4A0000CE,0x420002D3,0x2C402D3,0x703C00A2,0x545800A3,0x643000C9,0x54380002,0x4C4400CA,0x19402D3,0x540000A3,0x4A0000CE,0x420002D3,0x19402D3,0x540000A3,0x4A0000CE,0x420002D3,0x420002D3,0xFC4C001D,0xF2740174,0xF67C0164,0xAA380000,0x7C380001,0x62380002,0x56440002,0x542C0005,0xFE240022,0x9E280001,0x5A2400A2,0x4A0000CE, -0x11C02D3,0x6002D4,0x6002D4,0x6002D4,0x6002D4,0x7A4C00C8,0x7A4C00C8,0x7A4C00C8,0x4E4C00C8,0x4E4C00C8,0x404C00C9,0x723800A2,0x723800A2,0x723800A2,0x54380001,0x54380001,0x443C0026,0x463800A2,0x463800A2,0x40340015,0x383800A2,0x8C02D3,0x8C02D3,0x8C02D3,0x5A1800C9,0x5A1800C9,0x423000C9,0x540000A2,0x540000A2,0x40140002,0x381C00A2,0x11802D3, -0x11802D3,0x3E0000F1,0x360000FB,0x2E0002D3,0xF840001D,0xF2540161,0x6002D4,0xAA380000,0x78380001,0x62380001,0x5C3C0006,0x4E380002,0xFC1C000A,0x9E280001,0x563000A2,0x40140002,0xC802D3,0x7000A4,0x7000A4,0x7000A4,0x7000A4,0x66600000,0x66600000,0x66600000,0x4A600000,0x4A600000,0x40600001,0xA800A2,0xA800A2,0xA800A2,0x50400001,0x50400001, -0x40500001,0x15800A2,0x15800A2,0x40180001,0x380000A2,0xA800A2,0xA800A2,0xA800A2,0x50400001,0x50400001,0x40500001,0x15800A2,0x15800A2,0x40180001,0x380000A2,0x15800A2,0x15800A2,0x40180001,0x380000A2,0x380000A2,0xF6480001,0xFA64000D,0x7000A4,0xAA380000,0x763C0000,0x603C0000,0x56440001,0x50340000,0xF6240005,0x9E280000,0xF000A2,0x40180001, -0xF000A2,0x9800C8,0x7E840000,0x5E840001,0x54840001,0xE000C8,0x66600001,0x54700001,0x1CC00C8,0x54340001,0x4A0000CA,0xE000C8,0x66600001,0x54700001,0x1CC00C8,0x54340001,0x4A0000CA,0x1CC00C8,0x54340001,0x4A0000CA,0x4A0000CA,0xE000C8,0x66600001,0x54700001,0x1CC00C8,0x54340001,0x4A0000CA,0x1CC00C8,0x54340001,0x4A0000CA,0x4A0000CA,0x1CC00C8, -0x54340001,0x4A0000CA,0x4A0000CA,0x4A0000CA,0xFC4C0014,0x8A000C8,0xFE8C0020,0xA63C0000,0x7C340001,0x64340000,0x544C0001,0x54180001,0xFE2C000D,0xA0280000,0x58500000,0x4A0000CA,0x14000C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x4C00C8,0x58380000,0x58380000,0x58380000,0x58380000,0x58380000, 
-0x58380000,0x38380001,0x38380001,0x38380001,0x2E380001,0x7000C8,0x7000C8,0x7000C8,0x7000C8,0x7000C8,0x7000C8,0x40140001,0x40140001,0x40140001,0x2E240001,0xE400C8,0xE400C8,0xE400C8,0x2C00000D,0x240000CA,0xF63C0014,0x4C00C8,0x4C00C8,0xA6380000,0x7C380000,0x66380000,0x66380000,0x4C380000,0xFC1C0001,0x96280001,0x40340001,0x40140001, -0xA000C8,}; -static const uint32_t g_etc1_to_bc7_m6_table72[] = { -0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0xDC0000, -0xDC0000,0xDC0000,0xDC0000,0x24000000,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0x480001,0xE4C0000,0xE4C0000,0xE4C0000,0x6C0000,0x9C0000,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000, -0xA80000,0x1580000,0x1580000,0x1580000,0x38000000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0x1580000,0x1580000,0x1580000,0x38000000,0x1580000,0x1580000,0x1580000,0x38000000,0x38000000,0x4780000,0x700001,0x700001,0x840000,0x48C0000,0x2980000,0x2980000,0xC00000,0x840000,0x48C0000,0xF00000,0x1580000, -0xF00000,0x940001,0x940001,0x940001,0x940001,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000,0x1C40000,0x4A000000,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000,0x1C40000,0x4A000000,0x1C40000,0x1C40000,0x4A000000,0x4A000000,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000,0x1C40000,0x4A000000,0x1C40000,0x1C40000,0x4A000000,0x4A000000,0x1C40000, -0x1C40000,0x4A000000,0x4A000000,0x4A000000,0x4AC0000,0xA00000,0x940001,0xCC0000,0xFC0000,0x13C0000,0x16C0000,0xBF80000,0xBC0000,0x2DC0000,0x13C0000,0x4A000000,0x13C0000,0xBC0001,0x3180000,0x11FC0000,0x5E000000,0x3180000,0x11FC0000,0x5E000000,0x11FC0000,0x5E000000,0x5E000000,0x3180000,0x11FC0000,0x5E000000,0x11FC0000,0x5E000000, -0x5E000000,0x11FC0000,0x5E000000,0x5E000000,0x5E000000,0x3180000,0x11FC0000,0x5E000000,0x11FC0000,0x5E000000,0x5E000000,0x11FC0000,0x5E000000,0x5E000000,0x5E000000,0x11FC0000,0x5E000000,0x5E000000,0x5E000000,0x5E000000,0x2EC0000,0xCC80000,0xCC80000,0x1400000,0x1D00000,0x37F40000,0x5E000000,0x5E000000,0x3000000,0x1640000,0x55DC0000,0x5E000000, -0x1940000,0x840734,0xAE7002D3,0x707002D4,0x5E7002D3,0xA0600192,0x786000D7,0x6264013B,0x66600192,0x5C5C00D7,0x545C0192,0x984C02D4,0x7A4C00A3,0x625400FD,0x6A4800CF,0x5E4C0002,0x545000D5,0x5E4802D4,0x584000FF,0x5048013A,0x4A4802D5,0xC40734,0x822802D3,0x5E4C02D4,0x76180192,0x5E3000D1,0x54380190,0x700002D3,0x5E1000A2,0x541400D5,0x4A2402D4,0x18C0734, -0x5600031B,0x50000212,0x48000378,0x40000738,0xFE580072,0xF4780434,0xF67C04EC,0xB24C0002,0x84480003,0x6C4C0002,0x5E500011,0x5C440012,0xFE34004E,0xAA380000,0x604000A9,0x541400D5,0x1180734,0x9802D3,0x988400A2,0x6C8400A2,0x5E8400A2,0x8A7400C9,0x6E740002,0x60740015,0x627000C9,0x5C6C0026,0x547000C9,0xE002D3,0x7A4C00A2,0x5E6800A2,0x6E4000C9,0x5E4C0001, -0x545800C8,0x1CC02D3,0x5E0C00A2,0x540000C8,0x4A0002D4,0xE002D3,0x7A4C00A2,0x5E6800A2,0x6E4000C9,0x5E4C0001,0x545800C8,0x1CC02D3,0x5E0C00A2,0x540000C8,0x4A0002D4,0x1CC02D3,0x5E0C00A2,0x540000C8,0x4A0002D4,0x4A0002D4,0xFC60002D,0xFC880173,0xFE8C016B,0xB24C0001,0x84480002,0x6C4C0001,0x5E580002,0x5C3C0006,0xFE3C0033,0xAA380000,0x623800A2,0x540000C8, 
-0x14002D3,0x7002D3,0x7002D3,0x7002D3,0x7002D3,0x806000CA,0x806000CA,0x806000CA,0x585C00CA,0x585C00CA,0x4A5C00CA,0x7A4C00A3,0x7A4C00A3,0x7A4C00A3,0x5C4C0002,0x5C4C0002,0x4C500026,0x4E4C00A3,0x4E4C00A3,0x48480015,0x404800A5,0x2A402D3,0x2A402D3,0x2A402D3,0x622C00C9,0x622C00C9,0x4A4400C8,0x5C1400A2,0x5C1400A2,0x4A240001,0x403000A4,0x15002D3, -0x15002D3,0x480000D4,0x3E0000CD,0x380002D4,0xFC540026,0xFC68015E,0x7002D3,0xB6480001,0x804C0002,0x6A4C0002,0x644C0005,0x58480002,0xFE34000E,0xAA380000,0x5E4400A3,0x4A240001,0xEC02D3,0x8400A2,0x8400A2,0x8400A2,0x8400A2,0x6C740001,0x6C740001,0x6C740001,0x52740001,0x52740001,0x4A700001,0xC400A2,0xC400A2,0xC400A2,0x58540001,0x58540001, -0x4A600000,0x18C00A2,0x18C00A2,0x4A280000,0x400000A4,0xC400A2,0xC400A2,0xC400A2,0x58540001,0x58540001,0x4A600000,0x18C00A2,0x18C00A2,0x4A280000,0x400000A4,0x18C00A2,0x18C00A2,0x4A280000,0x400000A4,0x400000A4,0xFE580001,0xF4780012,0x8400A2,0xB6480000,0x7C500000,0x68500000,0x60540000,0x58480001,0xFE340005,0xAA380000,0x11800A2,0x4A280000, -0x11800A2,0xA800CA,0x84980001,0x68940001,0x5E940001,0xFC00C8,0x6E740001,0x5E800001,0x3F800C8,0x5E440000,0x540000C8,0xFC00C8,0x6E740001,0x5E800001,0x3F800C8,0x5E440000,0x540000C8,0x3F800C8,0x5E440000,0x540000C8,0x540000C8,0xFC00C8,0x6E740001,0x5E800001,0x3F800C8,0x5E440000,0x540000C8,0x3F800C8,0x5E440000,0x540000C8,0x540000C8,0x3F800C8, -0x5E440000,0x540000C8,0x540000C8,0x540000C8,0xF6680019,0x2B400C8,0xF8A00029,0xB24C0000,0x84480001,0x6E440000,0x5E5C0000,0x5E240000,0xF4480012,0xAA380000,0x60640001,0x540000C8,0x16800C8,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5C00CA,0x5E4C0001,0x5E4C0001,0x5E4C0001,0x5E4C0001,0x5E4C0001, -0x5E4C0001,0x42480001,0x42480001,0x42480001,0x38480001,0x8C00C8,0x8C00C8,0x8C00C8,0x8C00C8,0x8C00C8,0x8C00C8,0x48280001,0x48280001,0x48280001,0x38340001,0x11800C8,0x11800C8,0x11800C8,0x38000001,0x2E0000C8,0xF0500019,0x5C00CA,0x5C00CA,0xAA4C0001,0x824C0001,0x6C4C0001,0x6C4C0001,0x544C0001,0xF8300002,0xA8380000,0x4C440000,0x48280001, -0xC800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table73[] = { -0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x10C0000, -0x10C0000,0x10C0000,0x10C0000,0x2C000000,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x580001,0x600000,0x600000,0x600000,0x840000,0xBC0000,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000, -0xC00000,0x1880000,0x1880000,0x1880000,0x40000000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0x1880000,0x1880000,0x1880000,0x40000000,0x1880000,0x1880000,0x1880000,0x40000000,0x40000000,0xC880000,0x800001,0x800001,0x980000,0x4A00000,0xB00000,0xB00000,0xD80000,0x980000,0x4A00000,0x1140000,0x1880000, -0x1140000,0xA40001,0xA40001,0xA40001,0xA40001,0x2F40000,0x2F40000,0x2F40000,0x1F40000,0x1F40000,0x52000000,0x2F40000,0x2F40000,0x2F40000,0x1F40000,0x1F40000,0x52000000,0x1F40000,0x1F40000,0x52000000,0x52000000,0x2F40000,0x2F40000,0x2F40000,0x1F40000,0x1F40000,0x52000000,0x1F40000,0x1F40000,0x52000000,0x52000000,0x1F40000, 
-0x1F40000,0x52000000,0x52000000,0x52000000,0xC00000,0xB00000,0xA40001,0xE00000,0x3140000,0x1600000,0x1940000,0x15FC0000,0xD00000,0x2F40000,0x1600000,0x52000000,0x1600000,0xCC0001,0x3300000,0x1DFC0000,0x66000000,0x3300000,0x1DFC0000,0x66000000,0x1DFC0000,0x66000000,0x66000000,0x3300000,0x1DFC0000,0x66000000,0x1DFC0000,0x66000000, -0x66000000,0x1DFC0000,0x66000000,0x66000000,0x66000000,0x3300000,0x1DFC0000,0x66000000,0x1DFC0000,0x66000000,0x66000000,0x1DFC0000,0x66000000,0x66000000,0x66000000,0x1DFC0000,0x66000000,0x66000000,0x66000000,0x66000000,0x3000000,0xDC0000,0xDC0000,0x1580000,0x1F80000,0x41F40000,0x66000000,0x66000000,0x1180000,0x3800000,0x5DEC0000,0x66000000, -0x1B40000,0x940734,0xB68002D3,0x788002D4,0x668002D3,0xA8700192,0x807000D7,0x6A74013B,0x6E700192,0x646C00D7,0x5C6C0192,0xA05C02D4,0x825C00A3,0x6A6400FD,0x725800CF,0x665C0002,0x5C6000D5,0x665802D4,0x605000FF,0x5858013A,0x525802D5,0xDC0734,0x8A3802D3,0x665C02D4,0x7E280192,0x664000D1,0x5C480190,0x781002D3,0x662000A2,0x5C2400D5,0x523402D4,0x1BC0734, -0x600002F8,0x5A0001E0,0x50000339,0x48000738,0xFE6C0096,0xFC880434,0xFE8C04EC,0xBA5C0002,0x8C580003,0x745C0002,0x66600011,0x64540012,0xFC4C006E,0xB2480000,0x685000A9,0x5C2400D5,0x1380734,0xA802D3,0xA09400A2,0x749400A2,0x669400A2,0x928400C9,0x76840002,0x68840015,0x6A8000C9,0x647C0026,0x5C8000C9,0xF802D3,0x825C00A2,0x667800A2,0x765000C9,0x665C0001, -0x5C6800C8,0x1FC02D3,0x661C00A2,0x5C1000C8,0x520002D4,0xF802D3,0x825C00A2,0x667800A2,0x765000C9,0x665C0001,0x5C6800C8,0x1FC02D3,0x661C00A2,0x5C1000C8,0x520002D4,0x1FC02D3,0x661C00A2,0x5C1000C8,0x520002D4,0x520002D4,0xFC740036,0xF4980189,0xF8A0017A,0xBA5C0001,0x8C580002,0x745C0001,0x66680002,0x644C0006,0xFC50003D,0xB2480000,0x6A4800A2,0x5C1000C8, -0x16402D3,0x8002D3,0x8002D3,0x8002D3,0x8002D3,0x887000CA,0x887000CA,0x887000CA,0x606C00CA,0x606C00CA,0x526C00CA,0x825C00A3,0x825C00A3,0x825C00A3,0x645C0002,0x645C0002,0x54600026,0x565C00A3,0x565C00A3,0x50580015,0x485800A5,0x2BC02D3,0x2BC02D3,0x2BC02D3,0x6A3C00C9,0x6A3C00C9,0x525400C8,0x642400A2,0x642400A2,0x52340001,0x484000A4,0x18002D3, -0x18002D3,0x520000CA,0x480000B4,0x400002D4,0xFC64002D,0xF4780173,0x8002D3,0xBE580001,0x885C0002,0x725C0002,0x6C5C0005,0x60580002,0xFE440019,0xB2480000,0x665400A3,0x52340001,0x11002D3,0x9400A2,0x9400A2,0x9400A2,0x9400A2,0x74840001,0x74840001,0x74840001,0x5A840001,0x5A840001,0x52800001,0xDC00A2,0xDC00A2,0xDC00A2,0x60640001,0x60640001, -0x52700000,0x1BC00A2,0x1BC00A2,0x52380000,0x480000A4,0xDC00A2,0xDC00A2,0xDC00A2,0x60640001,0x60640001,0x52700000,0x1BC00A2,0x1BC00A2,0x52380000,0x480000A4,0x1BC00A2,0x1BC00A2,0x52380000,0x480000A4,0x480000A4,0xFA6C0002,0xFC880012,0x9400A2,0xBE580000,0x84600000,0x70600000,0x68640000,0x60580001,0xFA48000A,0xB2480000,0x13800A2,0x52380000, -0x13800A2,0xB800CA,0x8CA80001,0x70A40001,0x66A40001,0x11400C8,0x76840001,0x66900001,0xFF800C8,0x66540000,0x5C0000C8,0x11400C8,0x76840001,0x66900001,0xFF800C8,0x66540000,0x5C0000C8,0xFF800C8,0x66540000,0x5C0000C8,0x5C0000C8,0x11400C8,0x76840001,0x66900001,0xFF800C8,0x66540000,0x5C0000C8,0xFF800C8,0x66540000,0x5C0000C8,0x5C0000C8,0xFF800C8, -0x66540000,0x5C0000C8,0x5C0000C8,0x5C0000C8,0xFE780019,0xAC400C8,0xFEAC002D,0xBA5C0000,0x8C580001,0x76540000,0x666C0000,0x66340000,0xFC580012,0xB2480000,0x68740001,0x5C0000C8,0x18C00C8,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x6C00CA,0x665C0001,0x665C0001,0x665C0001,0x665C0001,0x665C0001, 
-0x665C0001,0x4A580001,0x4A580001,0x4A580001,0x40580001,0xA400C8,0xA400C8,0xA400C8,0xA400C8,0xA400C8,0xA400C8,0x50380001,0x50380001,0x50380001,0x40440001,0x14C00C8,0x14C00C8,0x14C00C8,0x40080000,0x360000C8,0xF8600019,0x6C00CA,0x6C00CA,0xB25C0001,0x8A5C0001,0x745C0001,0x745C0001,0x5C5C0001,0xF4440005,0xB0480000,0x54540000,0x50380001, -0xE800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table74[] = { -0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x13C0000, -0x13C0000,0x13C0000,0x13C0000,0x34000000,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x700000,0x700000,0x700000,0x9C0000,0xE00000,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000, -0xD80000,0x1B80000,0x1B80000,0x1B80000,0x48000000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0x1B80000,0x1B80000,0x1B80000,0x48000000,0x1B80000,0x1B80000,0x1B80000,0x48000000,0x48000000,0x9C0000,0x900001,0x900001,0x2A80000,0x4B40000,0x2C40000,0x2C40000,0xF40000,0x2A80000,0x4B40000,0x1340000,0x1B80000, -0x1340000,0xB40001,0xB40001,0xB40001,0xB40001,0x30C0000,0x30C0000,0x30C0000,0xBFC0000,0xBFC0000,0x5A000000,0x30C0000,0x30C0000,0x30C0000,0xBFC0000,0xBFC0000,0x5A000000,0xBFC0000,0xBFC0000,0x5A000000,0x5A000000,0x30C0000,0x30C0000,0x30C0000,0xBFC0000,0xBFC0000,0x5A000000,0xBFC0000,0xBFC0000,0x5A000000,0x5A000000,0xBFC0000, -0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0xD40000,0x8C00000,0xB40001,0xF80000,0x1300000,0x1800000,0x1BC0000,0x21F40000,0xE40000,0x30C0000,0x1800000,0x5A000000,0x1800000,0xDC0001,0x1480000,0x29FC0000,0x6E000000,0x1480000,0x29FC0000,0x6E000000,0x29FC0000,0x6E000000,0x6E000000,0x1480000,0x29FC0000,0x6E000000,0x29FC0000,0x6E000000, -0x6E000000,0x29FC0000,0x6E000000,0x6E000000,0x6E000000,0x1480000,0x29FC0000,0x6E000000,0x29FC0000,0x6E000000,0x6E000000,0x29FC0000,0x6E000000,0x6E000000,0x6E000000,0x29FC0000,0x6E000000,0x6E000000,0x6E000000,0x6E000000,0x3140000,0xEC0000,0xEC0000,0x1740000,0xDFC0000,0x4BF40000,0x6E000000,0x6E000000,0x32C0000,0x1A00000,0x65FC0000,0x6E000000, -0x1D80000,0xA40734,0xBE9002D3,0x809002D4,0x6E9002D3,0xB0800192,0x888000D7,0x7284013B,0x76800192,0x6C7C00D7,0x647C0192,0xA86C02D4,0x8A6C00A3,0x727400FD,0x7A6800CF,0x6E6C0002,0x647000D5,0x6E6802D4,0x686000FF,0x6068013A,0x5A6802D5,0xF40734,0x924802D3,0x6E6C02D4,0x86380192,0x6E5000D1,0x64580190,0x802002D3,0x6E3000A2,0x643400D5,0x5A4402D4,0x1F00734, -0x6C0002D8,0x600001B8,0x5A000314,0x50000738,0xFC7C00BA,0xF498046A,0xF8A00513,0xC26C0002,0x94680003,0x7C6C0002,0x6E700011,0x6C640012,0xFE5C0082,0xBA580000,0x706000A9,0x643400D5,0x15C0734,0xB802D3,0xA8A400A2,0x7CA400A2,0x6EA400A2,0x9A9400C9,0x7E940002,0x70940015,0x729000C9,0x6C8C0026,0x649000C9,0x11002D3,0x8A6C00A2,0x6E8800A2,0x7E6000C9,0x6E6C0001, -0x647800C8,0xDFC02D3,0x6E2C00A2,0x642000C8,0x5A0002D4,0x11002D3,0x8A6C00A2,0x6E8800A2,0x7E6000C9,0x6E6C0001,0x647800C8,0xDFC02D3,0x6E2C00A2,0x642000C8,0x5A0002D4,0xDFC02D3,0x6E2C00A2,0x642000C8,0x5A0002D4,0x5A0002D4,0xFE800049,0xFCA80189,0xFEAC0186,0xC26C0001,0x94680002,0x7C6C0001,0x6E780002,0x6C5C0006,0xFE600051,0xBA580000,0x725800A2,0x642000C8, 
-0x18802D3,0x9002D3,0x9002D3,0x9002D3,0x9002D3,0x908000CA,0x908000CA,0x908000CA,0x687C00CA,0x687C00CA,0x5A7C00CA,0x8A6C00A3,0x8A6C00A3,0x8A6C00A3,0x6C6C0002,0x6C6C0002,0x5C700026,0x5E6C00A3,0x5E6C00A3,0x58680015,0x506800A5,0xD402D3,0xD402D3,0xD402D3,0x724C00C9,0x724C00C9,0x5A6400C8,0x6C3400A2,0x6C3400A2,0x5A440001,0x505000A4,0x1B002D3, -0x1B002D3,0x5A0C00C8,0x500000A5,0x480002D4,0xFE74003B,0xFC880173,0x9002D3,0xC6680001,0x906C0002,0x7A6C0002,0x746C0005,0x68680002,0xFE580021,0xBA580000,0x6E6400A3,0x5A440001,0x13002D3,0xA400A2,0xA400A2,0xA400A2,0xA400A2,0x7C940001,0x7C940001,0x7C940001,0x62940001,0x62940001,0x5A900001,0xF400A2,0xF400A2,0xF400A2,0x68740001,0x68740001, -0x5A800000,0x1F000A2,0x1F000A2,0x5A480000,0x500000A4,0xF400A2,0xF400A2,0xF400A2,0x68740001,0x68740001,0x5A800000,0x1F000A2,0x1F000A2,0x5A480000,0x500000A4,0x1F000A2,0x1F000A2,0x5A480000,0x500000A4,0x500000A4,0xF6800005,0xF4980019,0xA400A2,0xC6680000,0x8C700000,0x78700000,0x70740000,0x68680001,0xF260000D,0xBA580000,0x15C00A2,0x5A480000, -0x15C00A2,0xC800CA,0x94B80001,0x78B40001,0x6EB40001,0x12C00C8,0x7E940001,0x6EA00001,0x1BF800C8,0x6E640000,0x640000C8,0x12C00C8,0x7E940001,0x6EA00001,0x1BF800C8,0x6E640000,0x640000C8,0x1BF800C8,0x6E640000,0x640000C8,0x640000C8,0x12C00C8,0x7E940001,0x6EA00001,0x1BF800C8,0x6E640000,0x640000C8,0x1BF800C8,0x6E640000,0x640000C8,0x640000C8,0x1BF800C8, -0x6E640000,0x640000C8,0x640000C8,0x640000C8,0xFA8C0020,0xD800C8,0xF8C00032,0xC26C0000,0x94680001,0x7E640000,0x6E7C0000,0x6E440000,0xFC6C0019,0xBA580000,0x70840001,0x640000C8,0x1AC00C8,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x7C00CA,0x6E6C0001,0x6E6C0001,0x6E6C0001,0x6E6C0001,0x6E6C0001, -0x6E6C0001,0x52680001,0x52680001,0x52680001,0x48680001,0xBC00C8,0xBC00C8,0xBC00C8,0xBC00C8,0xBC00C8,0xBC00C8,0x58480001,0x58480001,0x58480001,0x48540001,0x17C00C8,0x17C00C8,0x17C00C8,0x48180000,0x3E0000C8,0xF0700022,0x7C00CA,0x7C00CA,0xBA6C0001,0x926C0001,0x7C6C0001,0x7C6C0001,0x646C0001,0xFC540005,0xB8580000,0x5C640000,0x58480001, -0x10C00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table75[] = { -0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x1700000, -0x1700000,0x1700000,0x1700000,0x3C000000,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x8800000,0x8800000,0x8800000,0xB40000,0x1000000,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000, -0xF00000,0x1E80000,0x1E80000,0x1E80000,0x50000000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0x1E80000,0x1E80000,0x1E80000,0x50000000,0x1E80000,0x1E80000,0x1E80000,0x50000000,0x50000000,0xAC0000,0xA00001,0xA00001,0xBC0000,0xCC0000,0xDC0000,0xDC0000,0x1100000,0xBC0000,0xCC0000,0x1580000,0x1E80000, -0x1580000,0xC40001,0xC40001,0xC40001,0xC40001,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x62000000,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x62000000,0x17FC0000,0x17FC0000,0x62000000,0x62000000,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x62000000,0x17FC0000,0x17FC0000,0x62000000,0x62000000,0x17FC0000, 
-0x17FC0000,0x62000000,0x62000000,0x62000000,0x4E40000,0xD40000,0xC40001,0x10C0000,0x14C0000,0x1A40000,0x1E40000,0x2BF80000,0xF80000,0x3240000,0x1A40000,0x62000000,0x1A40000,0xEC0001,0x1600000,0x35FC0000,0x76000000,0x1600000,0x35FC0000,0x76000000,0x35FC0000,0x76000000,0x76000000,0x1600000,0x35FC0000,0x76000000,0x35FC0000,0x76000000, -0x76000000,0x35FC0000,0x76000000,0x76000000,0x76000000,0x1600000,0x35FC0000,0x76000000,0x35FC0000,0x76000000,0x76000000,0x35FC0000,0x76000000,0x76000000,0x76000000,0x35FC0000,0x76000000,0x76000000,0x76000000,0x76000000,0x3280000,0x6FC0000,0x6FC0000,0x1900000,0x1BFC0000,0x55F40000,0x76000000,0x76000000,0x1440000,0x1C00000,0x6FD00000,0x76000000, -0x1F80000,0xB40734,0xC6A002D3,0x88A002D4,0x76A002D3,0xB8900192,0x909000D7,0x7A94013B,0x7E900192,0x748C00D7,0x6C8C0192,0xB07C02D4,0x927C00A3,0x7A8400FD,0x827800CF,0x767C0002,0x6C8000D5,0x767802D4,0x707000FF,0x6878013A,0x627802D5,0x10C0734,0x9A5802D3,0x767C02D4,0x8E480192,0x766000D1,0x6C680190,0x883002D3,0x764000A2,0x6C4400D5,0x625402D4,0xBF80734, -0x760002D3,0x6C000198,0x600002F8,0x58000738,0xFE8C00D4,0xFCA8046A,0xFEAC0517,0xCA7C0002,0x9C780003,0x847C0002,0x76800011,0x74740012,0xFE7000A7,0xC2680000,0x787000A9,0x6C4400D5,0x17C0734,0xC802D3,0xB0B400A2,0x84B400A2,0x76B400A2,0xA2A400C9,0x86A40002,0x78A40015,0x7AA000C9,0x749C0026,0x6CA000C9,0x12802D3,0x927C00A2,0x769800A2,0x867000C9,0x767C0001, -0x6C8800C8,0x19FC02D3,0x763C00A2,0x6C3000C8,0x620002D4,0x12802D3,0x927C00A2,0x769800A2,0x867000C9,0x767C0001,0x6C8800C8,0x19FC02D3,0x763C00A2,0x6C3000C8,0x620002D4,0x19FC02D3,0x763C00A2,0x6C3000C8,0x620002D4,0x620002D4,0xFE940050,0xF4B801A3,0xF8C00193,0xCA7C0001,0x9C780002,0x847C0001,0x76880002,0x746C0006,0xFE780056,0xC2680000,0x7A6800A2,0x6C3000C8, -0x1A802D3,0xA002D3,0xA002D3,0xA002D3,0xA002D3,0x989000CA,0x989000CA,0x989000CA,0x708C00CA,0x708C00CA,0x628C00CA,0x927C00A3,0x927C00A3,0x927C00A3,0x747C0002,0x747C0002,0x64800026,0x667C00A3,0x667C00A3,0x60780015,0x587800A5,0xEC02D3,0xEC02D3,0xEC02D3,0x7A5C00C9,0x7A5C00C9,0x627400C8,0x744400A2,0x744400A2,0x62540001,0x586000A4,0x1E402D3, -0x1E402D3,0x621C00C8,0x580800A4,0x500002D4,0xFE840046,0xF498018A,0xA002D3,0xCE780001,0x987C0002,0x827C0002,0x7C7C0005,0x70780002,0xF86C0032,0xC2680000,0x767400A3,0x62540001,0x15402D3,0xB400A2,0xB400A2,0xB400A2,0xB400A2,0x84A40001,0x84A40001,0x84A40001,0x6AA40001,0x6AA40001,0x62A00001,0x10C00A2,0x10C00A2,0x10C00A2,0x70840001,0x70840001, -0x62900000,0xBF800A2,0xBF800A2,0x62580000,0x580000A4,0x10C00A2,0x10C00A2,0x10C00A2,0x70840001,0x70840001,0x62900000,0xBF800A2,0xBF800A2,0x62580000,0x580000A4,0xBF800A2,0xBF800A2,0x62580000,0x580000A4,0x580000A4,0xFE900005,0xFCA80019,0xB400A2,0xCE780000,0x94800000,0x80800000,0x78840000,0x70780001,0xFA70000D,0xC2680000,0x17C00A2,0x62580000, -0x17C00A2,0xD800CA,0x9CC80001,0x80C40001,0x76C40001,0x14400C8,0x86A40001,0x76B00001,0x27F800C8,0x76740000,0x6C0000C8,0x14400C8,0x86A40001,0x76B00001,0x27F800C8,0x76740000,0x6C0000C8,0x27F800C8,0x76740000,0x6C0000C8,0x6C0000C8,0x14400C8,0x86A40001,0x76B00001,0x27F800C8,0x76740000,0x6C0000C8,0x27F800C8,0x76740000,0x6C0000C8,0x6C0000C8,0x27F800C8, -0x76740000,0x6C0000C8,0x6C0000C8,0x6C0000C8,0xF4A00029,0xE800C8,0xFECC003A,0xCA7C0000,0x9C780001,0x86740000,0x768C0000,0x76540000,0xF8840020,0xC2680000,0x78940001,0x6C0000C8,0x1D000C8,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x8C00CA,0x767C0001,0x767C0001,0x767C0001,0x767C0001,0x767C0001, 
-0x767C0001,0x5A780001,0x5A780001,0x5A780001,0x50780001,0x2D000C8,0x2D000C8,0x2D000C8,0x2D000C8,0x2D000C8,0x2D000C8,0x60580001,0x60580001,0x60580001,0x50640001,0x1AC00C8,0x1AC00C8,0x1AC00C8,0x50280000,0x460000C8,0xF8800022,0x8C00CA,0x8C00CA,0xC27C0001,0x9A7C0001,0x847C0001,0x847C0001,0x6C7C0001,0xFC640008,0xC0680000,0x64740000,0x60580001, -0x12C00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table76[] = { -0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0xD00000,0xD00000,0xD00000,0xD00000,0xD00000,0xD00000,0xD00000,0xD00000,0xD00000,0xD00000,0x1A40000, -0x1A40000,0x1A40000,0x1A40000,0x44000001,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x2940000,0x2940000,0x2940000,0xD00000,0x1280000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000, -0x10C0000,0xBF80000,0xBF80000,0xBF80000,0x58000001,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0xBF80000,0xBF80000,0xBF80000,0x58000001,0xBF80000,0xBF80000,0xBF80000,0x58000001,0x58000001,0xC00000,0xB40000,0xB40000,0xD00000,0x2E00000,0xF40000,0xF40000,0x32C0000,0xD00000,0x2E00000,0x17C0000,0xBF80000, -0x17C0000,0xD80000,0xD80000,0xD80000,0xD80000,0x1400000,0x1400000,0x1400000,0x25F80000,0x25F80000,0x6A000001,0x1400000,0x1400000,0x1400000,0x25F80000,0x25F80000,0x6A000001,0x25F80000,0x25F80000,0x6A000001,0x6A000001,0x1400000,0x1400000,0x1400000,0x25F80000,0x25F80000,0x6A000001,0x25F80000,0x25F80000,0x6A000001,0x6A000001,0x25F80000, -0x25F80000,0x6A000001,0x6A000001,0x6A000001,0xFC0000,0xAE40000,0xD80000,0x3240000,0x16C0000,0x1CC0000,0x9F80000,0x37FC0000,0x50C0000,0x1400000,0x1CC0000,0x6A000001,0x1CC0000,0x1000000,0x17C0000,0x43F80000,0x7E000001,0x17C0000,0x43F80000,0x7E000001,0x43F80000,0x7E000001,0x7E000001,0x17C0000,0x43F80000,0x7E000001,0x43F80000,0x7E000001, -0x7E000001,0x43F80000,0x7E000001,0x7E000001,0x7E000001,0x17C0000,0x43F80000,0x7E000001,0x43F80000,0x7E000001,0x7E000001,0x43F80000,0x7E000001,0x7E000001,0x7E000001,0x43F80000,0x7E000001,0x7E000001,0x7E000001,0x7E000001,0x1400000,0x1100000,0x1100000,0x3AC0000,0x29FC0000,0x61F00000,0x7E000001,0x7E000001,0x15C0000,0x1E00000,0x79C40000,0x7E000001, -0x11FC0000,0xC40738,0xCCB402D4,0x92B402D4,0x7EB402D5,0xC2A00190,0x9AA000D5,0x80A8013A,0x88A00190,0x7CA000D5,0x74A00192,0xBC8C02D3,0x9C8C00A2,0x849800FF,0x8C8C00D1,0x7E8C0002,0x749000D7,0x7E8C02D4,0x7A8400FD,0x7284013B,0x6C8C02D3,0x3240734,0xA46802D3,0x7E8C02D4,0x965C0192,0x807400CF,0x747C0192,0x904402D3,0x7E5400A3,0x745400D7,0x6C6402D4,0x17FC0734, -0x7E1402D4,0x74080192,0x6C0002DC,0x62000734,0xFEA000FE,0xF6BC04A0,0xF8C0053C,0xD48C0000,0xA68C0001,0x8C8C0002,0x80900012,0x7C8C0011,0xFE8800DC,0xC87C0002,0x828000A9,0x745400D7,0x1A40734,0xD802D4,0xB8C800A4,0x8CC800A4,0x7EC400A5,0xAEB400C8,0x90B40001,0x80B80015,0x82B400C8,0x7CB00026,0x74B400CA,0x14402D3,0x9A9000A2,0x7EAC00A3,0x8E8400C9,0x7E8C0002, -0x769800CA,0x27F802D3,0x7E5400A3,0x744400CA,0x6C0002D3,0x14402D3,0x9A9000A2,0x7EAC00A3,0x8E8400C9,0x7E8C0002,0x769800CA,0x27F802D3,0x7E5400A3,0x744400CA,0x6C0002D3,0x27F802D3,0x7E5400A3,0x744400CA,0x6C0002D3,0x6C0002D3,0xFAAC0065,0xFECC01A0,0xF2D401AD,0xD48C0000,0xA68C0001,0x8C8C0002,0x80980002,0x7E800005,0xFA900071,0xC87C0001,0x847800A2,0x744400CA, 
-0x1D002D3,0xB402D4,0xB402D4,0xB402D4,0xB402D4,0xA4A000C8,0xA4A000C8,0xA4A000C8,0x78A000C8,0x78A000C8,0x6AA000C9,0x9C8C00A2,0x9C8C00A2,0x9C8C00A2,0x7E8C0001,0x7E8C0001,0x6E900026,0x708C00A2,0x708C00A2,0x6A880015,0x628C00A2,0x10802D3,0x10802D3,0x10802D3,0x846C00C9,0x846C00C9,0x6C8400C9,0x7E5400A2,0x7E5400A2,0x6A680002,0x627000A2,0x9F802D3, -0x9F802D3,0x6A3000C9,0x621400A2,0x580002D3,0xFC9C005A,0xFEAC0189,0xB402D4,0xD48C0000,0xA28C0001,0x8C8C0001,0x86900006,0x788C0002,0xFE80003E,0xC87C0001,0x808400A2,0x6A680002,0x17802D3,0xC400A4,0xC400A4,0xC400A4,0xC400A4,0x90B40000,0x90B40000,0x90B40000,0x74B40000,0x74B40000,0x6AB40001,0x32400A2,0x32400A2,0x32400A2,0x7A940001,0x7A940001, -0x6AA40001,0x17FC00A2,0x17FC00A2,0x6A6C0001,0x620000A2,0x32400A2,0x32400A2,0x32400A2,0x7A940001,0x7A940001,0x6AA40001,0x17FC00A2,0x17FC00A2,0x6A6C0001,0x620000A2,0x17FC00A2,0x17FC00A2,0x6A6C0001,0x620000A2,0x620000A2,0xFEA0000A,0xF6BC0020,0xC400A4,0xD48C0000,0xA0900000,0x8A900000,0x80980001,0x7A880000,0xF8880012,0xC87C0000,0x1A400A2,0x6A6C0001, -0x1A400A2,0xEC00C8,0xA8D80000,0x88D80001,0x7ED80001,0x35C00C8,0x90B40001,0x7EC40001,0x33FC00C8,0x7E880001,0x740000CA,0x35C00C8,0x90B40001,0x7EC40001,0x33FC00C8,0x7E880001,0x740000CA,0x33FC00C8,0x7E880001,0x740000CA,0x740000CA,0x35C00C8,0x90B40001,0x7EC40001,0x33FC00C8,0x7E880001,0x740000CA,0x33FC00C8,0x7E880001,0x740000CA,0x740000CA,0x33FC00C8, -0x7E880001,0x740000CA,0x740000CA,0x740000CA,0xFEB40029,0xFC00C8,0xFAE4003D,0xD0900000,0xA6880001,0x8E880000,0x7EA00001,0x7E6C0001,0xFE980020,0xCA7C0000,0x82A40000,0x740000CA,0x1F400C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0xA000C8,0x828C0000,0x828C0000,0x828C0000,0x828C0000,0x828C0000, -0x828C0000,0x628C0001,0x628C0001,0x628C0001,0x588C0001,0xEC00C8,0xEC00C8,0xEC00C8,0xEC00C8,0xEC00C8,0xEC00C8,0x6A680001,0x6A680001,0x6A680001,0x58780001,0x1E400C8,0x1E400C8,0x1E400C8,0x583C0001,0x4E0000CA,0xF2940029,0xA000C8,0xA000C8,0xD08C0000,0xA68C0000,0x908C0000,0x908C0000,0x768C0000,0xF47C000D,0xC07C0001,0x6A880001,0x6A680001, -0x15400C8,}; -static const uint32_t g_etc1_to_bc7_m6_table77[] = { -0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0x1D80000, -0x1D80000,0x1D80000,0x1D80000,0x4C000001,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0xAA40000,0xAA40000,0xAA40000,0xE80000,0x14C0000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000, -0x1240000,0x17F80000,0x17F80000,0x17F80000,0x60000001,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x17F80000,0x17F80000,0x17F80000,0x60000001,0x17F80000,0x17F80000,0x17F80000,0x60000001,0x60000001,0xD00000,0xC40000,0xC40000,0xE40000,0x2F40000,0x3080000,0x3080000,0x1480000,0xE40000,0x2F40000,0x1A00000,0x17F80000, -0x1A00000,0xE80000,0xE80000,0xE80000,0xE80000,0x1580000,0x1580000,0x1580000,0x31F80000,0x31F80000,0x72000001,0x1580000,0x1580000,0x1580000,0x31F80000,0x31F80000,0x72000001,0x31F80000,0x31F80000,0x72000001,0x72000001,0x1580000,0x1580000,0x1580000,0x31F80000,0x31F80000,0x72000001,0x31F80000,0x31F80000,0x72000001,0x72000001,0x31F80000, 
-0x31F80000,0x72000001,0x72000001,0x72000001,0x30C0000,0xF80000,0xE80000,0x13C0000,0x1840000,0x1EC0000,0x15FC0000,0x43F40000,0x5200000,0x1580000,0x1EC0000,0x72000001,0x1EC0000,0x1100000,0x1940000,0x4FF80000,0x86000001,0x1940000,0x4FF80000,0x86000001,0x4FF80000,0x86000001,0x86000001,0x1940000,0x4FF80000,0x86000001,0x4FF80000,0x86000001, -0x86000001,0x4FF80000,0x86000001,0x86000001,0x86000001,0x1940000,0x4FF80000,0x86000001,0x4FF80000,0x86000001,0x86000001,0x4FF80000,0x86000001,0x86000001,0x86000001,0x4FF80000,0x86000001,0x86000001,0x86000001,0x86000001,0x1540000,0x9200000,0x9200000,0x1C80000,0x37FC0000,0x6BF00000,0x86000001,0x86000001,0x3700000,0x3FC0000,0x81D40000,0x86000001, -0x1FFC0000,0xD40738,0xD4C402D4,0x9AC402D4,0x86C402D5,0xCAB00190,0xA2B000D5,0x88B8013A,0x90B00190,0x84B000D5,0x7CB00192,0xC49C02D3,0xA49C00A2,0x8CA800FF,0x949C00D1,0x869C0002,0x7CA000D7,0x869C02D4,0x829400FD,0x7A94013B,0x749C02D3,0x33C0734,0xAC7802D3,0x869C02D4,0x9E6C0192,0x888400CF,0x7C8C0192,0x985402D3,0x866400A3,0x7C6400D7,0x747402D4,0x23FC0734, -0x862402D4,0x7C180192,0x740002D4,0x6A000734,0xFEB4012D,0xFECC04A0,0xFECC0554,0xDC9C0000,0xAE9C0001,0x949C0002,0x88A00012,0x849C0011,0xFE9800FA,0xD08C0002,0x8A9000A9,0x7C6400D7,0x1C80734,0xE802D4,0xC0D800A4,0x94D800A4,0x86D400A5,0xB6C400C8,0x98C40001,0x88C80015,0x8AC400C8,0x84C00026,0x7CC400CA,0x15C02D3,0xA2A000A2,0x86BC00A3,0x969400C9,0x869C0002, -0x7EA800CA,0x33F802D3,0x866400A3,0x7C5400CA,0x740002D3,0x15C02D3,0xA2A000A2,0x86BC00A3,0x969400C9,0x869C0002,0x7EA800CA,0x33F802D3,0x866400A3,0x7C5400CA,0x740002D3,0x33F802D3,0x866400A3,0x7C5400CA,0x740002D3,0x740002D3,0xFEBC0076,0xF6DC01BA,0xFAE401AD,0xDC9C0000,0xAE9C0001,0x949C0002,0x88A80002,0x86900005,0xFEA40081,0xD08C0001,0x8C8800A2,0x7C5400CA, -0x1F002D3,0xC402D4,0xC402D4,0xC402D4,0xC402D4,0xACB000C8,0xACB000C8,0xACB000C8,0x80B000C8,0x80B000C8,0x72B000C9,0xA49C00A2,0xA49C00A2,0xA49C00A2,0x869C0001,0x869C0001,0x76A00026,0x789C00A2,0x789C00A2,0x72980015,0x6A9C00A2,0x12002D3,0x12002D3,0x12002D3,0x8C7C00C9,0x8C7C00C9,0x749400C9,0x866400A2,0x866400A2,0x72780002,0x6A8000A2,0x15F802D3, -0x15F802D3,0x724000C9,0x6A2400A2,0x600002D3,0xFEAC0065,0xF6BC01A0,0xC402D4,0xDC9C0000,0xAA9C0001,0x949C0001,0x8EA00006,0x809C0002,0xFE90004A,0xD08C0001,0x889400A2,0x72780002,0x19C02D3,0xD400A4,0xD400A4,0xD400A4,0xD400A4,0x98C40000,0x98C40000,0x98C40000,0x7CC40000,0x7CC40000,0x72C40001,0x33C00A2,0x33C00A2,0x33C00A2,0x82A40001,0x82A40001, -0x72B40001,0x23FC00A2,0x23FC00A2,0x727C0001,0x6A0000A2,0x33C00A2,0x33C00A2,0x33C00A2,0x82A40001,0x82A40001,0x72B40001,0x23FC00A2,0x23FC00A2,0x727C0001,0x6A0000A2,0x23FC00A2,0x23FC00A2,0x727C0001,0x6A0000A2,0x6A0000A2,0xFAB4000D,0xFECC0020,0xD400A4,0xDC9C0000,0xA8A00000,0x92A00000,0x88A80001,0x82980000,0xF29C0019,0xD08C0000,0x1C800A2,0x727C0001, -0x1C800A2,0xFC00C8,0xB0E80000,0x90E80001,0x86E80001,0x37400C8,0x98C40001,0x86D40001,0x3FFC00C8,0x86980001,0x7C0000CA,0x37400C8,0x98C40001,0x86D40001,0x3FFC00C8,0x86980001,0x7C0000CA,0x3FFC00C8,0x86980001,0x7C0000CA,0x7C0000CA,0x37400C8,0x98C40001,0x86D40001,0x3FFC00C8,0x86980001,0x7C0000CA,0x3FFC00C8,0x86980001,0x7C0000CA,0x7C0000CA,0x3FFC00C8, -0x86980001,0x7C0000CA,0x7C0000CA,0x7C0000CA,0xFAC80032,0x10C00C8,0xF2F40048,0xD8A00000,0xAE980001,0x96980000,0x86B00001,0x867C0001,0xFEAC0029,0xD28C0000,0x8AB40000,0x7C0000CA,0xDFC00C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0xB000C8,0x8A9C0000,0x8A9C0000,0x8A9C0000,0x8A9C0000,0x8A9C0000, 
-0x8A9C0000,0x6A9C0001,0x6A9C0001,0x6A9C0001,0x609C0001,0x10400C8,0x10400C8,0x10400C8,0x10400C8,0x10400C8,0x10400C8,0x72780001,0x72780001,0x72780001,0x60880001,0x7FC00C8,0x7FC00C8,0x7FC00C8,0x604C0001,0x560000CA,0xFAA40029,0xB000C8,0xB000C8,0xD89C0000,0xAE9C0000,0x989C0000,0x989C0000,0x7E9C0000,0xFC8C000D,0xC88C0001,0x72980001,0x72780001, -0x17400C8,}; -static const uint32_t g_etc1_to_bc7_m6_table78[] = { -0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x5F80000, -0x5F80000,0x5F80000,0x5F80000,0x54000001,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xB80000,0xB80000,0xB80000,0x1000000,0x16C0000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000, -0x13C0000,0x21FC0000,0x21FC0000,0x21FC0000,0x68000001,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x21FC0000,0x21FC0000,0x21FC0000,0x68000001,0x21FC0000,0x21FC0000,0x21FC0000,0x68000001,0x68000001,0x8E00000,0xD40000,0xD40000,0x4F40000,0x3080000,0x1200000,0x1200000,0x1640000,0x4F40000,0x3080000,0x1C00000,0x21FC0000, -0x1C00000,0xF80000,0xF80000,0xF80000,0xF80000,0x1700000,0x1700000,0x1700000,0x3DF80000,0x3DF80000,0x7A000001,0x1700000,0x1700000,0x1700000,0x3DF80000,0x3DF80000,0x7A000001,0x3DF80000,0x3DF80000,0x7A000001,0x7A000001,0x1700000,0x1700000,0x1700000,0x3DF80000,0x3DF80000,0x7A000001,0x3DF80000,0x3DF80000,0x7A000001,0x7A000001,0x3DF80000, -0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x1200000,0x1080000,0xF80000,0x1500000,0x1A00000,0x9FC0000,0x23FC0000,0x4DFC0000,0x5340000,0x1700000,0x9FC0000,0x7A000001,0x9FC0000,0x1200000,0x1AC0000,0x5BF80000,0x8E000001,0x1AC0000,0x5BF80000,0x8E000001,0x5BF80000,0x8E000001,0x8E000001,0x1AC0000,0x5BF80000,0x8E000001,0x5BF80000,0x8E000001, -0x8E000001,0x5BF80000,0x8E000001,0x8E000001,0x8E000001,0x1AC0000,0x5BF80000,0x8E000001,0x5BF80000,0x8E000001,0x8E000001,0x5BF80000,0x8E000001,0x8E000001,0x8E000001,0x5BF80000,0x8E000001,0x8E000001,0x8E000001,0x8E000001,0x1680000,0x1340000,0x1340000,0x1E40000,0x45FC0000,0x75F00000,0x8E000001,0x8E000001,0x1880000,0x13FC0000,0x89E40000,0x8E000001, -0x2FFC0000,0xE40738,0xDCD402D4,0xA2D402D4,0x8ED402D5,0xD2C00190,0xAAC000D5,0x90C8013A,0x98C00190,0x8CC000D5,0x84C00192,0xCCAC02D3,0xACAC00A2,0x94B800FF,0x9CAC00D1,0x8EAC0002,0x84B000D7,0x8EAC02D4,0x8AA400FD,0x82A4013B,0x7CAC02D3,0x1540734,0xB48802D3,0x8EAC02D4,0xA67C0192,0x909400CF,0x849C0192,0xA06402D3,0x8E7400A3,0x847400D7,0x7C8402D4,0x2FFC0734, -0x8E3402D4,0x84280192,0x7C0C02D3,0x72000734,0xFEC40168,0xF6DC04DA,0xF8E00569,0xE4AC0000,0xB6AC0001,0x9CAC0002,0x90B00012,0x8CAC0011,0xFCB0013D,0xD89C0002,0x92A000A9,0x847400D7,0x1E80734,0xF802D4,0xC8E800A4,0x9CE800A4,0x8EE400A5,0xBED400C8,0xA0D40001,0x90D80015,0x92D400C8,0x8CD00026,0x84D400CA,0x17402D3,0xAAB000A2,0x8ECC00A3,0x9EA400C9,0x8EAC0002, -0x86B800CA,0x3FF802D3,0x8E7400A3,0x846400CA,0x7C0002D3,0x17402D3,0xAAB000A2,0x8ECC00A3,0x9EA400C9,0x8EAC0002,0x86B800CA,0x3FF802D3,0x8E7400A3,0x846400CA,0x7C0002D3,0x3FF802D3,0x8E7400A3,0x846400CA,0x7C0002D3,0x7C0002D3,0xFED00083,0xFEEC01BA,0xF2F401C8,0xE4AC0000,0xB6AC0001,0x9CAC0002,0x90B80002,0x8EA00005,0xFEB40095,0xD89C0001,0x949800A2,0x846400CA, 
-0xBFC02D3,0xD402D4,0xD402D4,0xD402D4,0xD402D4,0xB4C000C8,0xB4C000C8,0xB4C000C8,0x88C000C8,0x88C000C8,0x7AC000C9,0xACAC00A2,0xACAC00A2,0xACAC00A2,0x8EAC0001,0x8EAC0001,0x7EB00026,0x80AC00A2,0x80AC00A2,0x7AA80015,0x72AC00A2,0x13802D3,0x13802D3,0x13802D3,0x948C00C9,0x948C00C9,0x7CA400C9,0x8E7400A2,0x8E7400A2,0x7A880002,0x729000A2,0x21F802D3, -0x21F802D3,0x7A5000C9,0x723400A2,0x680002D3,0xFEBC0075,0xFECC01A0,0xD402D4,0xE4AC0000,0xB2AC0001,0x9CAC0001,0x96B00006,0x88AC0002,0xFEA0005A,0xD89C0001,0x90A400A2,0x7A880002,0x1BC02D3,0xE400A4,0xE400A4,0xE400A4,0xE400A4,0xA0D40000,0xA0D40000,0xA0D40000,0x84D40000,0x84D40000,0x7AD40001,0x15400A2,0x15400A2,0x15400A2,0x8AB40001,0x8AB40001, -0x7AC40001,0x2FFC00A2,0x2FFC00A2,0x7A8C0001,0x720000A2,0x15400A2,0x15400A2,0x15400A2,0x8AB40001,0x8AB40001,0x7AC40001,0x2FFC00A2,0x2FFC00A2,0x7A8C0001,0x720000A2,0x2FFC00A2,0x2FFC00A2,0x7A8C0001,0x720000A2,0x720000A2,0xF6C80012,0xF6DC0029,0xE400A4,0xE4AC0000,0xB0B00000,0x9AB00000,0x90B80001,0x8AA80000,0xFAAC0019,0xD89C0000,0x1E800A2,0x7A8C0001, -0x1E800A2,0x10C00C8,0xB8F80000,0x98F80001,0x8EF80001,0x38C00C8,0xA0D40001,0x8EE40001,0x4BFC00C8,0x8EA80001,0x840000CA,0x38C00C8,0xA0D40001,0x8EE40001,0x4BFC00C8,0x8EA80001,0x840000CA,0x4BFC00C8,0x8EA80001,0x840000CA,0x840000CA,0x38C00C8,0xA0D40001,0x8EE40001,0x4BFC00C8,0x8EA80001,0x840000CA,0x4BFC00C8,0x8EA80001,0x840000CA,0x840000CA,0x4BFC00C8, -0x8EA80001,0x840000CA,0x840000CA,0x840000CA,0xF2E0003D,0x71C00C8,0xFB040048,0xE0B00000,0xB6A80001,0x9EA80000,0x8EC00001,0x8E8C0001,0xF4C80032,0xDA9C0000,0x92C40000,0x840000CA,0x1DF800C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0xC000C8,0x92AC0000,0x92AC0000,0x92AC0000,0x92AC0000,0x92AC0000, -0x92AC0000,0x72AC0001,0x72AC0001,0x72AC0001,0x68AC0001,0x11C00C8,0x11C00C8,0x11C00C8,0x11C00C8,0x11C00C8,0x11C00C8,0x7A880001,0x7A880001,0x7A880001,0x68980001,0x13FC00C8,0x13FC00C8,0x13FC00C8,0x685C0001,0x5E0000CA,0xF2B40034,0xC000C8,0xC000C8,0xE0AC0000,0xB6AC0000,0xA0AC0000,0xA0AC0000,0x86AC0000,0xFC9C0012,0xD09C0001,0x7AA80001,0x7A880001, -0x19800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table79[] = { -0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x11F80000, -0x11F80000,0x11F80000,0x11F80000,0x5C000001,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xC80000,0xC80000,0xC80000,0x1180000,0x1900000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000, -0x3500000,0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0x70000001,0xF40000,0xE40000,0xE40000,0x1080000,0x31C0000,0x3340000,0x3340000,0x1800000,0x1080000,0x31C0000,0x1E40000,0x2DFC0000, -0x1E40000,0x1080000,0x1080000,0x1080000,0x1080000,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x82000001,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x82000001,0x49F80000,0x49F80000,0x82000001,0x82000001,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x82000001,0x49F80000,0x49F80000,0x82000001,0x82000001,0x49F80000, 
-0x49F80000,0x82000001,0x82000001,0x82000001,0x1340000,0x5180000,0x1080000,0x1680000,0x1BC0000,0x19FC0000,0x31FC0000,0x59F40000,0x14C0000,0x1880000,0x19FC0000,0x82000001,0x19FC0000,0x1300000,0x1C40000,0x67F80000,0x96000001,0x1C40000,0x67F80000,0x96000001,0x67F80000,0x96000001,0x96000001,0x1C40000,0x67F80000,0x96000001,0x67F80000,0x96000001, -0x96000001,0x67F80000,0x96000001,0x96000001,0x96000001,0x1C40000,0x67F80000,0x96000001,0x67F80000,0x96000001,0x96000001,0x67F80000,0x96000001,0x96000001,0x96000001,0x67F80000,0x96000001,0x96000001,0x96000001,0x96000001,0x17C0000,0x1440000,0x1440000,0x3FC0000,0x53F80000,0x7FF00000,0x96000001,0x96000001,0x19C0000,0x25FC0000,0x91F40000,0x96000001, -0x3FF80000,0xF40738,0xE4E402D4,0xAAE402D4,0x96E402D5,0xDAD00190,0xB2D000D5,0x98D8013A,0xA0D00190,0x94D000D5,0x8CD00192,0xD4BC02D3,0xB4BC00A2,0x9CC800FF,0xA4BC00D1,0x96BC0002,0x8CC000D7,0x96BC02D4,0x92B400FD,0x8AB4013B,0x84BC02D3,0x16C0734,0xBC9802D3,0x96BC02D4,0xAE8C0192,0x98A400CF,0x8CAC0192,0xA87402D3,0x968400A3,0x8C8400D7,0x849402D4,0x3BFC0734, -0x964402D4,0x8C380192,0x841C02D3,0x7A000734,0xFED8019A,0xFEEC04DA,0xFEEC0585,0xECBC0000,0xBEBC0001,0xA4BC0002,0x98C00012,0x94BC0011,0xFEBC0167,0xE0AC0002,0x9AB000A9,0x8C8400D7,0x7FC0734,0x10802D4,0xD0F800A4,0xA4F800A4,0x96F400A5,0xC6E400C8,0xA8E40001,0x98E80015,0x9AE400C8,0x94E00026,0x8CE400CA,0x18C02D3,0xB2C000A2,0x96DC00A3,0xA6B400C9,0x96BC0002, -0x8EC800CA,0x4BF802D3,0x968400A3,0x8C7400CA,0x840002D3,0x18C02D3,0xB2C000A2,0x96DC00A3,0xA6B400C9,0x96BC0002,0x8EC800CA,0x4BF802D3,0x968400A3,0x8C7400CA,0x840002D3,0x4BF802D3,0x968400A3,0x8C7400CA,0x840002D3,0x840002D3,0xFEE400A6,0xF90001D4,0xFB0401C8,0xECBC0000,0xBEBC0001,0xA4BC0002,0x98C80002,0x96B00005,0xFACC00B5,0xE0AC0001,0x9CA800A2,0x8C7400CA, -0x1BFC02D3,0xE402D4,0xE402D4,0xE402D4,0xE402D4,0xBCD000C8,0xBCD000C8,0xBCD000C8,0x90D000C8,0x90D000C8,0x82D000C9,0xB4BC00A2,0xB4BC00A2,0xB4BC00A2,0x96BC0001,0x96BC0001,0x86C00026,0x88BC00A2,0x88BC00A2,0x82B80015,0x7ABC00A2,0x15002D3,0x15002D3,0x15002D3,0x9C9C00C9,0x9C9C00C9,0x84B400C9,0x968400A2,0x968400A2,0x82980002,0x7AA000A2,0x2DF802D3, -0x2DF802D3,0x826000C9,0x7A4400A2,0x700002D3,0xFECC0084,0xF6DC01B9,0xE402D4,0xECBC0000,0xBABC0001,0xA4BC0001,0x9EC00006,0x90BC0002,0xFEB40065,0xE0AC0001,0x98B400A2,0x82980002,0x1E002D3,0xF400A4,0xF400A4,0xF400A4,0xF400A4,0xA8E40000,0xA8E40000,0xA8E40000,0x8CE40000,0x8CE40000,0x82E40001,0x16C00A2,0x16C00A2,0x16C00A2,0x92C40001,0x92C40001, -0x82D40001,0x3BFC00A2,0x3BFC00A2,0x829C0001,0x7A0000A2,0x16C00A2,0x16C00A2,0x16C00A2,0x92C40001,0x92C40001,0x82D40001,0x3BFC00A2,0x3BFC00A2,0x829C0001,0x7A0000A2,0x3BFC00A2,0x3BFC00A2,0x829C0001,0x7A0000A2,0x7A0000A2,0xFED80012,0xFEEC0029,0xF400A4,0xECBC0000,0xB8C00000,0xA2C00000,0x98C80001,0x92B80000,0xFCC00020,0xE0AC0000,0x7FC00A2,0x829C0001, -0x7FC00A2,0x11C00C8,0xC1080000,0xA1080001,0x97080001,0x3A400C8,0xA8E40001,0x96F40001,0x57FC00C8,0x96B80001,0x8C0000CA,0x3A400C8,0xA8E40001,0x96F40001,0x57FC00C8,0x96B80001,0x8C0000CA,0x57FC00C8,0x96B80001,0x8C0000CA,0x8C0000CA,0x3A400C8,0xA8E40001,0x96F40001,0x57FC00C8,0x96B80001,0x8C0000CA,0x57FC00C8,0x96B80001,0x8C0000CA,0x8C0000CA,0x57FC00C8, -0x96B80001,0x8C0000CA,0x8C0000CA,0x8C0000CA,0xFAF0003D,0xF2C00C8,0xF3140055,0xE8C00000,0xBEB80001,0xA6B80000,0x96D00001,0x969C0001,0xFCD80032,0xE2AC0000,0x9AD40000,0x8C0000CA,0x2BFC00C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0xD000C8,0x9ABC0000,0x9ABC0000,0x9ABC0000,0x9ABC0000,0x9ABC0000, 
-0x9ABC0000,0x7ABC0001,0x7ABC0001,0x7ABC0001,0x70BC0001,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x82980001,0x82980001,0x82980001,0x70A80001,0x1FF800C8,0x1FF800C8,0x1FF800C8,0x706C0001,0x660000CA,0xFAC40034,0xD000C8,0xD000C8,0xE8BC0000,0xBEBC0000,0xA8BC0000,0xA8BC0000,0x8EBC0000,0xF8B00019,0xD8AC0001,0x82B80001,0x82980001, -0x1B800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table80[] = { -0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0x3300000,0x3300000,0x3300000,0x3300000,0x3300000,0x3300000,0x3300000,0x3300000,0x3300000,0x3300000,0x1DFC0000, -0x1DFC0000,0x1DFC0000,0x1DFC0000,0x66000000,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0xDC0000,0xDC0000,0xDC0000,0x3300000,0x1B40000,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000, -0x16C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x7A000000,0xB040000,0xF40001,0xF40001,0x51C0000,0x1340000,0x34C0000,0x34C0000,0x19C0000,0x51C0000,0x1340000,0x7FC0000,0x3BFC0000, -0x7FC0000,0x1180001,0x1180001,0x1180001,0x1180001,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x8C000000,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x8C000000,0x57F80000,0x57F80000,0x8C000000,0x8C000000,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x8C000000,0x57F80000,0x57F80000,0x8C000000,0x8C000000,0x57F80000, -0x57F80000,0x8C000000,0x8C000000,0x8C000000,0x1480000,0x12C0000,0x1180001,0x1800000,0x1D80000,0x29FC0000,0x3FFC0000,0x65F40000,0x3600000,0x1A40000,0x29FC0000,0x8C000000,0x29FC0000,0x1400001,0x3DC0000,0x73FC0000,0xA0000000,0x3DC0000,0x73FC0000,0xA0000000,0x73FC0000,0xA0000000,0xA0000000,0x3DC0000,0x73FC0000,0xA0000000,0x73FC0000,0xA0000000, -0xA0000000,0x73FC0000,0xA0000000,0xA0000000,0xA0000000,0x3DC0000,0x73FC0000,0xA0000000,0x73FC0000,0xA0000000,0xA0000000,0x73FC0000,0xA0000000,0xA0000000,0xA0000000,0x73FC0000,0xA0000000,0xA0000000,0xA0000000,0xA0000000,0x1940000,0x1580000,0x1580000,0x19FC0000,0x61FC0000,0x89FC0000,0xA0000000,0xA0000000,0x3B40000,0x37FC0000,0x9BE80000,0xA0000000, -0x4FFC0000,0x1080734,0xF0F402D3,0xB2F402D4,0xA0F402D3,0xE2E40192,0xBAE400D7,0xA4E8013B,0xA8E40192,0x9EE000D7,0x96E00192,0xDAD002D4,0xBCD000A3,0xA4D800FD,0xACCC00CF,0xA0D00002,0x96D400D5,0xA0CC02D4,0x9AC400FF,0x92CC013A,0x8CCC02D5,0x1880734,0xC4AC02D3,0xA0D002D4,0xB89C0192,0xA0B400D1,0x96BC0190,0xB28402D3,0xA09400A2,0x969800D5,0x8CA802D4,0x49F80734, -0xA05402D3,0x96440190,0x8C3002D4,0x82000738,0xFEEC01E2,0xF9000514,0xFB040594,0xF4D00002,0xC6CC0003,0xAED00002,0xA0D40011,0x9EC80012,0xFED4019A,0xECBC0000,0xA2C400A9,0x969800D5,0x19FC0734,0x11C02D3,0xDB0800A2,0xAF0800A2,0xA10800A2,0xCCF800C9,0xB0F80002,0xA2F80015,0xA4F400C9,0x9EF00026,0x96F400C9,0x3A402D3,0xBCD000A2,0xA0EC00A2,0xB0C400C9,0xA0D00001, -0x96DC00C8,0x57FC02D3,0xA09000A2,0x968400C8,0x8C0002D4,0x3A402D3,0xBCD000A2,0xA0EC00A2,0xB0C400C9,0xA0D00001,0x96DC00C8,0x57FC02D3,0xA09000A2,0x968400C8,0x8C0002D4,0x57FC02D3,0xA09000A2,0x968400C8,0x8C0002D4,0x8C0002D4,0xFEF800B6,0xFF0C01E3,0xF51801E6,0xF4D00001,0xC6CC0002,0xAED00001,0xA0DC0002,0x9EC00006,0xFEDC00C5,0xECBC0000,0xA4BC00A2,0x968400C8, 
-0x2BFC02D3,0xF402D3,0xF402D3,0xF402D3,0xF402D3,0xC2E400CA,0xC2E400CA,0xC2E400CA,0x9AE000CA,0x9AE000CA,0x8CE000CA,0xBCD000A3,0xBCD000A3,0xBCD000A3,0x9ED00002,0x9ED00002,0x8ED40026,0x90D000A3,0x90D000A3,0x8ACC0015,0x82CC00A5,0x36802D3,0x36802D3,0x36802D3,0xA4B000C9,0xA4B000C9,0x8CC800C8,0x9E9800A2,0x9E9800A2,0x8CA80001,0x82B400A4,0x39FC02D3, -0x39FC02D3,0x8C7000C8,0x825C00A4,0x7A0002D4,0xFAE000A6,0xFEEC01BE,0xF402D3,0xF8CC0001,0xC2D00002,0xACD00002,0xA6D00005,0x9ACC0002,0xFCCC0080,0xECBC0000,0xA0C800A3,0x8CA80001,0x5FC02D3,0x10800A2,0x10800A2,0x10800A2,0x10800A2,0xAEF80001,0xAEF80001,0xAEF80001,0x94F80001,0x94F80001,0x8CF40001,0x18800A2,0x18800A2,0x18800A2,0x9AD80001,0x9AD80001, -0x8CE40000,0x49F800A2,0x49F800A2,0x8CAC0000,0x820000A4,0x18800A2,0x18800A2,0x18800A2,0x9AD80001,0x9AD80001,0x8CE40000,0x49F800A2,0x49F800A2,0x8CAC0000,0x820000A4,0x49F800A2,0x49F800A2,0x8CAC0000,0x820000A4,0x820000A4,0xFAEC0019,0xF9000032,0x10800A2,0xF8CC0000,0xBED40000,0xAAD40000,0xA2D80000,0x9ACC0001,0xFED00028,0xECBC0000,0x19FC00A2,0x8CAC0000, -0x19FC00A2,0x12C00CA,0xC71C0001,0xAB180001,0xA1180001,0x1C000C8,0xB0F80001,0xA1040001,0x65F800C8,0xA0C80000,0x960000C8,0x1C000C8,0xB0F80001,0xA1040001,0x65F800C8,0xA0C80000,0x960000C8,0x65F800C8,0xA0C80000,0x960000C8,0x960000C8,0x1C000C8,0xB0F80001,0xA1040001,0x65F800C8,0xA0C80000,0x960000C8,0x65F800C8,0xA0C80000,0x960000C8,0x960000C8,0x65F800C8, -0xA0C80000,0x960000C8,0x960000C8,0x960000C8,0xFB040048,0x94000C8,0xFD280055,0xF4D00000,0xC6CC0001,0xB0C80000,0xA0E00000,0xA0A80000,0xFEF0003D,0xECBC0000,0xA2E80001,0x960000C8,0x3DF800C8,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xE000CA,0xA0D00001,0xA0D00001,0xA0D00001,0xA0D00001,0xA0D00001, -0xA0D00001,0x84CC0001,0x84CC0001,0x84CC0001,0x7ACC0001,0x15000C8,0x15000C8,0x15000C8,0x15000C8,0x15000C8,0x15000C8,0x8AAC0001,0x8AAC0001,0x8AAC0001,0x7AB80001,0x2DF800C8,0x2DF800C8,0x2DF800C8,0x7A7C0000,0x700000C8,0xF4D8003D,0xE000CA,0xE000CA,0xECD00001,0xC4D00001,0xAED00001,0xAED00001,0x96D00001,0xF4C40020,0xEABC0000,0x8EC80000,0x8AAC0001, -0x1E000C8,}; -static const uint32_t g_etc1_to_bc7_m6_table81[] = { -0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x29FC0000, -0x29FC0000,0x29FC0000,0x29FC0000,0x6E000000,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xDC0001,0xEC0000,0xEC0000,0xEC0000,0x1480000,0x1D80000,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000, -0x1840000,0x47FC0000,0x47FC0000,0x47FC0000,0x82000000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x47FC0000,0x47FC0000,0x47FC0000,0x82000000,0x47FC0000,0x47FC0000,0x47FC0000,0x82000000,0x82000000,0x1180000,0x1040001,0x1040001,0x1300000,0x1480000,0x1640000,0x1640000,0x1B80000,0x1300000,0x1480000,0x17FC0000,0x47FC0000, -0x17FC0000,0x1280001,0x1280001,0x1280001,0x1280001,0x1BC0000,0x1BC0000,0x1BC0000,0x61FC0000,0x61FC0000,0x94000000,0x1BC0000,0x1BC0000,0x1BC0000,0x61FC0000,0x61FC0000,0x94000000,0x61FC0000,0x61FC0000,0x94000000,0x94000000,0x1BC0000,0x1BC0000,0x1BC0000,0x61FC0000,0x61FC0000,0x94000000,0x61FC0000,0x61FC0000,0x94000000,0x94000000,0x61FC0000, 
-0x61FC0000,0x94000000,0x94000000,0x94000000,0x5580000,0x73C0000,0x1280001,0x1940000,0x1F40000,0x39FC0000,0x4DFC0000,0x6FFC0000,0x3740000,0x1BC0000,0x39FC0000,0x94000000,0x39FC0000,0x1500001,0x3F40000,0x7FFC0000,0xA8000000,0x3F40000,0x7FFC0000,0xA8000000,0x7FFC0000,0xA8000000,0xA8000000,0x3F40000,0x7FFC0000,0xA8000000,0x7FFC0000,0xA8000000, -0xA8000000,0x7FFC0000,0xA8000000,0xA8000000,0xA8000000,0x3F40000,0x7FFC0000,0xA8000000,0x7FFC0000,0xA8000000,0xA8000000,0x7FFC0000,0xA8000000,0xA8000000,0xA8000000,0x7FFC0000,0xA8000000,0xA8000000,0xA8000000,0xA8000000,0x1A80000,0x1680000,0x1680000,0x2DFC0000,0x6FFC0000,0x93FC0000,0xA8000000,0xA8000000,0x1CC0000,0x49FC0000,0xA3F80000,0xA8000000, -0x5FF80000,0x1180734,0xF90402D3,0xBB0402D4,0xA90402D3,0xEAF40192,0xC2F400D7,0xACF8013B,0xB0F40192,0xA6F000D7,0x9EF00192,0xE2E002D4,0xC4E000A3,0xACE800FD,0xB4DC00CF,0xA8E00002,0x9EE400D5,0xA8DC02D4,0xA2D400FF,0x9ADC013A,0x94DC02D5,0x1A00734,0xCCBC02D3,0xA8E002D4,0xC0AC0192,0xA8C400D1,0x9ECC0190,0xBA9402D3,0xA8A400A2,0x9EA800D5,0x94B802D4,0x55F80734, -0xA86402D3,0x9E540190,0x944002D4,0x8A000738,0xFEF8021F,0xFF0C0524,0xF31405C3,0xFCE00002,0xCEDC0003,0xB6E00002,0xA8E40011,0xA6D80012,0xFCEC01F6,0xF4CC0000,0xAAD400A9,0x9EA800D5,0x27FC0734,0x12C02D3,0xE31800A2,0xB71800A2,0xA91800A2,0xD50800C9,0xB9080002,0xAB080015,0xAD0400C9,0xA7000026,0x9F0400C9,0x3BC02D3,0xC4E000A2,0xA8FC00A2,0xB8D400C9,0xA8E00001, -0x9EEC00C8,0x63FC02D3,0xA8A000A2,0x9E9400C8,0x940002D4,0x3BC02D3,0xC4E000A2,0xA8FC00A2,0xB8D400C9,0xA8E00001,0x9EEC00C8,0x63FC02D3,0xA8A000A2,0x9E9400C8,0x940002D4,0x63FC02D3,0xA8A000A2,0x9E9400C8,0x940002D4,0x940002D4,0xFB0C00DE,0xF92001F1,0xFD2801E6,0xFCE00001,0xCEDC0002,0xB6E00001,0xA8EC0002,0xA6D00006,0xFCF400E1,0xF4CC0000,0xACCC00A2,0x9E9400C8, -0x3BFC02D3,0x10402D3,0x10402D3,0x10402D3,0x10402D3,0xCAF400CA,0xCAF400CA,0xCAF400CA,0xA2F000CA,0xA2F000CA,0x94F000CA,0xC4E000A3,0xC4E000A3,0xC4E000A3,0xA6E00002,0xA6E00002,0x96E40026,0x98E000A3,0x98E000A3,0x92DC0015,0x8ADC00A5,0x38002D3,0x38002D3,0x38002D3,0xACC000C9,0xACC000C9,0x94D800C8,0xA6A800A2,0xA6A800A2,0x94B80001,0x8AC400A4,0x45FC02D3, -0x45FC02D3,0x948000C8,0x8A6C00A4,0x820002D4,0xFEF400BA,0xF90001D3,0x10402D3,0xF6E00002,0xCAE00002,0xB4E00002,0xAEE00005,0xA2DC0002,0xFED800A1,0xF4CC0000,0xA8D800A3,0x94B80001,0x15FC02D3,0x11800A2,0x11800A2,0x11800A2,0x11800A2,0xB7080001,0xB7080001,0xB7080001,0x9D080001,0x9D080001,0x95040001,0x1A000A2,0x1A000A2,0x1A000A2,0xA2E80001,0xA2E80001, -0x94F40000,0x55F800A2,0x55F800A2,0x94BC0000,0x8A0000A4,0x1A000A2,0x1A000A2,0x1A000A2,0xA2E80001,0xA2E80001,0x94F40000,0x55F800A2,0x55F800A2,0x94BC0000,0x8A0000A4,0x55F800A2,0x55F800A2,0x94BC0000,0x8A0000A4,0x8A0000A4,0xF7000020,0xFF0C003A,0x11800A2,0xF4E40001,0xC6E40000,0xB2E40000,0xAAE80000,0xA2DC0001,0xF8EC0029,0xF4CC0000,0x27FC00A2,0x94BC0000, -0x27FC00A2,0x13C00CA,0xCF2C0001,0xB3280001,0xA9280001,0x1D800C8,0xB9080001,0xA9140001,0x71F800C8,0xA8D80000,0x9E0000C8,0x1D800C8,0xB9080001,0xA9140001,0x71F800C8,0xA8D80000,0x9E0000C8,0x71F800C8,0xA8D80000,0x9E0000C8,0x9E0000C8,0x1D800C8,0xB9080001,0xA9140001,0x71F800C8,0xA8D80000,0x9E0000C8,0x71F800C8,0xA8D80000,0x9E0000C8,0x9E0000C8,0x71F800C8, -0xA8D80000,0x9E0000C8,0x9E0000C8,0x9E0000C8,0xFF14004A,0x15400C8,0xF5380062,0xFCE00000,0xCEDC0001,0xB8D80000,0xA8F00000,0xA8B80000,0xF7080048,0xF4CC0000,0xAAF80001,0x9E0000C8,0x4BFC00C8,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xF000CA,0xA8E00001,0xA8E00001,0xA8E00001,0xA8E00001,0xA8E00001, 
-0xA8E00001,0x8CDC0001,0x8CDC0001,0x8CDC0001,0x82DC0001,0x16800C8,0x16800C8,0x16800C8,0x16800C8,0x16800C8,0x16800C8,0x92BC0001,0x92BC0001,0x92BC0001,0x82C80001,0x39F800C8,0x39F800C8,0x39F800C8,0x828C0000,0x780000C8,0xFCE8003D,0xF000CA,0xF000CA,0xF4E00001,0xCCE00001,0xB6E00001,0xB6E00001,0x9EE00001,0xFCD40020,0xF2CC0000,0x96D80000,0x92BC0001, -0x3FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table82[] = { -0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x35FC0000, -0x35FC0000,0x35FC0000,0x35FC0000,0x76000000,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0x6FC0000,0x6FC0000,0x6FC0000,0x1600000,0x1F80000,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000, -0x19C0000,0x53FC0000,0x53FC0000,0x53FC0000,0x8A000000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x53FC0000,0x53FC0000,0x53FC0000,0x8A000000,0x53FC0000,0x53FC0000,0x53FC0000,0x8A000000,0x8A000000,0x1280000,0x1140001,0x1140001,0x1440000,0x15C0000,0x3780000,0x3780000,0x1D40000,0x1440000,0x15C0000,0x25FC0000,0x53FC0000, -0x25FC0000,0x1380001,0x1380001,0x1380001,0x1380001,0x3D00000,0x3D00000,0x3D00000,0x6DFC0000,0x6DFC0000,0x9C000000,0x3D00000,0x3D00000,0x3D00000,0x6DFC0000,0x6DFC0000,0x9C000000,0x6DFC0000,0x6DFC0000,0x9C000000,0x9C000000,0x3D00000,0x3D00000,0x3D00000,0x6DFC0000,0x6DFC0000,0x9C000000,0x6DFC0000,0x6DFC0000,0x9C000000,0x9C000000,0x6DFC0000, -0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x16C0000,0xF4C0000,0x1380001,0x1AC0000,0xFFC0000,0x47FC0000,0x5BFC0000,0x7BF40000,0x3880000,0x3D00000,0x47FC0000,0x9C000000,0x47FC0000,0x1600001,0x13FC0000,0x8BFC0000,0xB0000000,0x13FC0000,0x8BFC0000,0xB0000000,0x8BFC0000,0xB0000000,0xB0000000,0x13FC0000,0x8BFC0000,0xB0000000,0x8BFC0000,0xB0000000, -0xB0000000,0x8BFC0000,0xB0000000,0xB0000000,0xB0000000,0x13FC0000,0x8BFC0000,0xB0000000,0x8BFC0000,0xB0000000,0xB0000000,0x8BFC0000,0xB0000000,0xB0000000,0xB0000000,0x8BFC0000,0xB0000000,0xB0000000,0xB0000000,0xB0000000,0x1BC0000,0x5780000,0x5780000,0x41FC0000,0x7DF80000,0x9DFC0000,0xB0000000,0xB0000000,0x1E00000,0x59FC0000,0xADCC0000,0xB0000000, -0x6DFC0000,0x1280734,0xFD1402D4,0xC31402D4,0xB11402D3,0xF3040192,0xCB0400D7,0xB508013B,0xB9040192,0xAF0000D7,0xA7000192,0xEAF002D4,0xCCF000A3,0xB4F800FD,0xBCEC00CF,0xB0F00002,0xA6F400D5,0xB0EC02D4,0xAAE400FF,0xA2EC013A,0x9CEC02D5,0x1B80734,0xD4CC02D3,0xB0F002D4,0xC8BC0192,0xB0D400D1,0xA6DC0190,0xC2A402D3,0xB0B400A2,0xA6B800D5,0x9CC802D4,0x61F80734, -0xB07402D3,0xA6640190,0x9C5002D4,0x92000738,0xFF0C0252,0xF9200552,0xFB2405C3,0xFEF00006,0xD6EC0003,0xBEF00002,0xB0F40011,0xAEE80012,0xFEF8021A,0xFCDC0000,0xB2E400A9,0xA6B800D5,0x37FC0734,0x13C02D3,0xEB2800A2,0xBF2800A2,0xB12800A2,0xDD1800C9,0xC1180002,0xB3180015,0xB51400C9,0xAF100026,0xA71400C9,0x1D402D3,0xCCF000A2,0xB10C00A2,0xC0E400C9,0xB0F00001, -0xA6FC00C8,0x6FFC02D3,0xB0B000A2,0xA6A400C8,0x9C0002D4,0x1D402D3,0xCCF000A2,0xB10C00A2,0xC0E400C9,0xB0F00001,0xA6FC00C8,0x6FFC02D3,0xB0B000A2,0xA6A400C8,0x9C0002D4,0x6FFC02D3,0xB0B000A2,0xA6A400C8,0x9C0002D4,0x9C0002D4,0xFD1C00F5,0xFF2C0209,0xF5380203,0xFCF40005,0xD6EC0002,0xBEF00001,0xB0FC0002,0xAEE00006,0xFF0800F4,0xFCDC0000,0xB4DC00A2,0xA6A400C8, 
-0x49FC02D3,0x11402D3,0x11402D3,0x11402D3,0x11402D3,0xD30400CA,0xD30400CA,0xD30400CA,0xAB0000CA,0xAB0000CA,0x9D0000CA,0xCCF000A3,0xCCF000A3,0xCCF000A3,0xAEF00002,0xAEF00002,0x9EF40026,0xA0F000A3,0xA0F000A3,0x9AEC0015,0x92EC00A5,0x39802D3,0x39802D3,0x39802D3,0xB4D000C9,0xB4D000C9,0x9CE800C8,0xAEB800A2,0xAEB800A2,0x9CC80001,0x92D400A4,0x51FC02D3, -0x51FC02D3,0x9C9000C8,0x927C00A4,0x8A0002D4,0xFF0400CB,0xFF0C01DB,0x11402D3,0xFEF00002,0xD2F00002,0xBCF00002,0xB6F00005,0xAAEC0002,0xFCF000B5,0xFCDC0000,0xB0E800A3,0x9CC80001,0x23FC02D3,0x12800A2,0x12800A2,0x12800A2,0x12800A2,0xBF180001,0xBF180001,0xBF180001,0xA5180001,0xA5180001,0x9D140001,0x1B800A2,0x1B800A2,0x1B800A2,0xAAF80001,0xAAF80001, -0x9D040000,0x61F800A2,0x61F800A2,0x9CCC0000,0x920000A4,0x1B800A2,0x1B800A2,0x1B800A2,0xAAF80001,0xAAF80001,0x9D040000,0x61F800A2,0x61F800A2,0x9CCC0000,0x920000A4,0x61F800A2,0x61F800A2,0x9CCC0000,0x920000A4,0x920000A4,0xFF100020,0xF920003D,0x12800A2,0xFCF40001,0xCEF40000,0xBAF40000,0xB2F80000,0xAAEC0001,0xFEF8002D,0xFCDC0000,0x37FC00A2,0x9CCC0000, -0x37FC00A2,0x14C00CA,0xD73C0001,0xBB380001,0xB1380001,0x1F000C8,0xC1180001,0xB1240001,0x7DF800C8,0xB0E80000,0xA60000C8,0x1F000C8,0xC1180001,0xB1240001,0x7DF800C8,0xB0E80000,0xA60000C8,0x7DF800C8,0xB0E80000,0xA60000C8,0xA60000C8,0x1F000C8,0xC1180001,0xB1240001,0x7DF800C8,0xB0E80000,0xA60000C8,0x7DF800C8,0xB0E80000,0xA60000C8,0xA60000C8,0x7DF800C8, -0xB0E80000,0xA60000C8,0xA60000C8,0xA60000C8,0xFB2C0055,0x16400C8,0xFD480062,0xFEF40001,0xD6EC0001,0xC0E80000,0xB1000000,0xB0C80000,0xFF180048,0xFCDC0000,0xB3080001,0xA60000C8,0x5BFC00C8,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0x10000CA,0xB0F00001,0xB0F00001,0xB0F00001,0xB0F00001,0xB0F00001, -0xB0F00001,0x94EC0001,0x94EC0001,0x94EC0001,0x8AEC0001,0x18000C8,0x18000C8,0x18000C8,0x18000C8,0x18000C8,0x18000C8,0x9ACC0001,0x9ACC0001,0x9ACC0001,0x8AD80001,0x45F800C8,0x45F800C8,0x45F800C8,0x8A9C0000,0x800000C8,0xF4F8004A,0x10000CA,0x10000CA,0xFCF00001,0xD4F00001,0xBEF00001,0xBEF00001,0xA6F00001,0xF8E80029,0xFADC0000,0x9EE80000,0x9ACC0001, -0x13FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table83[] = { -0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x41FC0000, -0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xF0C0000,0xF0C0000,0xF0C0000,0x1780000,0xFFC0000,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000, -0x1B40000,0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x92000000,0x5380000,0x1240001,0x1240001,0x3540000,0x1700000,0x1900000,0x1900000,0x3EC0000,0x3540000,0x1700000,0x35FC0000,0x5FF80000, -0x35FC0000,0x1480001,0x1480001,0x1480001,0x1480001,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0xA4000000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0xA4000000,0x79FC0000,0x79FC0000,0xA4000000,0xA4000000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0xA4000000,0x79FC0000,0x79FC0000,0xA4000000,0xA4000000,0x79FC0000, 
-0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0x1800000,0x1600000,0x1480001,0x1C00000,0x23FC0000,0x57FC0000,0x69F80000,0x85F80000,0x39C0000,0x3E80000,0x57FC0000,0xA4000000,0x57FC0000,0x1700001,0x2BFC0000,0x97FC0000,0xB8000000,0x2BFC0000,0x97FC0000,0xB8000000,0x97FC0000,0xB8000000,0xB8000000,0x2BFC0000,0x97FC0000,0xB8000000,0x97FC0000,0xB8000000, -0xB8000000,0x97FC0000,0xB8000000,0xB8000000,0xB8000000,0x2BFC0000,0x97FC0000,0xB8000000,0x97FC0000,0xB8000000,0xB8000000,0x97FC0000,0xB8000000,0xB8000000,0xB8000000,0x97FC0000,0xB8000000,0xB8000000,0xB8000000,0xB8000000,0x1D00000,0xD880000,0xD880000,0x53FC0000,0x89FC0000,0xA7FC0000,0xB8000000,0xB8000000,0x1F80000,0x6BFC0000,0xB5DC0000,0xB8000000, -0x7DF80000,0x1380734,0xFF2402DC,0xCB2402D4,0xB92402D3,0xFB140192,0xD31400D7,0xBD18013B,0xC1140192,0xB71000D7,0xAF100192,0xF30002D4,0xD50000A3,0xBD0800FD,0xC4FC00CF,0xB9000002,0xAF0400D5,0xB8FC02D4,0xB2F400FF,0xAAFC013A,0xA4FC02D5,0x1D00734,0xDCDC02D3,0xB90002D4,0xD0CC0192,0xB8E400D1,0xAEEC0190,0xCAB402D3,0xB8C400A2,0xAEC800D5,0xA4D802D4,0x6DF80734, -0xB88402D3,0xAE740190,0xA46002D4,0x9A000738,0xFF20028E,0xFF2C056A,0xF33405F4,0xFF04001E,0xDEFC0003,0xC7000002,0xB9040011,0xB6F80012,0xFF10025E,0xFEF00006,0xBAF400A9,0xAEC800D5,0x45FC0734,0x14C02D3,0xF33800A2,0xC73800A2,0xB93800A2,0xE52800C9,0xC9280002,0xBB280015,0xBD2400C9,0xB7200026,0xAF2400C9,0x1EC02D3,0xD50000A2,0xB91C00A2,0xC8F400C9,0xB9000001, -0xAF0C00C8,0x7BFC02D3,0xB8C000A2,0xAEB400C8,0xA40002D4,0x1EC02D3,0xD50000A2,0xB91C00A2,0xC8F400C9,0xB9000001,0xAF0C00C8,0x7BFC02D3,0xB8C000A2,0xAEB400C8,0xA40002D4,0x7BFC02D3,0xB8C000A2,0xAEB400C8,0xA40002D4,0xA40002D4,0xFD30010A,0xFB44020B,0xFD480203,0xFF08000D,0xDEFC0002,0xC7000001,0xB90C0002,0xB6F00006,0xFF180119,0xFEF00005,0xBCEC00A2,0xAEB400C8, -0x59FC02D3,0x12402D3,0x12402D3,0x12402D3,0x12402D3,0xDB1400CA,0xDB1400CA,0xDB1400CA,0xB31000CA,0xB31000CA,0xA51000CA,0xD50000A3,0xD50000A3,0xD50000A3,0xB7000002,0xB7000002,0xA7040026,0xA90000A3,0xA90000A3,0xA2FC0015,0x9AFC00A5,0x3B002D3,0x3B002D3,0x3B002D3,0xBCE000C9,0xBCE000C9,0xA4F800C8,0xB6C800A2,0xB6C800A2,0xA4D80001,0x9AE400A4,0x5DFC02D3, -0x5DFC02D3,0xA4A000C8,0x9A8C00A4,0x920002D4,0xFF1000F1,0xF92001EE,0x12402D3,0xFF00000B,0xDB000002,0xC5000002,0xBF000005,0xB2FC0002,0xFF0000CB,0xFCF00002,0xB8F800A3,0xA4D80001,0x33FC02D3,0x13800A2,0x13800A2,0x13800A2,0x13800A2,0xC7280001,0xC7280001,0xC7280001,0xAD280001,0xAD280001,0xA5240001,0x1D000A2,0x1D000A2,0x1D000A2,0xB3080001,0xB3080001, -0xA5140000,0x6DF800A2,0x6DF800A2,0xA4DC0000,0x9A0000A4,0x1D000A2,0x1D000A2,0x1D000A2,0xB3080001,0xB3080001,0xA5140000,0x6DF800A2,0x6DF800A2,0xA4DC0000,0x9A0000A4,0x6DF800A2,0x6DF800A2,0xA4DC0000,0x9A0000A4,0x9A0000A4,0xFF200029,0xFF2C0049,0x13800A2,0xFD040004,0xD7040000,0xC3040000,0xBB080000,0xB2FC0001,0xFD100032,0xFCF00001,0x45FC00A2,0xA4DC0000, -0x45FC00A2,0x15C00CA,0xDF4C0001,0xC3480001,0xB9480001,0xDFC00C8,0xC9280001,0xB9340001,0x89F800C8,0xB8F80000,0xAE0000C8,0xDFC00C8,0xC9280001,0xB9340001,0x89F800C8,0xB8F80000,0xAE0000C8,0x89F800C8,0xB8F80000,0xAE0000C8,0xAE0000C8,0xDFC00C8,0xC9280001,0xB9340001,0x89F800C8,0xB8F80000,0xAE0000C8,0x89F800C8,0xB8F80000,0xAE0000C8,0xAE0000C8,0x89F800C8, -0xB8F80000,0xAE0000C8,0xAE0000C8,0xAE0000C8,0xF7400062,0x37400C8,0xF5580071,0xFD100005,0xDEFC0001,0xC8F80000,0xB9100000,0xB8D80000,0xFF2C0055,0xF8FC0002,0xBB180001,0xAE0000C8,0x69FC00C8,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0x11000CA,0xB9000001,0xB9000001,0xB9000001,0xB9000001,0xB9000001, 
-0xB9000001,0x9CFC0001,0x9CFC0001,0x9CFC0001,0x92FC0001,0x19800C8,0x19800C8,0x19800C8,0x19800C8,0x19800C8,0x19800C8,0xA2DC0001,0xA2DC0001,0xA2DC0001,0x92E80001,0x51F800C8,0x51F800C8,0x51F800C8,0x92AC0000,0x880000C8,0xFD08004A,0x11000CA,0x11000CA,0xFD000002,0xDD000001,0xC7000001,0xC7000001,0xAF000001,0xFEF4002D,0xFAF00001,0xA6F80000,0xA2DC0001, -0x21FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table84[] = { -0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x4FF80000, -0x4FF80000,0x4FF80000,0x4FF80000,0x86000001,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x9200000,0x9200000,0x9200000,0x1940000,0x1FFC0000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000, -0x1D00000,0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x9A000001,0x14C0000,0x1380000,0x1380000,0x7680000,0x1880000,0x1A80000,0x1A80000,0xDFC0000,0x7680000,0x1880000,0x45FC0000,0x6DF80000, -0x45FC0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x87FC0000,0x87FC0000,0xAC000001,0xAC000001,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x87FC0000,0x87FC0000,0xAC000001,0xAC000001,0x87FC0000, -0x87FC0000,0xAC000001,0xAC000001,0xAC000001,0x1940000,0x1740000,0x15C0000,0x1D80000,0x39FC0000,0x67FC0000,0x77FC0000,0x91FC0000,0x1B40000,0x9FC0000,0x67FC0000,0xAC000001,0x67FC0000,0x1840000,0x47FC0000,0xA5F80000,0xC0000001,0x47FC0000,0xA5F80000,0xC0000001,0xA5F80000,0xC0000001,0xC0000001,0x47FC0000,0xA5F80000,0xC0000001,0xA5F80000,0xC0000001, -0xC0000001,0xA5F80000,0xC0000001,0xC0000001,0xC0000001,0x47FC0000,0xA5F80000,0xC0000001,0xA5F80000,0xC0000001,0xC0000001,0xA5F80000,0xC0000001,0xC0000001,0xC0000001,0xA5F80000,0xC0000001,0xC0000001,0xC0000001,0xC0000001,0x5E40000,0x79C0000,0x79C0000,0x6BFC0000,0x99FC0000,0xB3F80000,0xC0000001,0xC0000001,0x1BFC0000,0x7DFC0000,0xBFD00000,0xC0000001, -0x8DFC0000,0x1480738,0xFD3802F8,0xD53802D4,0xC13802D5,0xFF240198,0xDD2400D5,0xC32C013A,0xCB240190,0xBF2400D5,0xB7240192,0xFF1002D3,0xDF1000A2,0xC71C00FF,0xCF1000D1,0xC1100002,0xB71400D7,0xC11002D4,0xBD0800FD,0xB508013B,0xAF1002D3,0x3E80734,0xE6EC02D3,0xC11002D4,0xD8E00192,0xC2F800CF,0xB7000192,0xD2C802D3,0xC0D800A3,0xB6D800D7,0xAEE802D4,0x79FC0734, -0xC09802D4,0xB68C0192,0xAE7002D3,0xA4000734,0xFF3402E1,0xFB440590,0xFD4805F4,0xFF18004A,0xE9100001,0xCF100002,0xC3140012,0xBF100011,0xFF2402C7,0xFF080025,0xC50400A9,0xB6D800D7,0x57FC0734,0x15C02D4,0xFB4C00A4,0xCF4C00A4,0xC14800A5,0xF13800C8,0xD3380001,0xC33C0015,0xC53800C8,0xBF340026,0xB73800CA,0xDFC02D3,0xDD1400A2,0xC13000A3,0xD10800C9,0xC1100002, -0xB91C00CA,0x89F802D3,0xC0D800A3,0xB6C800CA,0xAE0002D3,0xDFC02D3,0xDD1400A2,0xC13000A3,0xD10800C9,0xC1100002,0xB91C00CA,0x89F802D3,0xC0D800A3,0xB6C800CA,0xAE0002D3,0x89F802D3,0xC0D800A3,0xB6C800CA,0xAE0002D3,0xAE0002D3,0xFF44013A,0xF558022A,0xF5580225,0xFF20002A,0xE9100001,0xCF100002,0xC31C0002,0xC1040005,0xFF34013A,0xFF0C0012,0xC6FC00A2,0xB6C800CA, 
-0x69FC02D3,0x13802D4,0x13802D4,0x13802D4,0x13802D4,0xE72400C8,0xE72400C8,0xE72400C8,0xBB2400C8,0xBB2400C8,0xAD2400C9,0xDF1000A2,0xDF1000A2,0xDF1000A2,0xC1100001,0xC1100001,0xB1140026,0xB31000A2,0xB31000A2,0xAD0C0015,0xA51000A2,0x1CC02D3,0x1CC02D3,0x1CC02D3,0xC6F000C9,0xC6F000C9,0xAF0800C9,0xC0D800A2,0xC0D800A2,0xACEC0002,0xA4F400A2,0x6BF802D3, -0x6BF802D3,0xACB400C9,0xA49800A2,0x9A0002D3,0xFD28010A,0xFF2C0209,0x13802D4,0xFF180019,0xE5100001,0xCF100001,0xC9140006,0xBB100002,0xFF1400E5,0xFF04000E,0xC30800A2,0xACEC0002,0x43FC02D3,0x14800A4,0x14800A4,0x14800A4,0x14800A4,0xD3380000,0xD3380000,0xD3380000,0xB7380000,0xB7380000,0xAD380001,0x3E800A2,0x3E800A2,0x3E800A2,0xBD180001,0xBD180001, -0xAD280001,0x79FC00A2,0x79FC00A2,0xACF00001,0xA40000A2,0x3E800A2,0x3E800A2,0x3E800A2,0xBD180001,0xBD180001,0xAD280001,0x79FC00A2,0x79FC00A2,0xACF00001,0xA40000A2,0x79FC00A2,0x79FC00A2,0xACF00001,0xA40000A2,0xA40000A2,0xFB340034,0xFB440048,0x14800A4,0xFF1C0008,0xE3140000,0xCD140000,0xC31C0001,0xBD0C0000,0xF928003D,0xF90C0005,0x57FC00A2,0xACF00001, -0x57FC00A2,0x17000C8,0xEB5C0000,0xCB5C0001,0xC15C0001,0x29FC00C8,0xD3380001,0xC1480001,0x97F800C8,0xC10C0001,0xB60000CA,0x29FC00C8,0xD3380001,0xC1480001,0x97F800C8,0xC10C0001,0xB60000CA,0x97F800C8,0xC10C0001,0xB60000CA,0xB60000CA,0x29FC00C8,0xD3380001,0xC1480001,0x97F800C8,0xC10C0001,0xB60000CA,0x97F800C8,0xC10C0001,0xB60000CA,0xB60000CA,0x97F800C8, -0xC10C0001,0xB60000CA,0xB60000CA,0xB60000CA,0xFF500064,0x18800C8,0xFF6C0071,0xFD28000D,0xE90C0001,0xD10C0000,0xC1240001,0xC0F00001,0xF9480062,0xFF100005,0xC5280000,0xB60000CA,0x7BFC00C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0x12400C8,0xC5100000,0xC5100000,0xC5100000,0xC5100000,0xC5100000, -0xC5100000,0xA5100001,0xA5100001,0xA5100001,0x9B100001,0x3B000C8,0x3B000C8,0x3B000C8,0x3B000C8,0x3B000C8,0x3B000C8,0xACEC0001,0xACEC0001,0xACEC0001,0x9AFC0001,0x5DFC00C8,0x5DFC00C8,0x5DFC00C8,0x9AC00001,0x900000CA,0xF71C0055,0x12400C8,0x12400C8,0xF9140008,0xE9100000,0xD3100000,0xD3100000,0xB9100000,0xFD0C0032,0xFF000004,0xAD0C0001,0xACEC0001, -0x33FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table85[] = { -0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x5BF80000, -0x5BF80000,0x5BF80000,0x5BF80000,0x8E000001,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1340000,0x1340000,0x1340000,0x1AC0000,0x2FFC0000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000, -0x1E80000,0x79F80000,0x79F80000,0x79F80000,0xA2000001,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x79F80000,0x79F80000,0x79F80000,0xA2000001,0x79F80000,0x79F80000,0x79F80000,0xA2000001,0xA2000001,0x75C0000,0x1480000,0x1480000,0x37C0000,0x19C0000,0x3BC0000,0x3BC0000,0x21FC0000,0x37C0000,0x19C0000,0x55FC0000,0x79F80000, -0x55FC0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x23FC0000,0x23FC0000,0x23FC0000,0x93FC0000,0x93FC0000,0xB4000001,0x23FC0000,0x23FC0000,0x23FC0000,0x93FC0000,0x93FC0000,0xB4000001,0x93FC0000,0x93FC0000,0xB4000001,0xB4000001,0x23FC0000,0x23FC0000,0x23FC0000,0x93FC0000,0x93FC0000,0xB4000001,0x93FC0000,0x93FC0000,0xB4000001,0xB4000001,0x93FC0000, 
-0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0x1A80000,0x1840000,0x16C0000,0x1F00000,0x4DFC0000,0x77FC0000,0x85FC0000,0x9DF40000,0x1C80000,0x23FC0000,0x77FC0000,0xB4000001,0x77FC0000,0x1940000,0x5FFC0000,0xB1F80000,0xC8000001,0x5FFC0000,0xB1F80000,0xC8000001,0xB1F80000,0xC8000001,0xC8000001,0x5FFC0000,0xB1F80000,0xC8000001,0xB1F80000,0xC8000001, -0xC8000001,0xB1F80000,0xC8000001,0xC8000001,0xC8000001,0x5FFC0000,0xB1F80000,0xC8000001,0xB1F80000,0xC8000001,0xC8000001,0xB1F80000,0xC8000001,0xC8000001,0xC8000001,0xB1F80000,0xC8000001,0xC8000001,0xC8000001,0xC8000001,0x5F80000,0xFAC0000,0xFAC0000,0x7DFC0000,0xA7F80000,0xBDF80000,0xC8000001,0xC8000001,0x39FC0000,0x8FFC0000,0xC7E00000,0xC8000001, -0x9DF80000,0x1580738,0xFF480314,0xDD4802D4,0xC94802D5,0xFD3801B8,0xE53400D5,0xCB3C013A,0xD3340190,0xC73400D5,0xBF340192,0xFF2402D8,0xE72000A2,0xCF2C00FF,0xD72000D1,0xC9200002,0xBF2400D7,0xC92002D4,0xC51800FD,0xBD18013B,0xB72002D3,0x7FC0734,0xEEFC02D3,0xC92002D4,0xE0F00192,0xCB0800CF,0xBF100192,0xDAD802D3,0xC8E800A3,0xBEE800D7,0xB6F802D4,0x85FC0734, -0xC8A802D4,0xBE9C0192,0xB68002D3,0xAC000734,0xFF44033E,0xFF4C05D0,0xF5580625,0xFF2C0086,0xF1200001,0xD7200002,0xCB240012,0xC7200011,0xFF3402FA,0xFF1C0054,0xCD1400A9,0xBEE800D7,0x65FC0734,0x16C02D4,0xFF5C00A5,0xD75C00A4,0xC95800A5,0xF94800C8,0xDB480001,0xCB4C0015,0xCD4800C8,0xC7440026,0xBF4800CA,0x25FC02D3,0xE52400A2,0xC94000A3,0xD91800C9,0xC9200002, -0xC12C00CA,0x95F802D3,0xC8E800A3,0xBED800CA,0xB60002D3,0x25FC02D3,0xE52400A2,0xC94000A3,0xD91800C9,0xC9200002,0xC12C00CA,0x95F802D3,0xC8E800A3,0xBED800CA,0xB60002D3,0x95F802D3,0xC8E800A3,0xBED800CA,0xB60002D3,0xB60002D3,0xFF500155,0xFD68022A,0xFD680225,0xFF340042,0xF1200001,0xD7200002,0xCB2C0002,0xC9140005,0xFF44015B,0xFF240029,0xCF0C00A2,0xBED800CA, -0x79FC02D3,0x14802D4,0x14802D4,0x14802D4,0x14802D4,0xEF3400C8,0xEF3400C8,0xEF3400C8,0xC33400C8,0xC33400C8,0xB53400C9,0xE72000A2,0xE72000A2,0xE72000A2,0xC9200001,0xC9200001,0xB9240026,0xBB2000A2,0xBB2000A2,0xB51C0015,0xAD2000A2,0x1E402D3,0x1E402D3,0x1E402D3,0xCF0000C9,0xCF0000C9,0xB71800C9,0xC8E800A2,0xC8E800A2,0xB4FC0002,0xAD0400A2,0x77F802D3, -0x77F802D3,0xB4C400C9,0xACA800A2,0xA20002D3,0xFD380124,0xFB44020C,0x14802D4,0xFD280035,0xED200001,0xD7200001,0xD1240006,0xC3200002,0xFB2C0109,0xFF18001A,0xCB1800A2,0xB4FC0002,0x53FC02D3,0x15800A4,0x15800A4,0x15800A4,0x15800A4,0xDB480000,0xDB480000,0xDB480000,0xBF480000,0xBF480000,0xB5480001,0x7FC00A2,0x7FC00A2,0x7FC00A2,0xC5280001,0xC5280001, -0xB5380001,0x85FC00A2,0x85FC00A2,0xB5000001,0xAC0000A2,0x7FC00A2,0x7FC00A2,0x7FC00A2,0xC5280001,0xC5280001,0xB5380001,0x85FC00A2,0x85FC00A2,0xB5000001,0xAC0000A2,0x85FC00A2,0x85FC00A2,0xB5000001,0xAC0000A2,0xAC0000A2,0xF748003D,0xF3540055,0x15800A4,0xFD30000D,0xEB240000,0xD5240000,0xCB2C0001,0xC51C0000,0xFF340041,0xFD200008,0x65FC00A2,0xB5000001, -0x65FC00A2,0x18000C8,0xF36C0000,0xD36C0001,0xC96C0001,0x41FC00C8,0xDB480001,0xC9580001,0xA1FC00C8,0xC91C0001,0xBE0000CA,0x41FC00C8,0xDB480001,0xC9580001,0xA1FC00C8,0xC91C0001,0xBE0000CA,0xA1FC00C8,0xC91C0001,0xBE0000CA,0xBE0000CA,0x41FC00C8,0xDB480001,0xC9580001,0xA1FC00C8,0xC91C0001,0xBE0000CA,0xA1FC00C8,0xC91C0001,0xBE0000CA,0xBE0000CA,0xA1FC00C8, -0xC91C0001,0xBE0000CA,0xBE0000CA,0xBE0000CA,0xF7680071,0x59800C8,0xF77C0080,0xFF400012,0xF11C0001,0xD91C0000,0xC9340001,0xC9000001,0xFD580064,0xFF2C000D,0xCD380000,0xBE0000CA,0x89FC00C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0x13400C8,0xCD200000,0xCD200000,0xCD200000,0xCD200000,0xCD200000, 
-0xCD200000,0xAD200001,0xAD200001,0xAD200001,0xA3200001,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0xB4FC0001,0xB4FC0001,0xB4FC0001,0xA30C0001,0x69FC00C8,0x69FC00C8,0x69FC00C8,0xA2D00001,0x980000CA,0xFF2C0055,0x13400C8,0x13400C8,0xFB24000D,0xF1200000,0xDB200000,0xDB200000,0xC1200000,0xFD1C003D,0xFF140005,0xB51C0001,0xB4FC0001, -0x41FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table86[] = { -0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x67F80000, -0x67F80000,0x67F80000,0x67F80000,0x96000001,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1440000,0x1440000,0x1440000,0x1C40000,0x3FF80000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000, -0x3FC0000,0x85F80000,0x85F80000,0x85F80000,0xAA000001,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x85F80000,0x85F80000,0x85F80000,0xAA000001,0x85F80000,0x85F80000,0x85F80000,0xAA000001,0xAA000001,0xF6C0000,0x1580000,0x1580000,0x1900000,0x1B00000,0x1D40000,0x1D40000,0x33FC0000,0x1900000,0x1B00000,0x63FC0000,0x85F80000, -0x63FC0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x9FF80000,0x9FF80000,0xBC000001,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x9FF80000,0x9FF80000,0xBC000001,0x9FF80000,0x9FF80000,0xBC000001,0xBC000001,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x9FF80000,0x9FF80000,0xBC000001,0x9FF80000,0x9FF80000,0xBC000001,0xBC000001,0x9FF80000, -0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0x5B80000,0x3940000,0x17C0000,0xDFC0000,0x61FC0000,0x85FC0000,0x93F80000,0xA7F80000,0x1DC0000,0x3BFC0000,0x85FC0000,0xBC000001,0x85FC0000,0x1A40000,0x77FC0000,0xBDF80000,0xD0000001,0x77FC0000,0xBDF80000,0xD0000001,0xBDF80000,0xD0000001,0xD0000001,0x77FC0000,0xBDF80000,0xD0000001,0xBDF80000,0xD0000001, -0xD0000001,0xBDF80000,0xD0000001,0xD0000001,0xD0000001,0x77FC0000,0xBDF80000,0xD0000001,0xBDF80000,0xD0000001,0xD0000001,0xBDF80000,0xD0000001,0xD0000001,0xD0000001,0xBDF80000,0xD0000001,0xD0000001,0xD0000001,0xD0000001,0x23FC0000,0x1C00000,0x1C00000,0x91FC0000,0xB5F80000,0xC7F80000,0xD0000001,0xD0000001,0x57FC0000,0xA1FC0000,0xCFF00000,0xD0000001, -0xABFC0000,0x1680738,0xFF5C0339,0xE55802D4,0xD15802D5,0xFF4801E0,0xED4400D5,0xD34C013A,0xDB440190,0xCF4400D5,0xC7440192,0xFD3802F8,0xEF3000A2,0xD73C00FF,0xDF3000D1,0xD1300002,0xC73400D7,0xD13002D4,0xCD2800FD,0xC528013B,0xBF3002D3,0x1FFC0734,0xF70C02D3,0xD13002D4,0xE9000192,0xD31800CF,0xC7200192,0xE2E802D3,0xD0F800A3,0xC6F800D7,0xBF0802D4,0x91FC0734, -0xD0B802D4,0xC6AC0192,0xBE9002D3,0xB4000734,0xFF580386,0xFB6405D2,0xFD680625,0xFF4000D3,0xF9300001,0xDF300002,0xD3340012,0xCF300011,0xFD4C036A,0xFF300099,0xD52400A9,0xC6F800D7,0x75FC0734,0x17C02D4,0xFF6C00B4,0xDF6C00A4,0xD16800A5,0xFD5800CA,0xE3580001,0xD35C0015,0xD55800C8,0xCF540026,0xC75800CA,0x3DFC02D3,0xED3400A2,0xD15000A3,0xE12800C9,0xD1300002, -0xC93C00CA,0xA1F802D3,0xD0F800A3,0xC6E800CA,0xBE0002D3,0x3DFC02D3,0xED3400A2,0xD15000A3,0xE12800C9,0xD1300002,0xC93C00CA,0xA1F802D3,0xD0F800A3,0xC6E800CA,0xBE0002D3,0xA1F802D3,0xD0F800A3,0xC6E800CA,0xBE0002D3,0xBE0002D3,0xFF64017A,0xF5780248,0xF77C0244,0xFF48006E,0xF9300001,0xDF300002,0xD33C0002,0xD1240005,0xFF5C016D,0xFD3C004B,0xD71C00A2,0xC6E800CA, 
-0x87FC02D3,0x15802D4,0x15802D4,0x15802D4,0x15802D4,0xF74400C8,0xF74400C8,0xF74400C8,0xCB4400C8,0xCB4400C8,0xBD4400C9,0xEF3000A2,0xEF3000A2,0xEF3000A2,0xD1300001,0xD1300001,0xC1340026,0xC33000A2,0xC33000A2,0xBD2C0015,0xB53000A2,0x1FC02D3,0x1FC02D3,0x1FC02D3,0xD71000C9,0xD71000C9,0xBF2800C9,0xD0F800A2,0xD0F800A2,0xBD0C0002,0xB51400A2,0x83F802D3, -0x83F802D3,0xBCD400C9,0xB4B800A2,0xAA0002D3,0xFD48013D,0xF3540229,0x15802D4,0xFF3C0048,0xF5300001,0xDF300001,0xD9340006,0xCB300002,0xFF3C0120,0xFD2C0035,0xD32800A2,0xBD0C0002,0x61FC02D3,0x16800A4,0x16800A4,0x16800A4,0x16800A4,0xE3580000,0xE3580000,0xE3580000,0xC7580000,0xC7580000,0xBD580001,0x1FFC00A2,0x1FFC00A2,0x1FFC00A2,0xCD380001,0xCD380001, -0xBD480001,0x91FC00A2,0x91FC00A2,0xBD100001,0xB40000A2,0x1FFC00A2,0x1FFC00A2,0x1FFC00A2,0xCD380001,0xCD380001,0xBD480001,0x91FC00A2,0x91FC00A2,0xBD100001,0xB40000A2,0x91FC00A2,0x91FC00A2,0xBD100001,0xB40000A2,0xB40000A2,0xFF58003D,0xFB640055,0x16800A4,0xFF440012,0xF3340000,0xDD340000,0xD33C0001,0xCD2C0000,0xFD4C0048,0xFF300011,0x75FC00A2,0xBD100001, -0x75FC00A2,0x19000C8,0xFB7C0000,0xDB7C0001,0xD17C0001,0x59FC00C8,0xE3580001,0xD1680001,0xADFC00C8,0xD12C0001,0xC60000CA,0x59FC00C8,0xE3580001,0xD1680001,0xADFC00C8,0xD12C0001,0xC60000CA,0xADFC00C8,0xD12C0001,0xC60000CA,0xC60000CA,0x59FC00C8,0xE3580001,0xD1680001,0xADFC00C8,0xD12C0001,0xC60000CA,0xADFC00C8,0xD12C0001,0xC60000CA,0xC60000CA,0xADFC00C8, -0xD12C0001,0xC60000CA,0xC60000CA,0xC60000CA,0xFF780071,0xDA800C8,0xFF8C0080,0xFF580020,0xF92C0001,0xE12C0000,0xD1440001,0xD1100001,0xFF700071,0xFD480019,0xD5480000,0xC60000CA,0x99FC00C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0x14400C8,0xD5300000,0xD5300000,0xD5300000,0xD5300000,0xD5300000, -0xD5300000,0xB5300001,0xB5300001,0xB5300001,0xAB300001,0x1E000C8,0x1E000C8,0x1E000C8,0x1E000C8,0x1E000C8,0x1E000C8,0xBD0C0001,0xBD0C0001,0xBD0C0001,0xAB1C0001,0x75FC00C8,0x75FC00C8,0x75FC00C8,0xAAE00001,0xA00000CA,0xF73C0064,0x14400C8,0x14400C8,0xFB340014,0xF9300000,0xE3300000,0xE3300000,0xC9300000,0xF9300048,0xF928000D,0xBD2C0001,0xBD0C0001, -0x51FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table87[] = { -0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x73F80000, -0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x3540000,0x3540000,0x3540000,0x1DC0000,0x4DFC0000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000, -0x1BFC0000,0x91F80000,0x91F80000,0x91F80000,0xB2000001,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x91F80000,0x91F80000,0x91F80000,0xB2000001,0x91F80000,0x91F80000,0x91F80000,0xB2000001,0xB2000001,0x1800000,0x1680000,0x1680000,0x7A00000,0x1C40000,0x3E80000,0x3E80000,0x47FC0000,0x7A00000,0x1C40000,0x73FC0000,0x91F80000, -0x73FC0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xC4000001,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xC4000001,0xABF80000,0xABF80000,0xC4000001,0xC4000001,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xC4000001,0xABF80000,0xABF80000,0xC4000001,0xC4000001,0xABF80000, 
-0xABF80000,0xC4000001,0xC4000001,0xC4000001,0x1CC0000,0xBA40000,0x18C0000,0x2BFC0000,0x73FC0000,0x95FC0000,0x9FFC0000,0xB3F40000,0x1F00000,0x53FC0000,0x95FC0000,0xC4000001,0x95FC0000,0x1B40000,0x8FFC0000,0xC9F80000,0xD8000001,0x8FFC0000,0xC9F80000,0xD8000001,0xC9F80000,0xD8000001,0xD8000001,0x8FFC0000,0xC9F80000,0xD8000001,0xC9F80000,0xD8000001, -0xD8000001,0xC9F80000,0xD8000001,0xD8000001,0xD8000001,0x8FFC0000,0xC9F80000,0xD8000001,0xC9F80000,0xD8000001,0xD8000001,0xC9F80000,0xD8000001,0xD8000001,0xD8000001,0xC9F80000,0xD8000001,0xD8000001,0xD8000001,0xD8000001,0x4BFC0000,0x1D00000,0x1D00000,0xA5FC0000,0xC1FC0000,0xD1F80000,0xD8000001,0xD8000001,0x75FC0000,0xB1FC0000,0xD9C40000,0xD8000001, -0xBBFC0000,0x1780738,0xFF6C0378,0xED6802D4,0xD96802D5,0xFF5C0212,0xF55400D5,0xDB5C013A,0xE3540190,0xD75400D5,0xCF540192,0xFF50031B,0xF74000A2,0xDF4C00FF,0xE74000D1,0xD9400002,0xCF4400D7,0xD94002D4,0xD53800FD,0xCD38013B,0xC74002D3,0x37FC0734,0xFF1C02D3,0xD94002D4,0xF1100192,0xDB2800CF,0xCF300192,0xEAF802D3,0xD90800A3,0xCF0800D7,0xC71802D4,0x9DFC0734, -0xD8C802D4,0xCEBC0192,0xC6A002D3,0xBC000734,0xFF6403DE,0xF3740618,0xF5780658,0xFF54012A,0xFF400006,0xE7400002,0xDB440012,0xD7400011,0xFF5C03A6,0xFF4400EA,0xDD3400A9,0xCF0800D7,0x83FC0734,0x18C02D4,0xFF8000CD,0xE77C00A4,0xD97800A5,0xFF6C00D4,0xEB680001,0xDB6C0015,0xDD6800C8,0xD7640026,0xCF6800CA,0x55FC02D3,0xF54400A2,0xD96000A3,0xE93800C9,0xD9400002, -0xD14C00CA,0xADF802D3,0xD90800A3,0xCEF800CA,0xC60002D3,0x55FC02D3,0xF54400A2,0xD96000A3,0xE93800C9,0xD9400002,0xD14C00CA,0xADF802D3,0xD90800A3,0xCEF800CA,0xC60002D3,0xADF802D3,0xD90800A3,0xCEF800CA,0xC60002D3,0xC60002D3,0xFF780191,0xFD880248,0xFF8C0244,0xFF5C0096,0xFD440005,0xE7400002,0xDB4C0002,0xD9340005,0xFF7001A5,0xFF540071,0xDF2C00A2,0xCEF800CA, -0x97FC02D3,0x16802D4,0x16802D4,0x16802D4,0x16802D4,0xFF5400C8,0xFF5400C8,0xFF5400C8,0xD35400C8,0xD35400C8,0xC55400C9,0xF74000A2,0xF74000A2,0xF74000A2,0xD9400001,0xD9400001,0xC9440026,0xCB4000A2,0xCB4000A2,0xC53C0015,0xBD4000A2,0x19FC02D3,0x19FC02D3,0x19FC02D3,0xDF2000C9,0xDF2000C9,0xC73800C9,0xD90800A2,0xD90800A2,0xC51C0002,0xBD2400A2,0x8FF802D3, -0x8FF802D3,0xC4E400C9,0xBCC800A2,0xB20002D3,0xFF58015D,0xFB640229,0x16802D4,0xFF4C0065,0xFD400001,0xE7400001,0xE1440006,0xD3400002,0xFF500139,0xFF3C0054,0xDB3800A2,0xC51C0002,0x71FC02D3,0x17800A4,0x17800A4,0x17800A4,0x17800A4,0xEB680000,0xEB680000,0xEB680000,0xCF680000,0xCF680000,0xC5680001,0x37FC00A2,0x37FC00A2,0x37FC00A2,0xD5480001,0xD5480001, -0xC5580001,0x9DFC00A2,0x9DFC00A2,0xC5200001,0xBC0000A2,0x37FC00A2,0x37FC00A2,0x37FC00A2,0xD5480001,0xD5480001,0xC5580001,0x9DFC00A2,0x9DFC00A2,0xC5200001,0xBC0000A2,0x9DFC00A2,0x9DFC00A2,0xC5200001,0xBC0000A2,0xBC0000A2,0xFB6C0048,0xF3740064,0x17800A4,0xFD580019,0xFB440000,0xE5440000,0xDB4C0001,0xD53C0000,0xF5640055,0xFF480014,0x83FC00A2,0xC5200001, -0x83FC00A2,0x1A000C8,0xFF8C0001,0xE38C0001,0xD98C0001,0x71FC00C8,0xEB680001,0xD9780001,0xB9FC00C8,0xD93C0001,0xCE0000CA,0x71FC00C8,0xEB680001,0xD9780001,0xB9FC00C8,0xD93C0001,0xCE0000CA,0xB9FC00C8,0xD93C0001,0xCE0000CA,0xCE0000CA,0x71FC00C8,0xEB680001,0xD9780001,0xB9FC00C8,0xD93C0001,0xCE0000CA,0xB9FC00C8,0xD93C0001,0xCE0000CA,0xCE0000CA,0xB9FC00C8, -0xD93C0001,0xCE0000CA,0xCE0000CA,0xCE0000CA,0xFF8C0080,0x1BC00C8,0xF79C0091,0xFF6C002D,0xFB480005,0xE93C0000,0xD9540001,0xD9200001,0xF7880080,0xFF600029,0xDD580000,0xCE0000CA,0xA7FC00C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0x15400C8,0xDD400000,0xDD400000,0xDD400000,0xDD400000,0xDD400000, 
-0xDD400000,0xBD400001,0xBD400001,0xBD400001,0xB3400001,0x1F800C8,0x1F800C8,0x1F800C8,0x1F800C8,0x1F800C8,0x1F800C8,0xC51C0001,0xC51C0001,0xC51C0001,0xB32C0001,0x81FC00C8,0x81FC00C8,0x81FC00C8,0xB2F00001,0xA80000CA,0xFF4C0064,0x15400C8,0x15400C8,0xFD480019,0xFD400001,0xEB400000,0xEB400000,0xD1400000,0xFF3C0050,0xFD380014,0xC53C0001,0xC51C0001, -0x5FFC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table88[] = { -0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x3F40000,0x7FFC0000, -0x7FFC0000,0x7FFC0000,0x7FFC0000,0xA8000000,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1500001,0x1680000,0x1680000,0x1680000,0x3F40000,0x5FF80000,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000, -0x37FC0000,0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0xBC000000,0x1940000,0x1780001,0x1780001,0x1B80000,0x3D80000,0x9FC0000,0x9FC0000,0x5DFC0000,0x1B80000,0x3D80000,0x83FC0000,0x9DFC0000, -0x83FC0000,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xCE000000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xCE000000,0xB9F80000,0xB9F80000,0xCE000000,0xCE000000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xCE000000,0xB9F80000,0xB9F80000,0xCE000000,0xCE000000,0xB9F80000, -0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0x3E00000,0x5B80000,0x19C0001,0x4DFC0000,0x8BFC0000,0xA5FC0000,0xAFFC0000,0xBFF40000,0x15FC0000,0x6FFC0000,0xA5FC0000,0xCE000000,0xA5FC0000,0x1C40001,0xABFC0000,0xD7F80000,0xE2000000,0xABFC0000,0xD7F80000,0xE2000000,0xD7F80000,0xE2000000,0xE2000000,0xABFC0000,0xD7F80000,0xE2000000,0xD7F80000,0xE2000000, -0xE2000000,0xD7F80000,0xE2000000,0xE2000000,0xE2000000,0xABFC0000,0xD7F80000,0xE2000000,0xD7F80000,0xE2000000,0xE2000000,0xD7F80000,0xE2000000,0xE2000000,0xE2000000,0xD7F80000,0xE2000000,0xE2000000,0xE2000000,0xE2000000,0x77FC0000,0x1E40000,0x1E40000,0xBBFC0000,0xD1FC0000,0xDDF40000,0xE2000000,0xE2000000,0x97FC0000,0xC5FC0000,0xE1F40000,0xE2000000, -0xCBFC0000,0x18C0734,0xFF8003BF,0xF57802D4,0xE37802D3,0xFF740272,0xFD6800D7,0xE76C013B,0xEB680192,0xE16400D7,0xD9640192,0xFF680361,0xFF5400A3,0xE75C00FD,0xEF5000CF,0xE3540002,0xD95800D5,0xE35002D4,0xDD4800FF,0xD550013A,0xCF5002D5,0x53FC0734,0xFF3802E3,0xE35402D4,0xFB200192,0xE33800D1,0xD9400190,0xF50802D3,0xE31800A2,0xD91C00D5,0xCF2C02D4,0xABF80734, -0xE2D802D3,0xD8C80190,0xCEB402D4,0xC4000738,0xFF780447,0xFD880614,0xFD88065C,0xFF6C01B7,0xFF58002E,0xF1540002,0xE3580011,0xE14C0012,0xFF700413,0xFF5C016A,0xE54800A9,0xD91C00D5,0x95FC0734,0x1A002D3,0xFF9000FB,0xF18C00A2,0xE38C00A2,0xFF8000F1,0xF37C0002,0xE57C0015,0xE77800C9,0xE1740026,0xD97800C9,0x71FC02D3,0xFF5400A2,0xE37000A2,0xF34800C9,0xE3540001, -0xD96000C8,0xB9FC02D3,0xE31400A2,0xD90800C8,0xCE0002D4,0x71FC02D3,0xFF5400A2,0xE37000A2,0xF34800C9,0xE3540001,0xD96000C8,0xB9FC02D3,0xE31400A2,0xD90800C8,0xCE0002D4,0xB9FC02D3,0xE31400A2,0xD90800C8,0xCE0002D4,0xCE0002D4,0xFD9001C4,0xF79C0269,0xF79C0266,0xFF7800D1,0xFF5C001A,0xF1540001,0xE3600002,0xE1440006,0xFF8801C3,0xFF7000B5,0xE74000A2,0xD90800C8, 
-0xA7FC02D3,0x17802D3,0x17802D3,0x17802D3,0x17802D3,0xFF6800CE,0xFF6800CE,0xFF6800CE,0xDD6400CA,0xDD6400CA,0xCF6400CA,0xFF5400A3,0xFF5400A3,0xFF5400A3,0xE1540002,0xE1540002,0xD1580026,0xD35400A3,0xD35400A3,0xCD500015,0xC55000A5,0x35FC02D3,0x35FC02D3,0x35FC02D3,0xE73400C9,0xE73400C9,0xCF4C00C8,0xE11C00A2,0xE11C00A2,0xCF2C0001,0xC53800A4,0x9DF402D3, -0x9DF402D3,0xCEF400C8,0xC4E000A4,0xBC0002D4,0xFF680189,0xF374024B,0x17802D3,0xFF600099,0xFF540009,0xEF540002,0xE9540005,0xDD500002,0xFF64016D,0xFF540079,0xE34C00A3,0xCF2C0001,0x81FC02D3,0x18C00A2,0x18C00A2,0x18C00A2,0x18C00A2,0xF17C0001,0xF17C0001,0xF17C0001,0xD77C0001,0xD77C0001,0xCF780001,0x53FC00A2,0x53FC00A2,0x53FC00A2,0xDD5C0001,0xDD5C0001, -0xCF680000,0xABF800A2,0xABF800A2,0xCF300000,0xC40000A4,0x53FC00A2,0x53FC00A2,0x53FC00A2,0xDD5C0001,0xDD5C0001,0xCF680000,0xABF800A2,0xABF800A2,0xCF300000,0xC40000A4,0xABF800A2,0xABF800A2,0xCF300000,0xC40000A4,0xC40000A4,0xF7800055,0xFD880062,0x18C00A2,0xFB700029,0xFF580001,0xED580000,0xE55C0000,0xDD500001,0xFD740055,0xFD640020,0x95FC00A2,0xCF300000, -0x95FC00A2,0x1B000CA,0xFFA4000D,0xED9C0001,0xE39C0001,0x8DFC00C8,0xF37C0001,0xE3880001,0xC7FC00C8,0xE34C0000,0xD80000C8,0x8DFC00C8,0xF37C0001,0xE3880001,0xC7FC00C8,0xE34C0000,0xD80000C8,0xC7FC00C8,0xE34C0000,0xD80000C8,0xD80000C8,0x8DFC00C8,0xF37C0001,0xE3880001,0xC7FC00C8,0xE34C0000,0xD80000C8,0xC7FC00C8,0xE34C0000,0xD80000C8,0xD80000C8,0xC7FC00C8, -0xE34C0000,0xD80000C8,0xD80000C8,0xD80000C8,0xF7A40091,0xFCC00C8,0xFFAC0095,0xFF8C0048,0xFF64000D,0xF34C0000,0xE3640000,0xE32C0000,0xFF980082,0xFD80003D,0xE56C0001,0xD80000C8,0xB9FC00C8,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0x16400CA,0xE3540001,0xE3540001,0xE3540001,0xE3540001,0xE3540001, -0xE3540001,0xC7500001,0xC7500001,0xC7500001,0xBD500001,0x19FC00C8,0x19FC00C8,0x19FC00C8,0x19FC00C8,0x19FC00C8,0x19FC00C8,0xCD300001,0xCD300001,0xCD300001,0xBD3C0001,0x8FF800C8,0x8FF800C8,0x8FF800C8,0xBD000000,0xB20000C8,0xF9600071,0x16400CA,0x16400CA,0xFF580022,0xFD540005,0xF1540001,0xF1540001,0xD9540001,0xFD540055,0xFF500019,0xD14C0000,0xCD300001, -0x71FC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table89[] = { -0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x13FC0000,0x8BFC0000, -0x8BFC0000,0x8BFC0000,0x8BFC0000,0xB0000000,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x1600001,0x5780000,0x5780000,0x5780000,0x13FC0000,0x6DFC0000,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000, -0x4FFC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0xC4000000,0x1A40000,0x1880001,0x1880001,0x5C80000,0x3EC0000,0x27FC0000,0x27FC0000,0x71FC0000,0x5C80000,0x3EC0000,0x93FC0000,0xA9FC0000, -0x93FC0000,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x87FC0000,0x87FC0000,0x87FC0000,0xC5F80000,0xC5F80000,0xD6000000,0x87FC0000,0x87FC0000,0x87FC0000,0xC5F80000,0xC5F80000,0xD6000000,0xC5F80000,0xC5F80000,0xD6000000,0xD6000000,0x87FC0000,0x87FC0000,0x87FC0000,0xC5F80000,0xC5F80000,0xD6000000,0xC5F80000,0xC5F80000,0xD6000000,0xD6000000,0xC5F80000, 
-0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0x1F40000,0xDC80000,0x1AC0001,0x6BFC0000,0x9DFC0000,0xB5FC0000,0xBDF80000,0xC9F80000,0x3DFC0000,0x87FC0000,0xB5FC0000,0xD6000000,0xB5FC0000,0x1D40001,0xC3FC0000,0xE1FC0000,0xEA000000,0xC3FC0000,0xE1FC0000,0xEA000000,0xE1FC0000,0xEA000000,0xEA000000,0xC3FC0000,0xE1FC0000,0xEA000000,0xE1FC0000,0xEA000000, -0xEA000000,0xE1FC0000,0xEA000000,0xEA000000,0xEA000000,0xC3FC0000,0xE1FC0000,0xEA000000,0xE1FC0000,0xEA000000,0xEA000000,0xE1FC0000,0xEA000000,0xEA000000,0xEA000000,0xE1FC0000,0xEA000000,0xEA000000,0xEA000000,0xEA000000,0x9FFC0000,0x3F40000,0x3F40000,0xCFFC0000,0xDFF80000,0xE7F40000,0xEA000000,0xEA000000,0xB5FC0000,0xD5FC0000,0xEBC80000,0xEA000000, -0xDBFC0000,0x19C0734,0xFF90041C,0xFD8802D4,0xEB8802D3,0xFF8002E2,0xFF7800E7,0xEF7C013B,0xF3780192,0xE97400D7,0xE1740192,0xFF7403A9,0xFF6800C6,0xEF6C00FD,0xF76000CF,0xEB640002,0xE16800D5,0xEB6002D4,0xE55800FF,0xDD60013A,0xD76002D5,0x6BFC0734,0xFF580319,0xEB6402D4,0xFF38019A,0xEB4800D1,0xE1500190,0xFD1802D3,0xEB2800A2,0xE12C00D5,0xD73C02D4,0xB7F80734, -0xEAE802D3,0xE0D80190,0xD6C402D4,0xCC000738,0xFF8C0494,0xF598065A,0xF79C068F,0xFF800236,0xFF6C0086,0xF9640002,0xEB680011,0xE95C0012,0xFF800481,0xFF7401F7,0xED5800A9,0xE12C00D5,0xA3FC0734,0x1B002D3,0xFFA40126,0xF99C00A2,0xEB9C00A2,0xFF980119,0xFB8C0002,0xED8C0015,0xEF8800C9,0xE9840026,0xE18800C9,0x89FC02D3,0xFF7000AD,0xEB8000A2,0xFB5800C9,0xEB640001, -0xE17000C8,0xC5FC02D3,0xEB2400A2,0xE11800C8,0xD60002D4,0x89FC02D3,0xFF7000AD,0xEB8000A2,0xFB5800C9,0xEB640001,0xE17000C8,0xC5FC02D3,0xEB2400A2,0xE11800C8,0xD60002D4,0xC5FC02D3,0xEB2400A2,0xE11800C8,0xD60002D4,0xD60002D4,0xFFA001E5,0xFFAC0269,0xFFAC0266,0xFF94010A,0xFF7C0049,0xF9640001,0xEB700002,0xE9540006,0xFF9801E6,0xFF8400E2,0xEF5000A2,0xE11800C8, -0xB7FC02D3,0x18802D3,0x18802D3,0x18802D3,0x18802D3,0xFF7800E3,0xFF7800E3,0xFF7800E3,0xE57400CA,0xE57400CA,0xD77400CA,0xFF6800AD,0xFF6800AD,0xFF6800AD,0xE9640002,0xE9640002,0xD9680026,0xDB6400A3,0xDB6400A3,0xD5600015,0xCD6000A5,0x4DFC02D3,0x4DFC02D3,0x4DFC02D3,0xEF4400C9,0xEF4400C9,0xD75C00C8,0xE92C00A2,0xE92C00A2,0xD73C0001,0xCD4800A4,0xA7FC02D3, -0xA7FC02D3,0xD70400C8,0xCCF000A4,0xC40002D4,0xFD8001A6,0xFB84024B,0x18802D3,0xFF7000CA,0xFF6C0022,0xF7640002,0xF1640005,0xE5600002,0xFF78018A,0xFF6800A8,0xEB5C00A3,0xD73C0001,0x91FC02D3,0x19C00A2,0x19C00A2,0x19C00A2,0x19C00A2,0xF98C0001,0xF98C0001,0xF98C0001,0xDF8C0001,0xDF8C0001,0xD7880001,0x6BFC00A2,0x6BFC00A2,0x6BFC00A2,0xE56C0001,0xE56C0001, -0xD7780000,0xB7F800A2,0xB7F800A2,0xD7400000,0xCC0000A4,0x6BFC00A2,0x6BFC00A2,0x6BFC00A2,0xE56C0001,0xE56C0001,0xD7780000,0xB7F800A2,0xB7F800A2,0xD7400000,0xCC0000A4,0xB7F800A2,0xB7F800A2,0xD7400000,0xCC0000A4,0xCC0000A4,0xFF900055,0xF5980071,0x19C00A2,0xFD840032,0xFF700005,0xF5680000,0xED6C0000,0xE5600001,0xFD880062,0xFF74002D,0xA3FC00A2,0xD7400000, -0xA3FC00A2,0x1C000CA,0xFFB40022,0xF5AC0001,0xEBAC0001,0xA5FC00C8,0xFB8C0001,0xEB980001,0xD3FC00C8,0xEB5C0000,0xE00000C8,0xA5FC00C8,0xFB8C0001,0xEB980001,0xD3FC00C8,0xEB5C0000,0xE00000C8,0xD3FC00C8,0xEB5C0000,0xE00000C8,0xE00000C8,0xA5FC00C8,0xFB8C0001,0xEB980001,0xD3FC00C8,0xEB5C0000,0xE00000C8,0xD3FC00C8,0xEB5C0000,0xE00000C8,0xE00000C8,0xD3FC00C8, -0xEB5C0000,0xE00000C8,0xE00000C8,0xE00000C8,0xFFB40091,0x1E000C8,0xF9C000A2,0xFFA00059,0xFF7C0025,0xFB5C0000,0xEB740000,0xEB3C0000,0xFFAC0095,0xFF940050,0xED7C0001,0xE00000C8,0xC7FC00C8,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0x17400CA,0xEB640001,0xEB640001,0xEB640001,0xEB640001,0xEB640001, 
-0xEB640001,0xCF600001,0xCF600001,0xCF600001,0xC5600001,0x31FC00C8,0x31FC00C8,0x31FC00C8,0x31FC00C8,0x31FC00C8,0x31FC00C8,0xD5400001,0xD5400001,0xD5400001,0xC54C0001,0x9BF800C8,0x9BF800C8,0x9BF800C8,0xC5100000,0xBA0000C8,0xFF6C007D,0x17400CA,0x17400CA,0xFF68002D,0xFF64000A,0xF9640001,0xF9640001,0xE1640001,0xF9680062,0xFD600029,0xD95C0000,0xD5400001, -0x7FFC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table90[] = { -0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x97FC0000, -0x97FC0000,0x97FC0000,0x97FC0000,0xB8000000,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0xD880000,0xD880000,0xD880000,0x2BFC0000,0x7DF80000,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000, -0x69FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0xCC000000,0x3B40000,0x1980001,0x1980001,0x1DC0000,0xBFC0000,0x45FC0000,0x45FC0000,0x85FC0000,0x1DC0000,0xBFC0000,0xA1FC0000,0xB5FC0000, -0xA1FC0000,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x9FFC0000,0x9FFC0000,0x9FFC0000,0xD1F80000,0xD1F80000,0xDE000000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0xD1F80000,0xD1F80000,0xDE000000,0xD1F80000,0xD1F80000,0xDE000000,0xDE000000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0xD1F80000,0xD1F80000,0xDE000000,0xD1F80000,0xD1F80000,0xDE000000,0xDE000000,0xD1F80000, -0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0x1FFC0000,0x1DC0000,0x1BC0001,0x89FC0000,0xB1FC0000,0xC3FC0000,0xC9FC0000,0xD5F40000,0x63FC0000,0x9FFC0000,0xC3FC0000,0xDE000000,0xC3FC0000,0x1E40001,0xDBFC0000,0xEDFC0000,0xF2000000,0xDBFC0000,0xEDFC0000,0xF2000000,0xEDFC0000,0xF2000000,0xF2000000,0xDBFC0000,0xEDFC0000,0xF2000000,0xEDFC0000,0xF2000000, -0xF2000000,0xEDFC0000,0xF2000000,0xF2000000,0xF2000000,0xDBFC0000,0xEDFC0000,0xF2000000,0xEDFC0000,0xF2000000,0xF2000000,0xEDFC0000,0xF2000000,0xF2000000,0xF2000000,0xEDFC0000,0xF2000000,0xF2000000,0xF2000000,0xF2000000,0xC7FC0000,0x47FC0000,0x47FC0000,0xE3FC0000,0xEBFC0000,0xF1F40000,0xF2000000,0xF2000000,0xD3FC0000,0xE7FC0000,0xF3D80000,0xF2000000, -0xE9FC0000,0x1AC0734,0xFFA40477,0xFF9C02EB,0xF39802D3,0xFF980352,0xFF8C012F,0xF78C013B,0xFB880192,0xF18400D7,0xE9840192,0xFF8C0401,0xFF7C011E,0xF77C00FD,0xFF7000CF,0xF3740002,0xE97800D5,0xF37002D4,0xED6800FF,0xE570013A,0xDF7002D5,0x83FC0734,0xFF700361,0xF37402D4,0xFF5801E2,0xF35800D1,0xE9600190,0xFF3802DD,0xF33800A2,0xE93C00D5,0xDF4C02D4,0xC3F80734, -0xF2F802D3,0xE8E80190,0xDED402D4,0xD4000738,0xFFA004E6,0xFDA8065A,0xFFAC068F,0xFF9402CB,0xFF84010B,0xFF740006,0xF3780011,0xF16C0012,0xFF9804CA,0xFF84028B,0xF56800A9,0xE93C00D5,0xB3FC0734,0x1C002D3,0xFFB4016B,0xFFAC00A3,0xF3AC00A2,0xFFA8015B,0xFF9C0012,0xF59C0015,0xF79800C9,0xF1940026,0xE99800C9,0xA3FC02D3,0xFF8800D5,0xF39000A2,0xFF7000CE,0xF3740001, -0xE98000C8,0xD1FC02D3,0xF33400A2,0xE92800C8,0xDE0002D4,0xA3FC02D3,0xFF8800D5,0xF39000A2,0xFF7000CE,0xF3740001,0xE98000C8,0xD1FC02D3,0xF33400A2,0xE92800C8,0xDE0002D4,0xD1FC02D3,0xF33400A2,0xE92800C8,0xDE0002D4,0xDE0002D4,0xFFB40202,0xF7BC028B,0xF7BC028B,0xFFA4015B,0xFF900089,0xFF740005,0xF3800002,0xF1640006,0xFDB00222,0xFF980136,0xF76000A2,0xE92800C8, 
-0xC5FC02D3,0x19802D3,0x19802D3,0x19802D3,0x19802D3,0xFF8C00FE,0xFF8C00FE,0xFF8C00FE,0xED8400CA,0xED8400CA,0xDF8400CA,0xFF7800CB,0xFF7800CB,0xFF7800CB,0xF1740002,0xF1740002,0xE1780026,0xE37400A3,0xE37400A3,0xDD700015,0xD57000A5,0x65FC02D3,0x65FC02D3,0x65FC02D3,0xF75400C9,0xF75400C9,0xDF6C00C8,0xF13C00A2,0xF13C00A2,0xDF4C0001,0xD55800A4,0xB3FC02D3, -0xB3FC02D3,0xDF1400C8,0xD50000A4,0xCC0002D4,0xFF9001C6,0xF598026A,0x19802D3,0xFF8800F3,0xFF80004A,0xFF740002,0xF9740005,0xED700002,0xFD8801C3,0xFF8000DD,0xF36C00A3,0xDF4C0001,0x9FFC02D3,0x1AC00A2,0x1AC00A2,0x1AC00A2,0x1AC00A2,0xFF9C0002,0xFF9C0002,0xFF9C0002,0xE79C0001,0xE79C0001,0xDF980001,0x83FC00A2,0x83FC00A2,0x83FC00A2,0xED7C0001,0xED7C0001, -0xDF880000,0xC3F800A2,0xC3F800A2,0xDF500000,0xD40000A4,0x83FC00A2,0x83FC00A2,0x83FC00A2,0xED7C0001,0xED7C0001,0xDF880000,0xC3F800A2,0xC3F800A2,0xDF500000,0xD40000A4,0xC3F800A2,0xC3F800A2,0xDF500000,0xD40000A4,0xD40000A4,0xFFA00064,0xFDA80071,0x1AC00A2,0xFF98003D,0xFF840011,0xFD780000,0xF57C0000,0xED700001,0xF5A00071,0xFB90003D,0xB3FC00A2,0xDF500000, -0xB3FC00A2,0x1D000CA,0xFFC4003A,0xFDBC0001,0xF3BC0001,0xBDFC00C8,0xFFA00009,0xF3A80001,0xDFF800C8,0xF36C0000,0xE80000C8,0xBDFC00C8,0xFFA00009,0xF3A80001,0xDFF800C8,0xF36C0000,0xE80000C8,0xDFF800C8,0xF36C0000,0xE80000C8,0xE80000C8,0xBDFC00C8,0xFFA00009,0xF3A80001,0xDFF800C8,0xF36C0000,0xE80000C8,0xDFF800C8,0xF36C0000,0xE80000C8,0xE80000C8,0xDFF800C8, -0xF36C0000,0xE80000C8,0xE80000C8,0xE80000C8,0xFBC800A4,0x1F000C8,0xFFCC00AA,0xFDBC0071,0xFFA4003D,0xFF740004,0xF3840000,0xF34C0000,0xF9C800A2,0xFBB80071,0xF58C0001,0xE80000C8,0xD7FC00C8,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0x18400CA,0xF3740001,0xF3740001,0xF3740001,0xF3740001,0xF3740001, -0xF3740001,0xD7700001,0xD7700001,0xD7700001,0xCD700001,0x49FC00C8,0x49FC00C8,0x49FC00C8,0x49FC00C8,0x49FC00C8,0x49FC00C8,0xDD500001,0xDD500001,0xDD500001,0xCD5C0001,0xA7F800C8,0xA7F800C8,0xA7F800C8,0xCD200000,0xC20000C8,0xF9800082,0x18400CA,0x18400CA,0xFB7C003D,0xFF780012,0xFD740002,0xFD740002,0xE9740001,0xFF74006A,0xFD740032,0xE16C0000,0xDD500001, -0x8FFC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table91[] = { -0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0xA3FC0000, -0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x19C0000,0x19C0000,0x19C0000,0x43FC0000,0x8BFC0000,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000, -0x81FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0xD4000000,0xBC40000,0x1A80001,0x1A80001,0x1F00000,0x33FC0000,0x63FC0000,0x63FC0000,0x99FC0000,0x1F00000,0x33FC0000,0xB1FC0000,0xC1FC0000, -0xB1FC0000,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xE6000000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xE6000000,0xDDF40000,0xDDF40000,0xE6000000,0xE6000000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xE6000000,0xDDF40000,0xDDF40000,0xE6000000,0xE6000000,0xDDF40000, 
-0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0x57FC0000,0x1EC0000,0x1CC0001,0xA7FC0000,0xC5FC0000,0xD3FC0000,0xD7FC0000,0xDFF80000,0x8BFC0000,0xB7FC0000,0xD3FC0000,0xE6000000,0xD3FC0000,0x1F40001,0xF5FC0000,0xF9FC0000,0xFA000000,0xF5FC0000,0xF9FC0000,0xFA000000,0xF9FC0000,0xFA000000,0xFA000000,0xF5FC0000,0xF9FC0000,0xFA000000,0xF9FC0000,0xFA000000, -0xFA000000,0xF9FC0000,0xFA000000,0xFA000000,0xFA000000,0xF5FC0000,0xF9FC0000,0xFA000000,0xF9FC0000,0xFA000000,0xFA000000,0xF9FC0000,0xFA000000,0xFA000000,0xFA000000,0xF9FC0000,0xFA000000,0xFA000000,0xFA000000,0xFA000000,0xEDFC0000,0xC7FC0000,0xC7FC0000,0xF7FC0000,0xF9FC0000,0xFBF40000,0xFA000000,0xFA000000,0xF1FC0000,0xF7FC0000,0xFBE80000,0xFA000000, -0xF9FC0000,0x1BC0734,0xFFB004DF,0xFFAC032C,0xFBA802D3,0xFFA403F2,0xFFA001B3,0xFF9C013B,0xFF9401A4,0xF99400D7,0xF1940192,0xFFA40479,0xFF9401AE,0xFF8C00FD,0xFF880107,0xFB840002,0xF18800D5,0xFB8002D4,0xF57800FF,0xED80013A,0xE78002D5,0x9BFC0734,0xFF8803C9,0xFB8402D4,0xFF700252,0xFB6800D1,0xF1700190,0xFF58031F,0xFB4800A2,0xF14C00D5,0xE75C02D4,0xCFF80734, -0xFB0802D3,0xF0F80190,0xE6E402D4,0xDC000738,0xFFB40553,0xF5B806A4,0xF7BC06C4,0xFFA4038C,0xFF9801BF,0xFF88005A,0xFB880011,0xF97C0012,0xFDB00553,0xFF98033A,0xFD7800A9,0xF14C00D5,0xC1FC0734,0x1D002D3,0xFFC401A3,0xFFC000CE,0xFBBC00A2,0xFFBC0199,0xFFB4005A,0xFDAC0015,0xFFA800C9,0xF9A40026,0xF1A800C9,0xBBFC02D3,0xFFA0011D,0xFBA000A2,0xFF8800FE,0xFB840001, -0xF19000C8,0xDDFC02D3,0xFB4400A2,0xF13800C8,0xE60002D4,0xBBFC02D3,0xFFA0011D,0xFBA000A2,0xFF8800FE,0xFB840001,0xF19000C8,0xDDFC02D3,0xFB4400A2,0xF13800C8,0xE60002D4,0xDDFC02D3,0xFB4400A2,0xF13800C8,0xE60002D4,0xE60002D4,0xFBC80244,0xFFCC028B,0xFFCC028B,0xFFC001A6,0xFFA800ED,0xFF900039,0xFB900002,0xF9740006,0xFFC40243,0xFFB0019E,0xFF7000A2,0xF13800C8, -0xD5FC02D3,0x1A802D3,0x1A802D3,0x1A802D3,0x1A802D3,0xFF9C012B,0xFF9C012B,0xFF9C012B,0xF59400CA,0xF59400CA,0xE79400CA,0xFF8C00ED,0xFF8C00ED,0xFF8C00ED,0xF9840002,0xF9840002,0xE9880026,0xEB8400A3,0xEB8400A3,0xE5800015,0xDD8000A5,0x7DFC02D3,0x7DFC02D3,0x7DFC02D3,0xFF6400C9,0xFF6400C9,0xE77C00C8,0xF94C00A2,0xF94C00A2,0xE75C0001,0xDD6800A4,0xBFFC02D3, -0xBFFC02D3,0xE72400C8,0xDD1000A4,0xD40002D4,0xFFA001E5,0xFDA8026A,0x1A802D3,0xFF980126,0xFF940082,0xFF88001A,0xFF88000A,0xF5800002,0xFD9C01E1,0xFF900111,0xFB7C00A3,0xE75C0001,0xAFFC02D3,0x1BC00A2,0x1BC00A2,0x1BC00A2,0x1BC00A2,0xFFB0000D,0xFFB0000D,0xFFB0000D,0xEFAC0001,0xEFAC0001,0xE7A80001,0x9BFC00A2,0x9BFC00A2,0x9BFC00A2,0xF58C0001,0xF58C0001, -0xE7980000,0xCFF800A2,0xCFF800A2,0xE7600000,0xDC0000A4,0x9BFC00A2,0x9BFC00A2,0x9BFC00A2,0xF58C0001,0xF58C0001,0xE7980000,0xCFF800A2,0xCFF800A2,0xE7600000,0xDC0000A4,0xCFF800A2,0xCFF800A2,0xE7600000,0xDC0000A4,0xDC0000A4,0xFBB40071,0xF5B80082,0x1BC00A2,0xFBAC0055,0xFF9C0022,0xFF900008,0xFD8C0000,0xF5800001,0xFDB00071,0xFFA40048,0xC1FC00A2,0xE7600000, -0xC1FC00A2,0x1E000CA,0xFFDC0062,0xFFCC0011,0xFBCC0001,0xD5FC00C8,0xFFC0002D,0xFBB80001,0xEBF800C8,0xFB7C0000,0xF00000C8,0xD5FC00C8,0xFFC0002D,0xFBB80001,0xEBF800C8,0xFB7C0000,0xF00000C8,0xEBF800C8,0xFB7C0000,0xF00000C8,0xF00000C8,0xD5FC00C8,0xFFC0002D,0xFBB80001,0xEBF800C8,0xFB7C0000,0xF00000C8,0xEBF800C8,0xFB7C0000,0xF00000C8,0xF00000C8,0xEBF800C8, -0xFB7C0000,0xF00000C8,0xF00000C8,0xF00000C8,0xF3E000B5,0x27FC00C8,0xF9E000B5,0xFDD40091,0xFFBC0061,0xFFA40022,0xFB940000,0xFB5C0000,0xFDD800A4,0xFFCC0082,0xFD9C0001,0xF00000C8,0xE5FC00C8,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0x19400CA,0xFB840001,0xFB840001,0xFB840001,0xFB840001,0xFB840001, 
-0xFB840001,0xDF800001,0xDF800001,0xDF800001,0xD5800001,0x63FC00C8,0x63FC00C8,0x63FC00C8,0x63FC00C8,0x63FC00C8,0x63FC00C8,0xE5600001,0xE5600001,0xE5600001,0xD56C0001,0xB3F800C8,0xB3F800C8,0xB3F800C8,0xD5300000,0xCA0000C8,0xF3940091,0x19400CA,0x19400CA,0xFD8C004A,0xFD880022,0xFD84000A,0xFD84000A,0xF1840001,0xFD8C0071,0xF9880048,0xE97C0000,0xE5600001, -0x9FF800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table92[] = { -0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0xB1F80000, -0xB1F80000,0xB1F80000,0xB1F80000,0xC8000001,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0x1940000,0xFAC0000,0xFAC0000,0xFAC0000,0x5FFC0000,0x9DF80000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000, -0x9BFC0000,0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0xDC000001,0x5D80000,0x1BC0000,0x1BC0000,0x17FC0000,0x5FFC0000,0x85FC0000,0x85FC0000,0xAFFC0000,0x17FC0000,0x5FFC0000,0xC1FC0000,0xCFF80000, -0xC1FC0000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0xD3FC0000,0xD3FC0000,0xD3FC0000,0xE9FC0000,0xE9FC0000,0xEE000001,0xD3FC0000,0xD3FC0000,0xD3FC0000,0xE9FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xE9FC0000,0xEE000001,0xEE000001,0xD3FC0000,0xD3FC0000,0xD3FC0000,0xE9FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xE9FC0000,0xEE000001,0xEE000001,0xE9FC0000, -0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0x97FC0000,0x17FC0000,0x1E00000,0xC9FC0000,0xDBFC0000,0xE3FC0000,0xE7F80000,0xEBF80000,0xB7FC0000,0xD3FC0000,0xE3FC0000,0xEE000001,0xE3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1CC0625,0xFFC4047D,0xFFC0034C,0xFFBC02D4,0xFFBC03A5,0xFFB40204,0xFFB00161,0xFFAC017D,0xFFA800C9,0xF7A80139,0xFFB00412,0xFFAC01FB,0xFFA4012A,0xFFA00106,0xFF980009,0xF9980082,0xFF940225,0xFB9000C8,0xF59000C1,0xEF940222,0xB5FC0625,0xFFA003BC,0xFF9802D3,0xFF94024F,0xFF8400E9,0xF784013B,0xFF7C02BE,0xFF6400A5,0xF7640076,0xEF700222,0xDBF80625, -0xFF2C02D3,0xF7180139,0xEEFC0222,0xE4000627,0xFFC404D9,0xFDC8059D,0xFFCC05C5,0xFFB40375,0xFFA801FF,0xFFA400B5,0xFF9C0032,0xFF900004,0xFFBC04B6,0xFFB00336,0xFF9000B4,0xF7640076,0xD1FC0625,0x1DC0225,0xFFD40178,0xFFCC00E4,0xFFCC00A4,0xFFD00145,0xFFC8007E,0xFFC40020,0xFFC00081,0xFDB8000C,0xF7BC0071,0xCFFC0222,0xFFC00118,0xFFB800A2,0xFFAC00C6,0xFF9C0001, -0xF7A40071,0xE7FC0222,0xFF6C00A2,0xF7540071,0xEE000222,0xCFFC0222,0xFFC00118,0xFFB800A2,0xFFAC00C6,0xFF9C0001,0xF7A40071,0xE7FC0222,0xFF6C00A2,0xF7540071,0xEE000222,0xE7FC0222,0xFF6C00A2,0xF7540071,0xEE000222,0xEE000222,0xFFD801C3,0xF7DC0201,0xF7DC0204,0xFDD00171,0xFFC000F2,0xFFAC0059,0xFFA80005,0xFF900000,0xFDD801C5,0xFFC80146,0xFF9800AB,0xF7540071, 
-0xE1FC0222,0x1BC02D4,0x1BC02D4,0x1BC02D4,0x1BC02D4,0xFFB00161,0xFFB00161,0xFFB00161,0xFDA800C8,0xFDA800C8,0xEFA800C9,0xFFA4012A,0xFFA4012A,0xFFA4012A,0xFF980009,0xFF980009,0xF3980026,0xF59400A2,0xF59400A2,0xEF900015,0xE79400A2,0x99FC02D3,0x99FC02D3,0x99FC02D3,0xFF8400E9,0xFF8400E9,0xF18C00C9,0xFF6400A5,0xFF6400A5,0xEF700002,0xE77800A2,0xCDFC02D3, -0xCDFC02D3,0xEF3800C9,0xE71C00A2,0xDC0002D3,0xFBB40225,0xF5B8028C,0x1BC02D4,0xFFAC0175,0xFFA400D9,0xFF9C0059,0xFF9C0032,0xFD940002,0xFDB00201,0xFFA40163,0xFF9000AB,0xEF700002,0xBFFC02D3,0x1CC00A4,0x1CC00A4,0x1CC00A4,0x1CC00A4,0xFFC40020,0xFFC40020,0xFFC40020,0xF9BC0000,0xF9BC0000,0xEFBC0001,0xB7FC00A2,0xB7FC00A2,0xB7FC00A2,0xFF9C0001,0xFF9C0001, -0xEFAC0001,0xDDF400A2,0xDDF400A2,0xEF740001,0xE60000A2,0xB7FC00A2,0xB7FC00A2,0xB7FC00A2,0xFF9C0001,0xFF9C0001,0xEFAC0001,0xDDF400A2,0xDDF400A2,0xEF740001,0xE60000A2,0xDDF400A2,0xDDF400A2,0xEF740001,0xE60000A2,0xE60000A2,0xFDC80080,0xFFCC0080,0x1CC00A4,0xFDC00064,0xFDBC003D,0xFFAC0019,0xFFA80005,0xFF900000,0xFFC40080,0xFBC00062,0xD3FC00A2,0xEF740001, -0xD3FC00A2,0x1F00071,0xFFE80041,0xFFE40014,0xFFE00000,0xE9FC0071,0xFFD80028,0xFFD00000,0xF3FC0071,0xFFA00000,0xF6000071,0xE9FC0071,0xFFD80028,0xFFD00000,0xF3FC0071,0xFFA00000,0xF6000071,0xF3FC0071,0xFFA00000,0xF6000071,0xF6000071,0xE9FC0071,0xFFD80028,0xFFD00000,0xF3FC0071,0xFFA00000,0xF6000071,0xF3FC0071,0xFFA00000,0xF6000071,0xF6000071,0xF3FC0071, -0xFFA00000,0xF6000071,0xF6000071,0xF6000071,0xFFEC0062,0x87FC0071,0xFFEC0064,0xFDE80055,0xFDE40048,0xFFC8001D,0xFFB40000,0xFF8C0000,0xFDEC0062,0xFFE40055,0xFFBC0004,0xF6000071,0xF1FC0071,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0x1A800C8,0xFF980005,0xFF980005,0xFF980005,0xFF980005,0xFF980005, -0xFF980005,0xE7940001,0xE7940001,0xE7940001,0xDD940001,0x7DFC00C8,0x7DFC00C8,0x7DFC00C8,0x7DFC00C8,0x7DFC00C8,0x7DFC00C8,0xEF700001,0xEF700001,0xEF700001,0xDD800001,0xBFFC00C8,0xBFFC00C8,0xBFFC00C8,0xDD440001,0xD20000CA,0xFBA40091,0x1A800C8,0x1A800C8,0xFFA00055,0xFD9C0034,0xFF980014,0xFF980014,0xFB940000,0xFD9C0082,0xFD9C0055,0xEF900001,0xEF700001, -0xAFFC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table93[] = { -0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0x77FC0000,0xBDF80000, -0xBDF80000,0xBDF80000,0xBDF80000,0xD0000001,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1C00000,0x1C00000,0x1C00000,0x77FC0000,0xABFC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000, -0xB5FC0000,0xDBF80000,0xDBF80000,0xDBF80000,0xE4000001,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xDBF80000,0xDBF80000,0xDBF80000,0xE4000001,0xDBF80000,0xDBF80000,0xDBF80000,0xE4000001,0xE4000001,0xDE80000,0x1CC0000,0x1CC0000,0x51FC0000,0x87FC0000,0xA3FC0000,0xA3FC0000,0xC3FC0000,0x51FC0000,0x87FC0000,0xD1FC0000,0xDBF80000, -0xD1FC0000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0xEBFC0000,0xEBFC0000,0xEBFC0000,0xF5FC0000,0xF5FC0000,0xF6000001,0xEBFC0000,0xEBFC0000,0xEBFC0000,0xF5FC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF5FC0000,0xF6000001,0xF6000001,0xEBFC0000,0xEBFC0000,0xEBFC0000,0xF5FC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF5FC0000,0xF6000001,0xF6000001,0xF5FC0000, 
-0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xD1FC0000,0x97FC0000,0x1F00000,0xE7FC0000,0xEFFC0000,0xF3FC0000,0xF5F80000,0xF7F40000,0xDFFC0000,0xEBFC0000,0xF3FC0000,0xF6000001,0xF3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1D804C1,0xFFD003C9,0xFFCC0314,0xFFCC02D4,0xFFC80305,0xFFC40204,0xFFC401A0,0xFFB80151,0xFFB800D8,0xFBB800E1,0xFFC4031D,0xFFBC01F3,0xFFBC017A,0xFFAC00E6,0xFFAC003D,0xFBA80036,0xFFAC016D,0xFFA400A4,0xF9A40049,0xF5A4014E,0xC7FC04C1,0xFFB80354,0xFFB002D3,0xFFAC01F7,0xFFA0011A,0xFB9800E3,0xFF940216,0xFF8800CD,0xFB78001A,0xF580014F,0xE3FC04C1, -0xFF6002D3,0xFB3800E1,0xF514014E,0xEA0004C3,0xFFD003DA,0xF5D80485,0xF5D8049C,0xFFC402E5,0xFFBC01E3,0xFFB800E7,0xFFB40084,0xFFA80015,0xFFD003C4,0xFFC402B9,0xFFA800E4,0xFB78001A,0xDDF804C1,0x1E8014D,0xFDE40105,0xFFE000BD,0xFFDC00A4,0xFFDC00C9,0xFFD8006C,0xFFD40041,0xFFCC0041,0xFFCC0001,0xFBCC0019,0xDFFC014D,0xFFD800D8,0xFFCC00A4,0xFFC0007D,0xFFB80011, -0xFBB80019,0xEFFC014D,0xFF9C00A2,0xFB740019,0xF400014E,0xDFFC014D,0xFFD800D8,0xFFCC00A4,0xFFC0007D,0xFFB80011,0xFBB80019,0xEFFC014D,0xFF9C00A2,0xFB740019,0xF400014E,0xEFFC014D,0xFF9C00A2,0xFB740019,0xF400014E,0xF400014E,0xFFE4011D,0xFDE80131,0xFDE80138,0xFFDC00E1,0xFFD400AA,0xFFC80056,0xFFBC0025,0xFFB0000D,0xFFE00122,0xFFDC00DA,0xFFB800A6,0xFB740019, -0xEBFC014D,0x1CC02D4,0x1CC02D4,0x1CC02D4,0x1CC02D4,0xFFC401A0,0xFFC401A0,0xFFC401A0,0xFFB800D8,0xFFB800D8,0xF7B800C9,0xFFBC017A,0xFFBC017A,0xFFBC017A,0xFFAC003D,0xFFAC003D,0xFBA80026,0xFDA400A2,0xFDA400A2,0xF7A00015,0xEFA400A2,0xB1FC02D3,0xB1FC02D3,0xB1FC02D3,0xFFA0011A,0xFFA0011A,0xF99C00C9,0xFF8800CD,0xFF8800CD,0xF7800002,0xEF8800A2,0xD9FC02D3, -0xD9FC02D3,0xF74800C9,0xEF2C00A2,0xE40002D3,0xFDC80244,0xFDC8028C,0x1CC02D4,0xFFBC01BA,0xFFB40131,0xFFB400A8,0xFFB40084,0xFFA80015,0xFFBC0236,0xFFBC01A6,0xFFA800DB,0xF7800002,0xCFFC02D3,0x1DC00A4,0x1DC00A4,0x1DC00A4,0x1DC00A4,0xFFD40041,0xFFD40041,0xFFD40041,0xFFCC0001,0xFFCC0001,0xF7CC0001,0xCFFC00A2,0xCFFC00A2,0xCFFC00A2,0xFFB80011,0xFFB80011, -0xF7BC0001,0xE7FC00A2,0xE7FC00A2,0xF7840001,0xEE0000A2,0xCFFC00A2,0xCFFC00A2,0xCFFC00A2,0xFFB80011,0xFFB80011,0xF7BC0001,0xE7FC00A2,0xE7FC00A2,0xF7840001,0xEE0000A2,0xE7FC00A2,0xE7FC00A2,0xF7840001,0xEE0000A2,0xEE0000A2,0xFFD80082,0xF7DC0091,0x1DC00A4,0xFDD80071,0xFDD00055,0xFFC8003D,0xFFBC0025,0xFFB0000D,0xF1DC0091,0xFBD40071,0xE1FC00A2,0xF7840001, -0xE1FC00A2,0x1F80019,0xFFF4000D,0xFFF00004,0xFFF00000,0xF5FC0019,0xFFF00008,0xFFE80000,0xF9FC0019,0xFFD40000,0xFA000019,0xF5FC0019,0xFFF00008,0xFFE80000,0xF9FC0019,0xFFD40000,0xFA000019,0xF9FC0019,0xFFD40000,0xFA000019,0xFA000019,0xF5FC0019,0xFFF00008,0xFFE80000,0xF9FC0019,0xFFD40000,0xFA000019,0xF9FC0019,0xFFD40000,0xFA000019,0xFA000019,0xF9FC0019, -0xFFD40000,0xFA000019,0xFA000019,0xFA000019,0xFDF40014,0xC7FC0019,0xF5F80019,0xFFF40012,0xFFE80010,0xFFE80008,0xFFDC0000,0xFFC80000,0xF1FC0019,0xFFF00014,0xFFE00001,0xFA000019,0xF9FC0019,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0x1B800C8,0xFFA80014,0xFFA80014,0xFFA80014,0xFFA80014,0xFFA80014, 
-0xFFA80014,0xEFA40001,0xEFA40001,0xEFA40001,0xE5A40001,0x95FC00C8,0x95FC00C8,0x95FC00C8,0x95FC00C8,0x95FC00C8,0x95FC00C8,0xF7800001,0xF7800001,0xF7800001,0xE5900001,0xCBFC00C8,0xCBFC00C8,0xCBFC00C8,0xE5540001,0xDA0000CA,0xF3B400A4,0x1B800C8,0x1B800C8,0xFBB40071,0xFDB00048,0xFFAC0029,0xFFAC0029,0xFFA40004,0xF9B00091,0xFBAC0064,0xF7A00001,0xF7800001, -0xBFF800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table94[] = { -0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0xC9F80000, -0xC9F80000,0xC9F80000,0xC9F80000,0xD8000001,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1D00000,0x1D00000,0x1D00000,0x8FFC0000,0xBBFC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000, -0xCDFC0000,0xE7F80000,0xE7F80000,0xE7F80000,0xEC000001,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xE7F80000,0xE7F80000,0xE7F80000,0xEC000001,0xE7F80000,0xE7F80000,0xE7F80000,0xEC000001,0xEC000001,0x1FC0000,0x1DC0000,0x1DC0000,0x89FC0000,0xADFC0000,0xC1FC0000,0xC1FC0000,0xD7FC0000,0x89FC0000,0xADFC0000,0xDFFC0000,0xE7F80000, -0xDFFC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1E403A2,0xFFDC032A,0xFFD802D5,0xFFD802B1,0xFFDC0282,0xFFD40205,0xFFD001C9,0xFFCC0150,0xFFCC0110,0xFFC800C8,0xFFD0028A,0xFFD001E9,0xFFC801A9,0xFFC000FC,0xFFC00098,0xFFBC0029,0xFFB8011A,0xFFB800A1,0xFFB00014,0xF9B400C1,0xD5FC03A2,0xFFCC02F1,0xFFC802AE,0xFFB801CC,0xFFB80153,0xFFAC00CA,0xFFAC01B3,0xFFA000FE,0xFF900001,0xF99400C2,0xEBF803A2, -0xFF9002AE,0xFF5800C8,0xF93400C1,0xF00003A2,0xFDE00326,0xF9E00370,0xFBE40389,0xFFD40272,0xFFD001DA,0xFFC8012F,0xFFC400E0,0xFFB80065,0xFFD8030F,0xFFD00253,0xFFBC010D,0xFF900001,0xE5FC03A2,0x1F400C2,0xFDF000AE,0xFFF00099,0xFFEC0091,0xFFE80086,0xFFE80065,0xFFE80055,0xFFE40030,0xFFE0001D,0xFFDC0000,0xEFFC00C1,0xFFE400A1,0xFFE40091,0xFFD80058,0xFFD80034, -0xFFCC0000,0xF7F800C1,0xFFC80091,0xFF940000,0xF80000C1,0xEFFC00C1,0xFFE400A1,0xFFE40091,0xFFD80058,0xFFD80034,0xFFCC0000,0xF7F800C1,0xFFC80091,0xFF940000,0xF80000C1,0xF7F800C1,0xFFC80091,0xFF940000,0xF80000C1,0xF80000C1,0xFDF000AC,0xFFEC00C0,0xF3F400C2,0xFFEC009B,0xFFE80081,0xFFE0005E,0xFFE00048,0xFFD0002D,0xFBF000AC,0xFFE80095,0xFFD80092,0xFF940000, 
-0xF5FC00C1,0x1D802B1,0x1D802B1,0x1D802B1,0x1D802B1,0xFFD001C9,0xFFD001C9,0xFFD001C9,0xFFCC0110,0xFFCC0110,0xFFC800C8,0xFFC801A9,0xFFC801A9,0xFFC801A9,0xFFC00098,0xFFC00098,0xFFBC0029,0xFFB800A1,0xFFB800A1,0xFDB00011,0xF7B40091,0xC9FC02AE,0xC9FC02AE,0xC9FC02AE,0xFFB80153,0xFFB80153,0xFFAC00CA,0xFFA000FE,0xFFA000FE,0xFF900001,0xF7980091,0xE5F802AE, -0xE5F802AE,0xFF5800C8,0xF73C0091,0xEC0002AE,0xFFD80245,0xF5D8028C,0x1D802B1,0xFFD401E2,0xFFCC017A,0xFFC40114,0xFFC400E0,0xFFB80065,0xFFD0022E,0xFFD001C3,0xFFBC0109,0xFF900001,0xDFF802AE,0x1EC0091,0x1EC0091,0x1EC0091,0x1EC0091,0xFFE80055,0xFFE80055,0xFFE80055,0xFFE0001D,0xFFE0001D,0xFFDC0000,0xE5FC0091,0xE5FC0091,0xE5FC0091,0xFFD80034,0xFFD80034, -0xFFCC0000,0xF3F80091,0xF3F80091,0xFF940000,0xF6000091,0xE5FC0091,0xE5FC0091,0xE5FC0091,0xFFD80034,0xFFD80034,0xFFCC0000,0xF3F80091,0xF3F80091,0xFF940000,0xF6000091,0xF3F80091,0xF3F80091,0xFF940000,0xF6000091,0xF6000091,0xFBEC0080,0xFFEC0080,0x1EC0091,0xF9EC0080,0xFFE0006A,0xFFE00055,0xFFE00048,0xFFD0002D,0xF9EC0080,0xFFE80071,0xEFFC0091,0xFF940000, -0xEFFC0091,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0x1C800C8,0xFFBC0029,0xFFBC0029,0xFFBC0029,0xFFBC0029,0xFFBC0029, -0xFFBC0029,0xF7B40001,0xF7B40001,0xF7B40001,0xEDB40001,0xAFFC00C8,0xAFFC00C8,0xAFFC00C8,0xAFFC00C8,0xAFFC00C8,0xAFFC00C8,0xFF900001,0xFF900001,0xFF900001,0xEDA00001,0xD7FC00C8,0xD7FC00C8,0xD7FC00C8,0xED640001,0xE20000CA,0xFBC400A4,0x1C800C8,0x1C800C8,0xFBC40080,0xFFBC0061,0xFFBC0041,0xFFBC0041,0xFFB80014,0xFFBC009D,0xFFBC0075,0xFFB00001,0xFF900001, -0xCDFC00C8,}; -static const uint32_t g_etc1_to_bc7_m6_table95[] = { -0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xD5F80000, -0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x9E00000,0x9E00000,0x9E00000,0xA9FC0000,0xC9FC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000, -0xE5FC0000,0xF3F80000,0xF3F80000,0xF3F80000,0xF4000001,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xF3F80000,0xF3F80000,0xF3F80000,0xF4000001,0xF3F80000,0xF3F80000,0xF3F80000,0xF4000001,0xF4000001,0x77FC0000,0x1EC0000,0x1EC0000,0xC3FC0000,0xD5FC0000,0xDFFC0000,0xDFFC0000,0xEBFC0000,0xC3FC0000,0xD5FC0000,0xEFFC0000,0xF3F80000, -0xEFFC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1EC0232,0xFFE801F2,0xFFE401C9,0xFFE401B9,0xFFE801AA,0xFFE00165,0xFFDC0149,0xFFD80110,0xFFD800EC,0xFFD800C8,0xFFDC0186,0xFFDC0131,0xFFDC010D,0xFFD400BB,0xFFD00088,0xFFD00048,0xFFCC0091,0xFFCC0051,0xFFC40001,0xFBC40039,0xE3FC0232,0xFFD801DD,0xFFD801B9,0xFFCC0150,0xFFCC0110,0xFFC400C8,0xFFC00109,0xFFB80096,0xFFAC0011,0xFBAC0039,0xF1F80232, -0xFFAC01B9,0xFF8800C8,0xFB5C0039,0xF4000232,0xFFE401E6,0xFDE80210,0xFFEC0221,0xFFE0019E,0xFFDC013E,0xFFD400D8,0xFFD000B5,0xFFD00061,0xFFE401ED,0xFFDC018C,0xFFD000AF,0xFFAC0011,0xEDFC0232,0x1F80036,0xFFF80031,0xFFF4002D,0xFFF40029,0xFFF40022,0xFFF4001D,0xFFF40019,0xFFF0000C,0xFFF00008,0xFFEC0000,0xF7FC0036,0xFFF0002D,0xFFF00029,0xFFF00018,0xFFE40010, -0xFFE40000,0xFBFC0036,0xFFE00029,0xFFC80000,0xFA000039,0xF7FC0036,0xFFF0002D,0xFFF00029,0xFFF00018,0xFFE40010,0xFFE40000,0xFBFC0036,0xFFE00029,0xFFC80000,0xFA000039,0xFBFC0036,0xFFE00029,0xFFC80000,0xFA000039,0xFA000039,0xFFF40030,0xF5F80036,0xF5F80036,0xFFF8002C,0xFFF00022,0xFFEC0018,0xFFE80011,0xFFE4000A,0xFFF8002C,0xFFF4002B,0xFFE8002A,0xFFC80000, -0xFBFC0036,0x1E401B9,0x1E401B9,0x1E401B9,0x1E401B9,0xFFDC0149,0xFFDC0149,0xFFDC0149,0xFFD800EC,0xFFD800EC,0xFFD800C8,0xFFDC010D,0xFFDC010D,0xFFDC010D,0xFFD00088,0xFFD00088,0xFFD00048,0xFFCC0051,0xFFCC0051,0xFFC40001,0xFBC40029,0xD9FC01B9,0xD9FC01B9,0xD9FC01B9,0xFFCC0110,0xFFCC0110,0xFFC400C8,0xFFB80096,0xFFB80096,0xFFAC0011,0xFBAC0029,0xEDF801B9, -0xEDF801B9,0xFF8800C8,0xFB5C0029,0xF20001BA,0xFDE40182,0xFBE401A0,0x1E401B9,0xFFDC0144,0xFFD80101,0xFFD400C8,0xFFD000B5,0xFFD00061,0xFDE00181,0xFFDC013B,0xFFD000AE,0xFFAC0011,0xE7FC01B9,0x1F40029,0x1F40029,0x1F40029,0x1F40029,0xFFF40019,0xFFF40019,0xFFF40019,0xFFF00008,0xFFF00008,0xFFEC0000,0xF1FC0029,0xF1FC0029,0xF1FC0029,0xFFE40010,0xFFE40010, -0xFFE40000,0xF9F80029,0xF9F80029,0xFFC80000,0xFA000029,0xF1FC0029,0xF1FC0029,0xF1FC0029,0xFFE40010,0xFFE40010,0xFFE40000,0xF9F80029,0xF9F80029,0xFFC80000,0xFA000029,0xF9F80029,0xF9F80029,0xFFC80000,0xFA000029,0xFA000029,0xFFF40020,0xF3F40029,0x1F40029,0xFDF40020,0xFFF00019,0xFFEC0014,0xFFE80011,0xFFE4000A,0xFDF40020,0xFDF00022,0xF7FC0029,0xFFC80000, -0xF7FC0029,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0x1D800C8,0xFFD00048,0xFFD00048,0xFFD00048,0xFFD00048,0xFFD00048, 
-0xFFD00048,0xFFC40001,0xFFC40001,0xFFC40001,0xF5C40001,0xC7FC00C8,0xC7FC00C8,0xC7FC00C8,0xC7FC00C8,0xC7FC00C8,0xC7FC00C8,0xFFAC0011,0xFFAC0011,0xFFAC0011,0xF5B00001,0xE3FC00C8,0xE3FC00C8,0xE3FC00C8,0xF5740001,0xEA0000CA,0xF5D800B5,0x1D800C8,0x1D800C8,0xFDD40091,0xFFD00075,0xFFCC0061,0xFFCC0061,0xFFCC0034,0xFDD400A2,0xFFD00082,0xFFC4001D,0xFFAC0011, -0xDDF800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table96[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x180001,0x180001,0x180001,0x180001,0x2240000,0x2240000,0x2240000,0x4C0000,0x4C0000,0xC000000,0x2240000,0x2240000,0x2240000,0x4C0000,0x4C0000,0xC000000,0x4C0000,0x4C0000,0xC000000,0xC000000,0x2240000,0x2240000,0x2240000,0x4C0000,0x4C0000,0xC000000,0x4C0000,0x4C0000,0xC000000,0xC000000,0x4C0000, -0x4C0000,0xC000000,0xC000000,0xC000000,0x41C0000,0x1C0000,0x180001,0x240000,0x2C0000,0x340000,0x3C0000,0x5C0000,0x200000,0x2240000,0x340000,0xC000000,0x340000,0x540000,0x7C0000,0xFC0000,0x28000001,0x7C0000,0xFC0000,0x28000001,0xFC0000,0x28000001,0x28000001,0x7C0000,0xFC0000,0x28000001,0xFC0000,0x28000001, -0x28000001,0xFC0000,0x28000001,0x28000001,0x28000001,0x7C0000,0xFC0000,0x28000001,0xFC0000,0x28000001,0x28000001,0xFC0000,0x28000001,0x28000001,0x28000001,0xFC0000,0x28000001,0x28000001,0x28000001,0x28000001,0x680000,0x4580000,0x4580000,0x8C0000,0xCC0000,0x1940000,0x28000001,0x28000001,0x2700000,0x9C0000,0x1DCC0000,0x28000001, -0xB00000,0x1C0499,0x76080071,0x3C080071,0x28080072,0x500001A5,0x3A000028,0x28000001,0x280001A5,0x200000A2,0x1A0001A5,0x3600039D,0x2E000149,0x240000AA,0x22000236,0x2000011B,0x180001F1,0x1A00039D,0x1C000236,0x160002AE,0x1200039E,0x280499,0x28000216,0x22000127,0x220002AF,0x1E000181,0x18000231,0x180003E2,0x1600028E,0x140002EE,0x120003C2,0x500499, -0x1600032F,0x10000373,0x1000041B,0xC00049B,0xA4000108,0xFE0C0229,0xF21401F2,0x48000118,0x3600011B,0x2800011B,0x220000C2,0x20000173,0x64000209,0x3C00017A,0x1E0001B6,0x140002EE,0x380499,0x24039D,0x720C0055,0x3A0C0055,0x280C0056,0x500001A5,0x3A000028,0x28000001,0x280001A5,0x200000A2,0x1A0001A5,0x34039D,0x2E000149,0x240000AA,0x22000236,0x2000011B, -0x180001F1,0x68039D,0x1C000236,0x160002AE,0x1200039E,0x34039D,0x2E000149,0x240000AA,0x22000236,0x2000011B,0x180001F1,0x68039D,0x1C000236,0x160002AE,0x1200039E,0x68039D,0x1C000236,0x160002AE,0x1200039E,0x1200039E,0xA4000108,0xFE0C0205,0xF61C016E,0x48000118,0x3600011B,0x2800011B,0x220000C2,0x20000173,0x6C0001DB,0x3C00016A,0x1E0001B2,0x160002AE, -0x4C039D,0x80071,0x80071,0x80071,0x80071,0x26000000,0x26000000,0x26000000,0x12000000,0x12000000,0xC000000,0x10000055,0x10000055,0x10000055,0xC000020,0xC000020,0xC000010,0x8000055,0x8000055,0x8000034,0x6000055,0xC0071,0xC0071,0xC0071,0xC000030,0xC000030,0xC000020,0x600005D,0x600005D,0x800003D,0x6000059,0x140071, -0x140071,0x4000052,0x4000062,0x4000072,0x44000019,0xC8000000,0x80071,0x24000022,0x1A00001D,0x12000022,0x1200001D,0xC000022,0x24000036,0x1A00002D,0xA000056,0x800003D,0x100071,0xC0055,0xC0055,0xC0055,0xC0055,0x26000000,0x26000000,0x26000000,0x12000000,0x12000000,0xC000000,0x100055,0x100055,0x100055,0xC000020,0xC000020, 
-0xC000010,0x200055,0x200055,0x8000034,0x6000055,0x100055,0x100055,0x100055,0xC000020,0xC000020,0xC000010,0x200055,0x200055,0x8000034,0x6000055,0x200055,0x200055,0x8000034,0x6000055,0x6000055,0x44000019,0xC8000000,0xC0055,0x24000022,0x1A00001D,0x12000022,0x1200001D,0xC000022,0x24000032,0x1A000029,0x180055,0x8000034, -0x180055,0x3801A5,0x621C0001,0x361C0001,0x28180002,0x5001A5,0x3A000028,0x28000001,0xA001A5,0x200000A2,0x1A0001A5,0x5001A5,0x3A000028,0x28000001,0xA001A5,0x200000A2,0x1A0001A5,0xA001A5,0x200000A2,0x1A0001A5,0x1A0001A5,0x5001A5,0x3A000028,0x28000001,0xA001A5,0x200000A2,0x1A0001A5,0xA001A5,0x200000A2,0x1A0001A5,0x1A0001A5,0xA001A5, -0x200000A2,0x1A0001A5,0x1A0001A5,0x1A0001A5,0xA40000A4,0x3C01A5,0xFC280062,0x52000091,0x360000A2,0x2C00009D,0x2400006A,0x200000CA,0x720000DD,0x4C0000B4,0x26000059,0x1A0001A5,0x7001A5,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table97[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x280001,0x280001,0x280001,0x280001,0x23C0000,0x23C0000,0x23C0000,0x7C0000,0x7C0000,0x14000000,0x23C0000,0x23C0000,0x23C0000,0x7C0000,0x7C0000,0x14000000,0x7C0000,0x7C0000,0x14000000,0x14000000,0x23C0000,0x23C0000,0x23C0000,0x7C0000,0x7C0000,0x14000000,0x7C0000,0x7C0000,0x14000000,0x14000000,0x7C0000, -0x7C0000,0x14000000,0x14000000,0x14000000,0x300000,0x2C0000,0x280001,0x380000,0x440000,0x580000,0x640000,0x980000,0x340000,0x23C0000,0x580000,0x14000000,0x580000,0x640000,0x940000,0x12C0000,0x30000001,0x940000,0x12C0000,0x30000001,0x12C0000,0x30000001,0x30000001,0x940000,0x12C0000,0x30000001,0x12C0000,0x30000001, -0x30000001,0x12C0000,0x30000001,0x30000001,0x30000001,0x940000,0x12C0000,0x30000001,0x12C0000,0x30000001,0x30000001,0x12C0000,0x30000001,0x30000001,0x30000001,0x12C0000,0x30000001,0x30000001,0x30000001,0x30000001,0x7C0000,0xC680000,0xC680000,0xA80000,0xF40000,0x1E00000,0x30000001,0x30000001,0x880000,0xBC0000,0x25DC0000,0x30000001, -0xD40000,0x240691,0x86100129,0x460C0129,0x300C012A,0x6A0001A5,0x46000008,0x32000011,0x340001A5,0x2C00006A,0x220001A5,0x480004ED,0x3A0001D1,0x2E00010E,0x2E00029E,0x2800012B,0x2200021E,0x220004ED,0x20000306,0x1C000362,0x160004EE,0x340691,0x34000316,0x280001E3,0x28000367,0x280001D4,0x1E000295,0x1E000566,0x2000037F,0x1C0003C6,0x1600052E,0x680691, -0x1C00048F,0x1600049F,0x140005C6,0x10000693,0xC2000118,0xF2140349,0xF61C0372,0x66000128,0x4000013B,0x34000135,0x2A0000C3,0x240001AE,0x82000289,0x500001CD,0x26000266,0x1C0003C6,0x4C0691,0x3004ED,0x821400DD,0x441400DD,0x301400DE,0x6A0001A5,0x46000008,0x3204000E,0x340001A5,0x2C00006A,0x220001A5,0x24404ED,0x3A0001D1,0x2E00010E,0x2E00029E,0x2800012B, -0x2200021E,0x8C04ED,0x20000306,0x1C000362,0x160004EE,0x24404ED,0x3A0001D1,0x2E00010E,0x2E00029E,0x2800012B,0x2200021E,0x8C04ED,0x20000306,0x1C000362,0x160004EE,0x8C04ED,0x20000306,0x1C000362,0x160004EE,0x160004EE,0xC2000118,0xF61C02BD,0xFA24026E,0x66000128,0x4000013B,0x34000135,0x2A0000C3,0x240001AE,0x82000249,0x500001B4,0x26000262,0x1C000362, 
-0x6404ED,0xC0129,0xC0129,0xC0129,0xC0129,0x3E000000,0x3E000000,0x3E000000,0x1E000000,0x1E000000,0x14000000,0x1C0000DD,0x1C0000DD,0x1C0000DD,0x18000050,0x18000050,0x12000028,0xE0000DD,0xE0000DD,0xE000088,0xA0000DD,0x140126,0x140126,0x140126,0x12000088,0x12000088,0x1200004C,0xC0000F1,0xC0000F1,0xC0000A1,0x80000EA,0x240126, -0x240126,0xA0000C6,0x8000105,0x6000126,0x76000041,0xFA040011,0xC0129,0x3A000059,0x28000050,0x22000055,0x1E000049,0x16000061,0x42000092,0x3200007D,0x120000DE,0xC0000A1,0x1C0126,0x1400DD,0x1400DD,0x1400DD,0x1400DD,0x3E000000,0x3E000000,0x3E000000,0x1E000000,0x1E000000,0x14000000,0x1C00DD,0x1C00DD,0x1C00DD,0x18000050,0x18000050, -0x12000028,0x3800DD,0x3800DD,0xE000088,0xA0000DD,0x1C00DD,0x1C00DD,0x1C00DD,0x18000050,0x18000050,0x12000028,0x3800DD,0x3800DD,0xE000088,0xA0000DD,0x3800DD,0x3800DD,0xE000088,0xA0000DD,0xA0000DD,0x76000041,0xFA04000D,0x1400DD,0x3A000059,0x28000050,0x22000055,0x1E000049,0x16000061,0x42000082,0x32000074,0x2800DD,0xE000088, -0x2800DD,0x4801A5,0x6A2C0001,0x3E2C0001,0x30280002,0x6801A5,0x46000008,0x30100001,0xD001A5,0x2C00006A,0x220001A5,0x6801A5,0x46000008,0x30100001,0xD001A5,0x2C00006A,0x220001A5,0xD001A5,0x2C00006A,0x220001A5,0x220001A5,0x6801A5,0x46000008,0x30100001,0xD001A5,0x2C00006A,0x220001A5,0xD001A5,0x2C00006A,0x220001A5,0x220001A5,0xD001A5, -0x2C00006A,0x220001A5,0x220001A5,0x220001A5,0xD6000075,0x4C01A5,0xF4380071,0x6E000059,0x4A00006A,0x36000064,0x2E00003A,0x280000A2,0x920400B5,0x5E00007D,0x32000028,0x220001A5,0x9401A5,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table98[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x40001,0x80000,0x80000,0x80000,0x80000,0x80000, -0x80000,0xC0000,0xC0000,0xC0000,0x2000000,0x80000,0x80000,0x80000,0x80000,0x80000,0x80000,0xC0000,0xC0000,0xC0000,0x2000000,0xC0000,0xC0000,0xC0000,0x2000000,0x2000000,0xA040000,0x40001,0x40001,0x6040000,0x80000,0x80000,0x80000,0x80000,0x6040000,0x80000,0xC0000,0xC0000, -0xC0000,0x380001,0x380001,0x380001,0x380001,0x540000,0x540000,0x540000,0xAC0000,0xAC0000,0x1C000000,0x540000,0x540000,0x540000,0xAC0000,0xAC0000,0x1C000000,0xAC0000,0xAC0000,0x1C000000,0x1C000000,0x540000,0x540000,0x540000,0xAC0000,0xAC0000,0x1C000000,0xAC0000,0xAC0000,0x1C000000,0x1C000000,0xAC0000, -0xAC0000,0x1C000000,0x1C000000,0x1C000000,0x440000,0x63C0000,0x380001,0x24C0000,0x600000,0x780000,0x8C0000,0xD40000,0x480000,0x540000,0x780000,0x1C000000,0x780000,0x740000,0xAC0000,0x15C0000,0x38000001,0xAC0000,0x15C0000,0x38000001,0x15C0000,0x38000001,0x38000001,0xAC0000,0x15C0000,0x38000001,0x15C0000,0x38000001, -0x38000001,0x15C0000,0x38000001,0x38000001,0x38000001,0xAC0000,0x15C0000,0x38000001,0x15C0000,0x38000001,0x38000001,0x15C0000,0x38000001,0x38000001,0x38000001,0x15C0000,0x38000001,0x38000001,0x38000001,0x38000001,0x900000,0x7C0000,0x7C0000,0x2C00000,0x11C0000,0x9F00000,0x38000001,0x38000001,0x9C0000,0xD80000,0x2DEC0000,0x38000001, 
-0xF40000,0x2C088E,0x9A1401FE,0x501401FE,0x381401FF,0x7E0401AA,0x54040005,0x3C08003E,0x3E0401AA,0x3404004F,0x2A0401AA,0x5A0005EA,0x46000218,0x36000163,0x3A0002BD,0x3000010A,0x28000215,0x2C0005EA,0x26000383,0x240003DD,0x1C0005ED,0x40088E,0x40000401,0x3400028E,0x340003FA,0x2E0001F3,0x280002BE,0x2800069F,0x2600042C,0x20000463,0x1C000651,0x80088E, -0x200005EF,0x1C0005B2,0x1A000749,0x1600088E,0xF40000E9,0xF61C0486,0xFA24050F,0x74000111,0x5000011D,0x3C000103,0x34000096,0x2C00019A,0xA40002D1,0x5E0001D1,0x320002B1,0x20000463,0x5C088E,0x3C05EA,0x8E200152,0x4E200152,0x38200153,0x7A0801A6,0x54040001,0x3A10002A,0x3E0401A6,0x3404004B,0x2A0401A6,0x5805EA,0x46000218,0x36000163,0x3A0002BD,0x3000010A, -0x28000215,0xB005EA,0x26000383,0x240003DD,0x1C0005ED,0x5805EA,0x46000218,0x36000163,0x3A0002BD,0x3000010A,0x28000215,0xB005EA,0x26000383,0x240003DD,0x1C0005ED,0xB005EA,0x26000383,0x240003DD,0x1C0005ED,0x1C0005ED,0xF40000E9,0xFC280356,0xFE2C0353,0x74000111,0x5000011D,0x3C000103,0x34000096,0x2C00019A,0xA400026D,0x5E0001AD,0x320002A8,0x240003DD, -0x7C05EA,0x1401FE,0x1401FE,0x1401FE,0x1401FE,0x52040005,0x52040005,0x52040005,0x28040006,0x28040006,0x1C040005,0x30000152,0x30000152,0x30000152,0x22000059,0x22000059,0x1A000028,0x16000154,0x16000154,0x160000B4,0xE000154,0x2001FD,0x2001FD,0x2001FD,0x220000D2,0x220000D2,0x18000069,0x12000188,0x12000188,0x140000EA,0xE00016D,0x3C01FD, -0x3C01FD,0x10000149,0xA0001B5,0xA0001FD,0xB6000049,0xFE0C005E,0x1401FE,0x50000071,0x3C000061,0x2E000061,0x2A000049,0x2000007D,0x640000DD,0x440000AD,0x1E000156,0x140000EA,0x2C01FD,0x200152,0x200152,0x200152,0x200152,0x4E080001,0x4E080001,0x4E080001,0x28080001,0x28080001,0x1C040001,0x300152,0x300152,0x300152,0x22000059,0x22000059, -0x1A000028,0x5C0152,0x5C0152,0x160000B4,0xE000154,0x300152,0x300152,0x300152,0x22000059,0x22000059,0x1A000028,0x5C0152,0x5C0152,0x160000B4,0xE000154,0x5C0152,0x5C0152,0x160000B4,0xE000154,0xE000154,0xB6000049,0xFE0C003A,0x200152,0x50000071,0x3C000061,0x2E000061,0x2A000049,0x2000007D,0x640000B9,0x4400009D,0x400152,0x160000B4, -0x400152,0x5801A5,0x723C0001,0x463C0001,0x38380002,0x8001A5,0x54040000,0x38200001,0x10001A5,0x36000049,0x2A0001A5,0x8001A5,0x54040000,0x38200001,0x10001A5,0x36000049,0x2A0001A5,0x10001A5,0x36000049,0x2A0001A5,0x2A0001A5,0x8001A5,0x54040000,0x38200001,0x10001A5,0x36000049,0x2A0001A5,0x10001A5,0x36000049,0x2A0001A5,0x2A0001A5,0x10001A5, -0x36000049,0x2A0001A5,0x2A0001A5,0x2A0001A5,0xF6040055,0x5C01A5,0xFC480071,0x84000034,0x56000048,0x40000048,0x36000019,0x32000071,0xB4040091,0x6E000055,0x3C00000A,0x2A0001A5,0xB401A5,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x8000000,0x8000000,0x8000000,0x8000000,0x8000000, -0x8000000,0x4000000,0x4000000,0x4000000,0x2000000,0x40005,0x40005,0x40005,0x40005,0x40005,0x40005,0x2000002,0x2000002,0x2000002,0x2000001,0x5,0x5,0x5,0x2000004,0x5,0x28000000,0x40005,0x40005,0x12000000,0xC000000,0xA000000,0xA000000,0x6000000,0x12000001,0xC000001,0x4000000,0x2000002, -0x5,}; -static const uint32_t g_etc1_to_bc7_m6_table99[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x140001,0x200000,0x200000,0x200000,0x200000,0x200000, 
-0x200000,0x3C0000,0x3C0000,0x3C0000,0xA000000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x3C0000,0x3C0000,0x3C0000,0xA000000,0x3C0000,0x3C0000,0x3C0000,0xA000000,0xA000000,0x180000,0x140001,0x140001,0x2180000,0x1C0000,0x1C0000,0x1C0000,0x240000,0x2180000,0x1C0000,0x2C0000,0x3C0000, -0x2C0000,0x480001,0x480001,0x480001,0x480001,0x6C0000,0x6C0000,0x6C0000,0xDC0000,0xDC0000,0x24000000,0x6C0000,0x6C0000,0x6C0000,0xDC0000,0xDC0000,0x24000000,0xDC0000,0xDC0000,0x24000000,0x24000000,0x6C0000,0x6C0000,0x6C0000,0xDC0000,0xDC0000,0x24000000,0xDC0000,0xDC0000,0x24000000,0x24000000,0xDC0000, -0xDC0000,0x24000000,0x24000000,0x24000000,0x2540000,0xE4C0000,0x480001,0x640000,0x7C0000,0x9C0000,0xB40000,0x1100000,0x5C0000,0x6C0000,0x9C0000,0x24000000,0x9C0000,0x840000,0xC40000,0x18C0000,0x40000001,0xC40000,0x18C0000,0x40000001,0x18C0000,0x40000001,0x40000001,0xC40000,0x18C0000,0x40000001,0x18C0000,0x40000001, -0x40000001,0x18C0000,0x40000001,0x40000001,0x40000001,0xC40000,0x18C0000,0x40000001,0x18C0000,0x40000001,0x40000001,0x18C0000,0x40000001,0x40000001,0x40000001,0x18C0000,0x40000001,0x40000001,0x40000001,0x40000001,0xA40000,0x8C0000,0x8C0000,0xDC0000,0x1400000,0x13F00000,0x40000001,0x40000001,0xB40000,0xF80000,0x35FC0000,0x40000001, -0x1180000,0x380A26,0xA62002D2,0x5A2002D2,0x402002D3,0x8E0C01E2,0x600C003F,0x461000A2,0x480C01E2,0x3C080073,0x320C01E2,0x720005EA,0x580001A8,0x40040159,0x4600024D,0x3A00007A,0x320001C9,0x380005EA,0x320002FB,0x2C000362,0x240005ED,0x540A26,0x4C000489,0x3C00031B,0x40000432,0x3A0001E3,0x2E0002BE,0x34000717,0x30000415,0x2A00043D,0x2400067D,0xA40A26, -0x260006D7,0x26000662,0x20000819,0x1C000A26,0xFA0400DE,0xFC2805A6,0xFE2C0687,0x90000086,0x6200008B,0x4C00007A,0x40000029,0x380000FA,0xC200025D,0x76000142,0x3C000233,0x2A00043D,0x740A26,0x4C05EA,0x96300152,0x56300152,0x40300153,0x821801A6,0x5C140001,0x4220002A,0x461401A6,0x3C14004B,0x321401A6,0x7005EA,0x580001A8,0x40080153,0x4600024D,0x3A00007A, -0x320001C9,0xE405EA,0x320002FB,0x2C000362,0x240005ED,0x7005EA,0x580001A8,0x40080153,0x4600024D,0x3A00007A,0x320001C9,0xE405EA,0x320002FB,0x2C000362,0x240005ED,0xE405EA,0x320002FB,0x2C000362,0x240005ED,0x240005ED,0xFC0800CE,0xF438037A,0xFA44035E,0x90000086,0x6200008B,0x4C00007A,0x40000029,0x380000FA,0xD00001C1,0x7C000105,0x3C000223,0x2C000362, -0xA005EA,0x2002D2,0x2002D2,0x2002D2,0x2002D2,0x620C003D,0x620C003D,0x620C003D,0x320C003E,0x320C003E,0x240C003D,0x48000152,0x48000152,0x48000152,0x34000025,0x34000025,0x24000001,0x22000152,0x22000152,0x1C000080,0x16000154,0x3002D2,0x3002D2,0x3002D2,0x280000FE,0x280000FE,0x2200007D,0x1E0001C8,0x1E0001C8,0x1C0000E4,0x16000194,0x5C02D2, -0x5C02D2,0x160001B5,0x14000228,0xE0002D5,0xF6000014,0xF21400F5,0x2002D2,0x7A000034,0x54000028,0x40000028,0x38000019,0x2A00003D,0x820000D5,0x5A000086,0x2C00015B,0x1C0000E4,0x4002D2,0x300152,0x300152,0x300152,0x300152,0x56180001,0x56180001,0x56180001,0x30180001,0x30180001,0x24140001,0x2440152,0x2440152,0x2440152,0x34000025,0x34000025, -0x24000001,0x8C0152,0x8C0152,0x1C000080,0x16000154,0x2440152,0x2440152,0x2440152,0x34000025,0x34000025,0x24000001,0x8C0152,0x8C0152,0x1C000080,0x16000154,0x8C0152,0x8C0152,0x1C000080,0x16000154,0x16000154,0xF6000014,0xF820003D,0x300152,0x7A000034,0x54000028,0x40000028,0x38000019,0x2A00003D,0x96000088,0x5A000062,0x640152,0x1C000080, 
-0x640152,0x6801A5,0x7A4C0001,0x4E4C0001,0x40480002,0x9801A5,0x5C140000,0x40300001,0x13001A5,0x3E000022,0x320001A5,0x9801A5,0x5C140000,0x40300001,0x13001A5,0x3E000022,0x320001A5,0x13001A5,0x3E000022,0x320001A5,0x320001A5,0x9801A5,0x5C140000,0x40300001,0x13001A5,0x3E000022,0x320001A5,0x13001A5,0x3E000022,0x320001A5,0x320001A5,0x13001A5, -0x3E000022,0x320001A5,0x320001A5,0x320001A5,0xFE140055,0x6C01A5,0xF4580082,0x9A000019,0x6A000028,0x4E000022,0x40000005,0x3A000055,0xD2040071,0x8600003A,0x44000001,0x320001A5,0xD801A5,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0xC003D,0x20000000,0x20000000,0x20000000,0x20000000,0x20000000, -0x20000000,0x10000000,0x10000000,0x10000000,0xA000000,0x10003D,0x10003D,0x10003D,0x10003D,0x10003D,0x10003D,0xC000014,0xC000014,0xC000014,0x800000D,0x18003D,0x18003D,0x18003D,0x8000028,0x400003D,0xA8000000,0xC003D,0xC003D,0x4A000000,0x34000000,0x28000000,0x28000000,0x1A000000,0x44000011,0x34000009,0x14000001,0xC000014, -0x14003D,}; -static const uint32_t g_etc1_to_bc7_m6_table100[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000, -0x3C0000,0x740000,0x740000,0x740000,0x12000001,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x740000,0x740000,0x740000,0x12000001,0x740000,0x740000,0x740000,0x12000001,0x12000001,0xC280000,0x280000,0x280000,0x42C0000,0x2300000,0x2340000,0x2340000,0x2400000,0x42C0000,0x2300000,0x540000,0x740000, -0x540000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x880000,0x880000,0x880000,0x1140000,0x1140000,0x2C000001,0x880000,0x880000,0x880000,0x1140000,0x1140000,0x2C000001,0x1140000,0x1140000,0x2C000001,0x2C000001,0x880000,0x880000,0x880000,0x1140000,0x1140000,0x2C000001,0x1140000,0x1140000,0x2C000001,0x2C000001,0x1140000, -0x1140000,0x2C000001,0x2C000001,0x2C000001,0x6680000,0x8600000,0x5C0000,0x7C0000,0x980000,0xC00000,0xE00000,0x1540000,0x740000,0x880000,0xC00000,0x2C000001,0xC00000,0x940001,0x2DC0000,0x1C40000,0x4A000000,0x2DC0000,0x1C40000,0x4A000000,0x1C40000,0x4A000000,0x4A000000,0x2DC0000,0x1C40000,0x4A000000,0x1C40000,0x4A000000, -0x4A000000,0x1C40000,0x4A000000,0x4A000000,0x4A000000,0x2DC0000,0x1C40000,0x4A000000,0x1C40000,0x4A000000,0x4A000000,0x1C40000,0x4A000000,0x4A000000,0x4A000000,0x1C40000,0x4A000000,0x4A000000,0x4A000000,0x4A000000,0xBC0000,0xA00000,0xA00000,0xFC0000,0x16C0000,0x1DFC0000,0x4A000000,0x4A000000,0xCC0000,0x1180000,0x3FF00000,0x4A000000, -0x13C0000,0x480C65,0xB62C0428,0x642C0428,0x4A2C0428,0xA014026D,0x6A1400D8,0x4C1C0165,0x5414026D,0x461000F0,0x3C14026D,0x8E0005EA,0x6A000163,0x4E080192,0x580001F6,0x4600001D,0x3C0001A5,0x440005ED,0x3E000284,0x38000301,0x2E0005EA,0x680C63,0x5E000594,0x48000438,0x4C0004BF,0x40000222,0x3A000313,0x400007BE,0x3A00041D,0x34000436,0x2E0006CB,0xD00C63, -0x32000818,0x2C00076D,0x2600094E,0x22000C63,0xFE140162,0xF43807A1,0xF840089D,0xB400001E,0x76000024,0x5A000018,0x4A000001,0x4200007A,0xF2000212,0x900000D7,0x460001D4,0x34000436,0x940C63,0x5C05ED,0x9E440154,0x5E440154,0x4A400154,0x8C2801A5,0x64280002,0x4A300029,0x4E2801A5,0x4424004C,0x3C2801A5,0x8C05EA,0x6A000163,0x4A180152,0x580001F6,0x4600001D, 
-0x3C0001A5,0x11805EA,0x3E000284,0x38000301,0x2E0005EA,0x8C05EA,0x6A000163,0x4A180152,0x580001F6,0x4600001D,0x3C0001A5,0x11805EA,0x3E000284,0x38000301,0x2E0005EA,0x11805EA,0x3E000284,0x38000301,0x2E0005EA,0x2E0005EA,0xFC1C00E5,0xFE4C0379,0xF2540384,0xB400001E,0x76000024,0x5A000018,0x4A000001,0x4200007A,0xF2000131,0x90000086,0x460001C4,0x38000301, -0xC805EA,0x2C0428,0x2C0428,0x2C0428,0x2C0428,0x761400C8,0x761400C8,0x761400C8,0x3E1400C8,0x3E1400C8,0x2C1400C9,0x64000152,0x64000152,0x64000152,0x40000005,0x40000005,0x2E04000E,0x30000152,0x30000152,0x26000055,0x20000152,0x400428,0x400428,0x400428,0x3400018E,0x3400018E,0x2A0000F1,0x2800022D,0x2800022D,0x240000FA,0x1E0001BE,0x800428, -0x800428,0x1C00028B,0x1C0002DB,0x1400042B,0xFC0C003E,0xF82001F1,0x2C0428,0xA400000D,0x6C000008,0x52000008,0x4C000000,0x38000019,0xB40000E3,0x76000072,0x3E000162,0x240000FA,0x5C0428,0x400154,0x400154,0x400154,0x400154,0x62280000,0x62280000,0x62280000,0x3A280000,0x3A280000,0x2C280001,0x600152,0x600152,0x600152,0x40000005,0x40000005, -0x2C100001,0xC40152,0xC40152,0x26000055,0x20000152,0x600152,0x600152,0x600152,0x40000005,0x40000005,0x2C100001,0xC40152,0xC40152,0x26000055,0x20000152,0xC40152,0xC40152,0x26000055,0x20000152,0x20000152,0xFE100012,0xF2340048,0x400154,0xA400000D,0x6C000008,0x52000008,0x4C000000,0x38000019,0xC2000055,0x7C000032,0x8C0152,0x26000055, -0x8C0152,0x7801A5,0x845C0000,0x585C0000,0x4A5C0000,0xB401A5,0x66240000,0x4A400000,0x16801A5,0x4800000D,0x3C0001A5,0xB401A5,0x66240000,0x4A400000,0x16801A5,0x4800000D,0x3C0001A5,0x16801A5,0x4800000D,0x3C0001A5,0x3C0001A5,0xB401A5,0x66240000,0x4A400000,0x16801A5,0x4800000D,0x3C0001A5,0x16801A5,0x4800000D,0x3C0001A5,0x3C0001A5,0x16801A5, -0x4800000D,0x3C0001A5,0x3C0001A5,0x3C0001A5,0xFE280062,0x8001A5,0xFE6C0080,0xB4000005,0x78000012,0x5A000008,0x4A080000,0x46000034,0xF6080055,0x98000019,0x4E100000,0x3C0001A5,0xFC01A5,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x1400C8,0x3C000000,0x3C000000,0x3C000000,0x3C000000,0x3C000000, -0x3C000000,0x1C000001,0x1C000001,0x1C000001,0x12000001,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x1C00C8,0x18000049,0x18000049,0x18000049,0x12000025,0x3800C8,0x3800C8,0x3800C8,0xE00007D,0x80000CA,0xFA040008,0x1400C8,0x1400C8,0x8A000000,0x60000000,0x4A000000,0x4A000000,0x30000000,0x7600003A,0x5600001D,0x24000004,0x18000049, -0x2800C8,}; -static const uint32_t g_etc1_to_bc7_m6_table101[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000, -0x2500000,0xA40000,0xA40000,0xA40000,0x1A000001,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0x2500000,0xA40000,0xA40000,0xA40000,0x1A000001,0xA40000,0xA40000,0xA40000,0x1A000001,0x1A000001,0x3C0000,0x380000,0x380000,0x400000,0x2440000,0x4C0000,0x4C0000,0x5C0000,0x400000,0x2440000,0x740000,0xA40000, -0x740000,0x6C0000,0x6C0000,0x6C0000,0x6C0000,0xA00000,0xA00000,0xA00000,0x1440000,0x1440000,0x34000001,0xA00000,0xA00000,0xA00000,0x1440000,0x1440000,0x34000001,0x1440000,0x1440000,0x34000001,0x34000001,0xA00000,0xA00000,0xA00000,0x1440000,0x1440000,0x34000001,0x1440000,0x1440000,0x34000001,0x34000001,0x1440000, 
-0x1440000,0x34000001,0x34000001,0x34000001,0x27C0000,0x740000,0x6C0000,0x2900000,0xB40000,0xE40000,0x1080000,0x1900000,0x880000,0xA00000,0xE40000,0x34000001,0xE40000,0xA40001,0x2F40000,0x1F40000,0x52000000,0x2F40000,0x1F40000,0x52000000,0x1F40000,0x52000000,0x52000000,0x2F40000,0x1F40000,0x52000000,0x1F40000,0x52000000, -0x52000000,0x1F40000,0x52000000,0x52000000,0x52000000,0x2F40000,0x1F40000,0x52000000,0x1F40000,0x52000000,0x52000000,0x1F40000,0x52000000,0x52000000,0x52000000,0x1F40000,0x52000000,0x52000000,0x52000000,0x52000000,0xD00000,0xB00000,0xB00000,0x3140000,0x1940000,0x27FC0000,0x52000000,0x52000000,0xE00000,0x1380000,0x49C40000,0x52000000, -0x1600000,0x540EC9,0xC23805B4,0x6C3805B5,0x523805B4,0xB21C032D,0x762001A4,0x56240261,0x5E1C032D,0x501801A4,0x441C032D,0xA60005EA,0x7C000153,0x581001FE,0x620001B9,0x50000005,0x440401B9,0x500005ED,0x4A000234,0x3E0002A5,0x360005EA,0x780EC7,0x680006C8,0x520005B3,0x5800057F,0x4C0002A2,0x400003A7,0x4C000876,0x46000455,0x3C00044E,0x34000717,0xF40EC7, -0x380009AC,0x320008B1,0x30000A97,0x28000EC7,0xFE20025D,0xFA440995,0xFE4C0AC9,0xCC000001,0x8A000004,0x66000003,0x5408001C,0x4E000030,0xFE000225,0xA60000A2,0x54000194,0x3C00044E,0xAC0EC7,0x6C05ED,0xA6540154,0x66540154,0x52500154,0x943801A5,0x6C380002,0x52400029,0x563801A5,0x4C34004C,0x443801A5,0xA405EA,0x78040153,0x52280152,0x620001B9,0x50000005, -0x441001A5,0x14C05EA,0x4A000234,0x3E0002A5,0x360005EA,0xA405EA,0x78040153,0x52280152,0x620001B9,0x50000005,0x441001A5,0x14C05EA,0x4A000234,0x3E0002A5,0x360005EA,0x14C05EA,0x4A000234,0x3E0002A5,0x360005EA,0x360005EA,0xFC3000F8,0xF65C039D,0xFA640384,0xCC000001,0x8A000004,0x66000003,0x52100001,0x4E000030,0xFE08010A,0xA8000035,0x5400017B,0x3E0002A5, -0xE805EA,0x3805B4,0x3805B4,0x3805B4,0x3805B4,0x861C0188,0x861C0188,0x861C0188,0x481C0188,0x481C0188,0x341C0189,0x7C000152,0x7C000152,0x7C000152,0x50000001,0x50000001,0x36080042,0x3C000152,0x3C000152,0x3200002D,0x28000152,0x5005B3,0x5005B3,0x5005B3,0x4000024E,0x4000024E,0x3400019B,0x340002A5,0x340002A5,0x2E000122,0x280001FB,0xA005B3, -0xA005B3,0x26000389,0x200003B6,0x1A0005B3,0xF81400D1,0xFE2C0329,0x3805B4,0xC6000001,0x84000001,0x64000001,0x5A040009,0x48000004,0xE4000105,0x9600006A,0x4C00016B,0x2E000122,0x7005B3,0x500154,0x500154,0x500154,0x500154,0x6A380000,0x6A380000,0x6A380000,0x42380000,0x42380000,0x34380001,0x780152,0x780152,0x780152,0x4C080001,0x4C080001, -0x34200001,0xF40152,0xF40152,0x3200002D,0x28000152,0x780152,0x780152,0x780152,0x4C080001,0x4C080001,0x34200001,0xF40152,0xF40152,0x3200002D,0x28000152,0xF40152,0xF40152,0x3200002D,0x28000152,0x28000152,0xFE200019,0xFA440048,0x500154,0xC4040001,0x80040000,0x62040000,0x54100000,0x48000004,0xF4000034,0x9A000012,0xAC0152,0x3200002D, -0xAC0152,0x8801A5,0x8C6C0000,0x606C0000,0x526C0000,0xC801A5,0x6E340000,0x52500000,0x19801A5,0x50000004,0x440001A5,0xC801A5,0x6E340000,0x52500000,0x19801A5,0x50000004,0x440001A5,0x19801A5,0x50000004,0x440001A5,0x440001A5,0xC801A5,0x6E340000,0x52500000,0x19801A5,0x50000004,0x440001A5,0x19801A5,0x50000004,0x440001A5,0x440001A5,0x19801A5, -0x50000004,0x440001A5,0x440001A5,0x440001A5,0xF6400071,0x9001A5,0xF67C0091,0xCC000000,0x8A000004,0x66000002,0x52180000,0x5000001D,0xFE180055,0xAE00000A,0x56200000,0x440001A5,0x12001A5,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x1C0188,0x54000000,0x54000000,0x54000000,0x54000000,0x54000000, 
-0x54000000,0x28000000,0x28000000,0x28000000,0x1A000001,0x280188,0x280188,0x280188,0x280188,0x280188,0x280188,0x22000089,0x22000089,0x22000089,0x18000049,0x500188,0x500188,0x500188,0x100000F2,0xC00018A,0xFE0C0048,0x1C0188,0x1C0188,0xC4000000,0x88000000,0x68000000,0x68000000,0x44000000,0xA000007D,0x74000041,0x34000009,0x22000089, -0x380188,}; -static const uint32_t g_etc1_to_bc7_m6_table102[] = { -0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x240000, -0x240000,0x240000,0x240000,0x6000000,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xC0001,0xE0C0000,0xE0C0000,0xE0C0000,0x140000,0x1C0000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x480000,0x2680000,0x2680000,0x2680000,0x2680000,0x2680000, -0x2680000,0xD80000,0xD80000,0xD80000,0x22000001,0x2680000,0x2680000,0x2680000,0x2680000,0x2680000,0x2680000,0xD80000,0xD80000,0xD80000,0x22000001,0xD80000,0xD80000,0xD80000,0x22000001,0x22000001,0x4C0000,0x480000,0x480000,0x540000,0x2580000,0x600000,0x600000,0x780000,0x540000,0x2580000,0x980000,0xD80000, -0x980000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0xB80000,0xB80000,0xB80000,0x1740000,0x1740000,0x3C000001,0xB80000,0xB80000,0xB80000,0x1740000,0x1740000,0x3C000001,0x1740000,0x1740000,0x3C000001,0x3C000001,0xB80000,0xB80000,0xB80000,0x1740000,0x1740000,0x3C000001,0x1740000,0x1740000,0x3C000001,0x3C000001,0x1740000, -0x1740000,0x3C000001,0x3C000001,0x3C000001,0x900000,0x840000,0x7C0000,0xA80000,0xD00000,0x1080000,0x12C0000,0x1CC0000,0x9C0000,0xB80000,0x1080000,0x3C000001,0x1080000,0xB40001,0x30C0000,0xBFC0000,0x5A000000,0x30C0000,0xBFC0000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x30C0000,0xBFC0000,0x5A000000,0xBFC0000,0x5A000000, -0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0x30C0000,0xBFC0000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0xBFC0000,0x5A000000,0x5A000000,0x5A000000,0x5A000000,0xE40000,0x8C00000,0x8C00000,0x1300000,0x1BC0000,0x31FC0000,0x5A000000,0x5A000000,0xF80000,0x1540000,0x51D40000,0x5A000000, -0x1800000,0x600F1E,0xCA4805ED,0x764405ED,0x5A4405ED,0xBA2C034A,0x7E3001C5,0x5E340286,0x662C034A,0x582801BD,0x4C2C034A,0xAE1005EB,0x84100154,0x6020020F,0x6C0801B6,0x58100006,0x4C1401BE,0x5A0C05EB,0x5204020D,0x4604028A,0x3E0C05EB,0x900F1A,0x7A000675,0x5A1005EA,0x680004BE,0x5800021D,0x4C000362,0x580007E9,0x4E00036B,0x46000371,0x3C00069F,0x1240F1A, -0x44000939,0x3E0007FE,0x38000A17,0x30000F1A,0xFE3402CD,0xFE4C0A1A,0xF65C0B55,0xD4100002,0x920C0004,0x6E100004,0x5C180025,0x560C0023,0xFE100262,0xBC000021,0x5E000159,0x46000371,0xD00F1A,0x7C05ED,0xAE640154,0x6E640154,0x5A600154,0x9C4801A5,0x74480002,0x5A500029,0x5E4801A5,0x5444004C,0x4C4801A5,0xBC05EA,0x80140153,0x5A380152,0x700001A6,0x58100005, -0x4C2001A5,0x17C05EA,0x500001E8,0x48000266,0x3E0005EA,0xBC05EA,0x80140153,0x5A380152,0x700001A6,0x58100005,0x4C2001A5,0x17C05EA,0x500001E8,0x48000266,0x3E0005EA,0x17C05EA,0x500001E8,0x48000266,0x3E0005EA,0x3E0005EA,0xFE3C0121,0xFE6C039D,0xF27403AD,0xD4100001,0x920C0003,0x6E100003,0x5A200001,0x5800000D,0xFE180129,0xBC000008,0x5E000158,0x48000266, 
-0x10C05EA,0x4405ED,0x4405ED,0x4405ED,0x4405ED,0x8E2C01A5,0x8E2C01A5,0x8E2C01A5,0x502C01A5,0x502C01A5,0x3C2C01A6,0x84100153,0x84100153,0x84100153,0x58100002,0x58100002,0x3E18004B,0x44100153,0x44100153,0x380C002A,0x300C0153,0x6805EA,0x6805EA,0x6805EA,0x52000205,0x52000205,0x3C0401A6,0x40000248,0x40000248,0x3A0000A9,0x2E00019A,0xD005EA, -0xD005EA,0x2C000356,0x2A00037E,0x220005EA,0xFE2000FA,0xF63C0379,0x4405ED,0xD80C0001,0x8C100002,0x6C100002,0x6410000B,0x500C0002,0xFE0000B5,0xB6000015,0x5A040153,0x3A0000A9,0x9405EA,0x600154,0x600154,0x600154,0x600154,0x72480000,0x72480000,0x72480000,0x4A480000,0x4A480000,0x3C480001,0x900152,0x900152,0x900152,0x54180001,0x54180001, -0x3C300001,0x1240152,0x1240152,0x3A000019,0x30000152,0x900152,0x900152,0x900152,0x54180001,0x54180001,0x3C300001,0x1240152,0x1240152,0x3A000019,0x30000152,0x1240152,0x1240152,0x3A000019,0x30000152,0x30000152,0xFA340020,0xF2540055,0x600154,0xD80C0000,0x88140000,0x6A140000,0x5C200000,0x52080000,0xFE0C0032,0xBC000004,0xD00152,0x3A000019, -0xD00152,0x9801A5,0x947C0000,0x687C0000,0x5A7C0000,0xE001A5,0x76440000,0x5A600000,0x1CC01A5,0x5A040000,0x4C0001A5,0xE001A5,0x76440000,0x5A600000,0x1CC01A5,0x5A040000,0x4C0001A5,0x1CC01A5,0x5A040000,0x4C0001A5,0x4C0001A5,0xE001A5,0x76440000,0x5A600000,0x1CC01A5,0x5A040000,0x4C0001A5,0x1CC01A5,0x5A040000,0x4C0001A5,0x4C0001A5,0x1CC01A5, -0x5A040000,0x4C0001A5,0x4C0001A5,0x4C0001A5,0xFE500071,0xA401A5,0xFE8C0091,0xD4100000,0x98000000,0x70080000,0x5A280000,0x5800000D,0xFE2C0062,0xBC040002,0x5E300000,0x4C0001A5,0x14001A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x2C01A5,0x5C100001,0x5C100001,0x5C100001,0x5C100001,0x5C100001, -0x5C100001,0x30100001,0x30100001,0x30100001,0x220C0002,0x4001A5,0x4001A5,0x4001A5,0x4001A5,0x4001A5,0x4001A5,0x2E000050,0x2E000050,0x2E000050,0x22000011,0x7C01A5,0x7C01A5,0x7C01A5,0x1C0000C1,0x140001A5,0xF61C0062,0x2C01A5,0x2C01A5,0xCC100001,0x90100001,0x70100001,0x70100001,0x4C100001,0xE0000041,0xA400000D,0x40080001,0x2E000050, -0x5801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table103[] = { -0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x580000, -0x580000,0x580000,0x580000,0xE000000,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x200000,0x200000,0x200000,0x2C0000,0x3C0000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x2800000,0x2800000,0x2800000,0x2800000,0x2800000, -0x2800000,0x1080000,0x1080000,0x1080000,0x2A000001,0x2800000,0x2800000,0x2800000,0x2800000,0x2800000,0x2800000,0x1080000,0x1080000,0x1080000,0x2A000001,0x1080000,0x1080000,0x1080000,0x2A000001,0x2A000001,0x65C0000,0x580000,0x580000,0x4640000,0x26C0000,0x780000,0x780000,0x940000,0x4640000,0x26C0000,0xB80000,0x1080000, -0xB80000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0xD00000,0xD00000,0xD00000,0x1A40000,0x1A40000,0x44000001,0xD00000,0xD00000,0xD00000,0x1A40000,0x1A40000,0x44000001,0x1A40000,0x1A40000,0x44000001,0x44000001,0xD00000,0xD00000,0xD00000,0x1A40000,0x1A40000,0x44000001,0x1A40000,0x1A40000,0x44000001,0x44000001,0x1A40000, 
-0x1A40000,0x44000001,0x44000001,0x44000001,0x6A00000,0x2940000,0x8C0000,0x2BC0000,0xEC0000,0x1280000,0x1540000,0x3FC0000,0xB00000,0xD00000,0x1280000,0x44000001,0x1280000,0xC40001,0x3240000,0x17FC0000,0x62000000,0x3240000,0x17FC0000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x3240000,0x17FC0000,0x62000000,0x17FC0000,0x62000000, -0x62000000,0x17FC0000,0x62000000,0x62000000,0x62000000,0x3240000,0x17FC0000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x62000000,0x17FC0000,0x62000000,0x62000000,0x62000000,0x62000000,0xF80000,0xD40000,0xD40000,0x14C0000,0x1E40000,0x3BFC0000,0x62000000,0x62000000,0x10C0000,0x1740000,0x59E40000,0x62000000, -0x1A40000,0x700F1E,0xD25805ED,0x7E5405ED,0x625405ED,0xC23C034A,0x864001C5,0x66440286,0x6E3C034A,0x603801BD,0x543C034A,0xB62005EB,0x8C200154,0x6830020F,0x741801B6,0x60200006,0x542401BE,0x621C05EB,0x5A14020D,0x4E14028A,0x461C05EB,0xA80F1A,0x8C000615,0x622005EA,0x74000416,0x620001BA,0x5404034A,0x6200073B,0x5800028C,0x4E0002AB,0x4600062A,0x1580F1A, -0x50000899,0x4A000746,0x3E000983,0x38000F1A,0xFE440322,0xFA640A2E,0xFE6C0B55,0xDC200002,0x9A1C0004,0x76200004,0x64280025,0x5E1C0023,0xFE1C02BE,0xD0040000,0x66100159,0x4E0002AB,0xF00F1A,0x8C05ED,0xB6740154,0x76740154,0x62700154,0xA45801A5,0x7C580002,0x62600029,0x665801A5,0x5C54004C,0x545801A5,0x2D005EA,0x88240153,0x62480152,0x7A0C01A5,0x60200005, -0x543001A5,0x1AC05EA,0x5C0001A8,0x50000221,0x460005EA,0x2D005EA,0x88240153,0x62480152,0x7A0C01A5,0x60200005,0x543001A5,0x1AC05EA,0x5C0001A8,0x50000221,0x460005EA,0x1AC05EA,0x5C0001A8,0x50000221,0x460005EA,0x460005EA,0xFE500132,0xF67C03C5,0xFA8403AD,0xDC200001,0x9A1C0003,0x76200003,0x62300001,0x6008000C,0xFC300155,0xD0040000,0x68040152,0x50000221, -0x12C05EA,0x5405ED,0x5405ED,0x5405ED,0x5405ED,0x963C01A5,0x963C01A5,0x963C01A5,0x583C01A5,0x583C01A5,0x443C01A6,0x8C200153,0x8C200153,0x8C200153,0x60200002,0x60200002,0x4628004B,0x4C200153,0x4C200153,0x401C002A,0x381C0153,0x8005EA,0x8005EA,0x8005EA,0x620001BA,0x620001BA,0x441401A6,0x520001E4,0x520001E4,0x4200003B,0x38000162,0x10005EA, -0x10005EA,0x380002DE,0x32000303,0x2A0005EA,0xFC380111,0xFE4C0379,0x5405ED,0xE01C0001,0x94200002,0x74200002,0x6C20000B,0x581C0002,0xFE1400C8,0xD0040000,0x62140153,0x4200003B,0xB405EA,0x700154,0x700154,0x700154,0x700154,0x7A580000,0x7A580000,0x7A580000,0x52580000,0x52580000,0x44580001,0xA80152,0xA80152,0xA80152,0x5C280001,0x5C280001, -0x44400001,0x1580152,0x1580152,0x44000005,0x38000152,0xA80152,0xA80152,0xA80152,0x5C280001,0x5C280001,0x44400001,0x1580152,0x1580152,0x44000005,0x38000152,0x1580152,0x1580152,0x44000005,0x38000152,0x38000152,0xF6480029,0xFA640055,0x700154,0xE01C0000,0x90240000,0x72240000,0x64300000,0x5A180000,0xF624003D,0xD0040000,0xF00152,0x44000005, -0xF00152,0xA801A5,0x9C8C0000,0x708C0000,0x628C0000,0xF801A5,0x7E540000,0x62700000,0x1FC01A5,0x62140000,0x540001A5,0xF801A5,0x7E540000,0x62700000,0x1FC01A5,0x62140000,0x540001A5,0x1FC01A5,0x62140000,0x540001A5,0x540001A5,0xF801A5,0x7E540000,0x62700000,0x1FC01A5,0x62140000,0x540001A5,0x1FC01A5,0x62140000,0x540001A5,0x540001A5,0x1FC01A5, -0x62140000,0x540001A5,0x540001A5,0x540001A5,0xFA640080,0xB401A5,0xF69C00A4,0xDC200000,0xA0100000,0x78180000,0x62380000,0x60000005,0xF4480071,0xD0040000,0x66400000,0x540001A5,0x16401A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x3C01A5,0x64200001,0x64200001,0x64200001,0x64200001,0x64200001, 
-0x64200001,0x38200001,0x38200001,0x38200001,0x2A1C0002,0x5401A5,0x5401A5,0x5401A5,0x5401A5,0x5401A5,0x5401A5,0x3A000020,0x3A000020,0x3A000020,0x2A040001,0xAC01A5,0xAC01A5,0xAC01A5,0x26000092,0x1C0001A5,0xFE2C0062,0x3C01A5,0x3C01A5,0xD4200001,0x98200001,0x78200001,0x78200001,0x54200001,0xFA080029,0xCE040000,0x48180001,0x3A000020, -0x7801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table104[] = { -0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x8C0000, -0x8C0000,0x8C0000,0x8C0000,0x16000001,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x340000,0x340000,0x340000,0x2440000,0x640000,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x680001,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000, -0x9C0000,0x13C0000,0x13C0000,0x13C0000,0x34000000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x9C0000,0x13C0000,0x13C0000,0x13C0000,0x34000000,0x13C0000,0x13C0000,0x13C0000,0x34000000,0x34000000,0x700000,0x680001,0x680001,0x7C0000,0x840000,0x900000,0x900000,0xB00000,0x7C0000,0x840000,0xE00000,0x13C0000, -0xE00000,0x9C0001,0x9C0001,0x9C0001,0x9C0001,0x2E80000,0x2E80000,0x2E80000,0x1DC0000,0x1DC0000,0x4E000000,0x2E80000,0x2E80000,0x2E80000,0x1DC0000,0x1DC0000,0x4E000000,0x1DC0000,0x1DC0000,0x4E000000,0x4E000000,0x2E80000,0x2E80000,0x2E80000,0x1DC0000,0x1DC0000,0x4E000000,0x1DC0000,0x1DC0000,0x4E000000,0x4E000000,0x1DC0000, -0x1DC0000,0x4E000000,0x4E000000,0x4E000000,0xB80000,0xA80000,0x9C0001,0x2D40000,0x1080000,0x1500000,0x1800000,0x11F40000,0x2C40000,0x2E80000,0x1500000,0x4E000000,0x1500000,0xD80000,0x1400000,0x25F80000,0x6A000001,0x1400000,0x25F80000,0x6A000001,0x25F80000,0x6A000001,0x6A000001,0x1400000,0x25F80000,0x6A000001,0x25F80000,0x6A000001, -0x6A000001,0x25F80000,0x6A000001,0x6A000001,0x6A000001,0x1400000,0x25F80000,0x6A000001,0x25F80000,0x6A000001,0x6A000001,0x25F80000,0x6A000001,0x6A000001,0x6A000001,0x25F80000,0x6A000001,0x6A000001,0x6A000001,0x6A000001,0x50C0000,0xAE40000,0xAE40000,0x16C0000,0x9F80000,0x47F80000,0x6A000001,0x6A000001,0x3240000,0x1940000,0x63D80000,0x6A000001, -0x1CC0000,0x840F1A,0xDE6805EA,0x866805EA,0x6A6805EB,0xCC4C034A,0x8E5001C3,0x6E54028A,0x784C034A,0x684C01BE,0x5C4C034A,0xC23005EA,0x96300153,0x7040020D,0x7E2C01B2,0x6A300006,0x5E3401BD,0x6A3005EA,0x6224020F,0x58280286,0x4E3005ED,0xC40F1A,0x9E0005EB,0x6C3005EB,0x86000392,0x6C0801B6,0x5C18034A,0x740006B1,0x640001DC,0x5A000215,0x4E0005F1,0x18C0F1A, -0x5C0007F7,0x50000662,0x4A0008D9,0x40000F1E,0xFE580372,0xF4780A82,0xF67C0B9A,0xE6300000,0xA2300003,0x80300002,0x6C380023,0x662C0025,0xFE3402F6,0xD6180002,0x7024015B,0x5A000215,0x1180F1A,0xA005EA,0xC0840152,0x80840152,0x6A840153,0xAC6C01A6,0x86680001,0x6C74002A,0x706801A6,0x6668004B,0x5C6801A6,0xEC05EA,0x90380153,0x6A5C0153,0x841C01A5,0x6A340002, -0x5C4401A5,0x1E405EA,0x6600017D,0x5A0001F1,0x4E0005ED,0xEC05EA,0x90380153,0x6A5C0153,0x841C01A5,0x6A340002,0x5C4401A5,0x1E405EA,0x6600017D,0x5A0001F1,0x4E0005ED,0x1E405EA,0x6600017D,0x5A0001F1,0x4E0005ED,0x4E0005ED,0xFE640164,0xFE8C03CE,0xF49803D3,0xE6300000,0xA2300003,0x80300002,0x6C440002,0x6A1C000B,0xFE440171,0xD6180001,0x72140152,0x5A0001F1, 
-0x15405EA,0x6805EA,0x6805EA,0x6805EA,0x6805EA,0xA24C01A5,0xA24C01A5,0xA24C01A5,0x624C01A5,0x624C01A5,0x4E4C01A5,0x96300152,0x96300152,0x96300152,0x68300005,0x68300005,0x503C004C,0x56300152,0x56300152,0x4A300029,0x40300154,0x29805EA,0x29805EA,0x29805EA,0x740001A5,0x740001A5,0x4E2401A5,0x5E00018B,0x5E00018B,0x4E000004,0x40080154,0x13805EA, -0x13805EA,0x44000279,0x3C0002A5,0x320005ED,0xFC480129,0xF860039D,0x6805EA,0xE6300000,0xA0300002,0x80300002,0x7634000C,0x62300001,0xFE2800F2,0xD6180001,0x6C240152,0x4E000004,0xDC05EA,0x840152,0x840152,0x840152,0x840152,0x806C0001,0x806C0001,0x806C0001,0x5A6C0001,0x5A6C0001,0x4E680001,0xC40152,0xC40152,0xC40152,0x643C0001,0x643C0001, -0x4E500000,0x18C0152,0x18C0152,0x4E000000,0x40000154,0xC40152,0xC40152,0xC40152,0x643C0001,0x643C0001,0x4E500000,0x18C0152,0x18C0152,0x4E000000,0x40000154,0x18C0152,0x18C0152,0x4E000000,0x40000154,0x40000154,0xFE580029,0xF4780062,0x840152,0xE6300000,0x98380000,0x7A380001,0x6E400000,0x622C0000,0xFE34003D,0xD6180000,0x1180152,0x4E000000, -0x1180152,0xBC01A5,0xA4A00001,0x78A00001,0x6A9C0002,0x11401A5,0x86680000,0x6A840001,0xFF801A5,0x6A2C0001,0x5C0001A5,0x11401A5,0x86680000,0x6A840001,0xFF801A5,0x6A2C0001,0x5C0001A5,0xFF801A5,0x6A2C0001,0x5C0001A5,0x5C0001A5,0x11401A5,0x86680000,0x6A840001,0xFF801A5,0x6A2C0001,0x5C0001A5,0xFF801A5,0x6A2C0001,0x5C0001A5,0x5C0001A5,0xFF801A5, -0x6A2C0001,0x5C0001A5,0x5C0001A5,0x5C0001A5,0xFE780080,0xC801A5,0xFEAC00AA,0xE2340000,0xA8240000,0x82280000,0x6A4C0001,0x6A000001,0xFC580071,0xD8180000,0x70500000,0x5C0001A5,0x18C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x4C01A5,0x6E300000,0x6E300000,0x6E300000,0x6E300000,0x6E300000, -0x6E300000,0x42300000,0x42300000,0x42300000,0x34300000,0x7001A5,0x7001A5,0x7001A5,0x7001A5,0x7001A5,0x7001A5,0x4C000002,0x4C000002,0x4C000002,0x34140000,0xE401A5,0xE401A5,0xE401A5,0x30000061,0x260001A5,0xF8400071,0x4C01A5,0x4C01A5,0xE2300000,0xA4300000,0x84300000,0x84300000,0x5E300000,0xFC1C0032,0xCE180001,0x52280000,0x4C000002, -0xA001A5,}; -static const uint32_t g_etc1_to_bc7_m6_table105[] = { -0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0x25C0000,0xBC0000, -0xBC0000,0xBC0000,0xBC0000,0x1E000001,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x440000,0x440000,0x440000,0x25C0000,0x880000,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0x780001,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000, -0xB40000,0x1700000,0x1700000,0x1700000,0x3C000000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x1700000,0x1700000,0x1700000,0x3C000000,0x1700000,0x1700000,0x1700000,0x3C000000,0x3C000000,0x8800000,0x780001,0x780001,0x28C0000,0x980000,0x2A40000,0x2A40000,0xCC0000,0x28C0000,0x980000,0x1000000,0x1700000, -0x1000000,0xAC0001,0xAC0001,0xAC0001,0xAC0001,0x3000000,0x3000000,0x3000000,0x5FC0000,0x5FC0000,0x56000000,0x3000000,0x3000000,0x3000000,0x5FC0000,0x5FC0000,0x56000000,0x5FC0000,0x5FC0000,0x56000000,0x56000000,0x3000000,0x3000000,0x3000000,0x5FC0000,0x5FC0000,0x56000000,0x5FC0000,0x5FC0000,0x56000000,0x56000000,0x5FC0000, 
-0x5FC0000,0x56000000,0x56000000,0x56000000,0x4C80000,0x4B80000,0xAC0001,0xEC0000,0x1240000,0x1700000,0x1A80000,0x1BF80000,0x2D80000,0x3000000,0x1700000,0x56000000,0x1700000,0xE80000,0x1580000,0x31F80000,0x72000001,0x1580000,0x31F80000,0x72000001,0x31F80000,0x72000001,0x72000001,0x1580000,0x31F80000,0x72000001,0x31F80000,0x72000001, -0x72000001,0x31F80000,0x72000001,0x72000001,0x72000001,0x1580000,0x31F80000,0x72000001,0x31F80000,0x72000001,0x72000001,0x31F80000,0x72000001,0x72000001,0x72000001,0x31F80000,0x72000001,0x72000001,0x72000001,0x72000001,0x5200000,0xF80000,0xF80000,0x1840000,0x15FC0000,0x51F80000,0x72000001,0x72000001,0x13C0000,0x1B40000,0x6BE80000,0x72000001, -0x1EC0000,0x940F1A,0xE67805EA,0x8E7805EA,0x727805EB,0xD45C034A,0x966001C3,0x7664028A,0x805C034A,0x705C01BE,0x645C034A,0xCA4005EA,0x9E400153,0x7850020D,0x863C01B2,0x72400006,0x664401BD,0x724005EA,0x6A34020F,0x60380286,0x564005ED,0xDC0F1A,0xA61005EB,0x744005EB,0x9200035A,0x741801B6,0x6428034A,0x80000651,0x7000017C,0x620001D5,0x580805ED,0x1BC0F1A, -0x66000786,0x5C0005BA,0x50000861,0x48000F1E,0xFE6403C8,0xFC880A82,0xFE8C0B9A,0xEE400000,0xAA400003,0x88400002,0x74480023,0x6E3C0025,0xFE440361,0xDE280002,0x7834015B,0x620001D5,0x1380F1A,0xB005EA,0xC8940152,0x88940152,0x72940153,0xB47C01A6,0x8E780001,0x7484002A,0x787801A6,0x6E78004B,0x647801A6,0x10405EA,0x98480153,0x726C0153,0x8C2C01A5,0x72440002, -0x645401A5,0x7FC05EA,0x70000163,0x620001D5,0x560005ED,0x10405EA,0x98480153,0x726C0153,0x8C2C01A5,0x72440002,0x645401A5,0x7FC05EA,0x70000163,0x620001D5,0x560005ED,0x7FC05EA,0x70000163,0x620001D5,0x560005ED,0x560005ED,0xFE780179,0xF8A003EA,0xFCA803D3,0xEE400000,0xAA400003,0x88400002,0x74540002,0x722C000B,0xFE5C0189,0xDE280001,0x7A240152,0x620001D5, -0x17405EA,0x7805EA,0x7805EA,0x7805EA,0x7805EA,0xAA5C01A5,0xAA5C01A5,0xAA5C01A5,0x6A5C01A5,0x6A5C01A5,0x565C01A5,0x9E400152,0x9E400152,0x9E400152,0x70400005,0x70400005,0x584C004C,0x5E400152,0x5E400152,0x52400029,0x48400154,0x2B005EA,0x2B005EA,0x2B005EA,0x7C1001A5,0x7C1001A5,0x563401A5,0x6A000163,0x6A000163,0x56080002,0x48180154,0x16805EA, -0x16805EA,0x4E000239,0x44000248,0x3A0005ED,0xFE580149,0xFE6C03A5,0x7805EA,0xEE400000,0xA8400002,0x88400002,0x7E44000C,0x6A400001,0xFE3C010A,0xDE280001,0x74340152,0x56080002,0xFC05EA,0x940152,0x940152,0x940152,0x940152,0x887C0001,0x887C0001,0x887C0001,0x627C0001,0x627C0001,0x56780001,0xDC0152,0xDC0152,0xDC0152,0x6C4C0001,0x6C4C0001, -0x56600000,0x1BC0152,0x1BC0152,0x56100000,0x48000154,0xDC0152,0xDC0152,0xDC0152,0x6C4C0001,0x6C4C0001,0x56600000,0x1BC0152,0x1BC0152,0x56100000,0x48000154,0x1BC0152,0x1BC0152,0x56100000,0x48000154,0x48000154,0xFA6C0032,0xFC880062,0x940152,0xEE400000,0xA0480000,0x82480001,0x76500000,0x6A3C0000,0xFA48004A,0xDE280000,0x1380152,0x56100000, -0x1380152,0xCC01A5,0xACB00001,0x80B00001,0x72AC0002,0x12C01A5,0x8E780000,0x72940001,0x1BF801A5,0x723C0001,0x640001A5,0x12C01A5,0x8E780000,0x72940001,0x1BF801A5,0x723C0001,0x640001A5,0x1BF801A5,0x723C0001,0x640001A5,0x640001A5,0x12C01A5,0x8E780000,0x72940001,0x1BF801A5,0x723C0001,0x640001A5,0x1BF801A5,0x723C0001,0x640001A5,0x640001A5,0x1BF801A5, -0x723C0001,0x640001A5,0x640001A5,0x640001A5,0xFA8C0091,0xD801A5,0xF8C000B5,0xEA440000,0xB0340000,0x8A380000,0x725C0001,0x72100001,0xFC6C0082,0xE0280000,0x78600000,0x640001A5,0x1AC01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x5C01A5,0x76400000,0x76400000,0x76400000,0x76400000,0x76400000, 
-0x76400000,0x4A400000,0x4A400000,0x4A400000,0x3C400000,0x8801A5,0x8801A5,0x8801A5,0x8801A5,0x8801A5,0x8801A5,0x58080000,0x58080000,0x58080000,0x3C240000,0x11401A5,0x11401A5,0x11401A5,0x38000034,0x2E0001A5,0xFE4C0075,0x5C01A5,0x5C01A5,0xEA400000,0xAC400000,0x8C400000,0x8C400000,0x66400000,0xF830003D,0xD6280001,0x5A380000,0x58080000, -0xC001A5,}; -static const uint32_t g_etc1_to_bc7_m6_table106[] = { -0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0x2740000,0xF00000, -0xF00000,0xF00000,0xF00000,0x26000001,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0x2540000,0x2540000,0x2540000,0x2740000,0xA80000,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0x880001,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000, -0xCC0000,0x1A00000,0x1A00000,0x1A00000,0x44000000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0x1A00000,0x1A00000,0x1A00000,0x44000000,0x1A00000,0x1A00000,0x1A00000,0x44000000,0x44000000,0x940000,0x880001,0x880001,0xA00000,0xAC0000,0xBC0000,0xBC0000,0xE80000,0xA00000,0xAC0000,0x1240000,0x1A00000, -0x1240000,0xBC0001,0xBC0001,0xBC0001,0xBC0001,0x3180000,0x3180000,0x3180000,0x11FC0000,0x11FC0000,0x5E000000,0x3180000,0x3180000,0x3180000,0x11FC0000,0x11FC0000,0x5E000000,0x11FC0000,0x11FC0000,0x5E000000,0x5E000000,0x3180000,0x3180000,0x3180000,0x11FC0000,0x11FC0000,0x5E000000,0x11FC0000,0x11FC0000,0x5E000000,0x5E000000,0x11FC0000, -0x11FC0000,0x5E000000,0x5E000000,0x5E000000,0xDC0000,0xCC80000,0xBC0001,0x3000000,0x1400000,0x1940000,0x1D00000,0x25FC0000,0x2EC0000,0x3180000,0x1940000,0x5E000000,0x1940000,0xF80000,0x1700000,0x3DF80000,0x7A000001,0x1700000,0x3DF80000,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x1700000,0x3DF80000,0x7A000001,0x3DF80000,0x7A000001, -0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x1700000,0x3DF80000,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x3DF80000,0x7A000001,0x7A000001,0x7A000001,0x7A000001,0x5340000,0x1080000,0x1080000,0x1A00000,0x23FC0000,0x5BF80000,0x7A000001,0x7A000001,0x1500000,0x1D00000,0x73F80000,0x7A000001, -0x9FC0000,0xA40F1A,0xEE8805EA,0x968805EA,0x7A8805EB,0xDC6C034A,0x9E7001C3,0x7E74028A,0x886C034A,0x786C01BE,0x6C6C034A,0xD25005EA,0xA6500153,0x8060020D,0x8E4C01B2,0x7A500006,0x6E5401BD,0x7A5005EA,0x7244020F,0x68480286,0x5E5005ED,0xF40F1A,0xAE2005EB,0x7C5005EB,0xA004034A,0x7C2801B6,0x6C38034A,0x8C000611,0x7A000155,0x6A0801C5,0x601805ED,0x1F00F1A, -0x6C000716,0x66000542,0x5C0007E9,0x50000F1E,0xFE780419,0xF4980ADA,0xF8A00BDB,0xF6500000,0xB2500003,0x90500002,0x7C580023,0x764C0025,0xFE5C03A2,0xE6380002,0x8044015B,0x6A0801C5,0x15C0F1A,0xC005EA,0xD0A40152,0x90A40152,0x7AA40153,0xBC8C01A6,0x96880001,0x7C94002A,0x808801A6,0x7688004B,0x6C8801A6,0x11C05EA,0xA0580153,0x7A7C0153,0x943C01A5,0x7A540002, -0x6C6401A5,0x13FC05EA,0x7A000155,0x6C0001B5,0x5E0005ED,0x11C05EA,0xA0580153,0x7A7C0153,0x943C01A5,0x7A540002,0x6C6401A5,0x13FC05EA,0x7A000155,0x6C0001B5,0x5E0005ED,0x13FC05EA,0x7A000155,0x6C0001B5,0x5E0005ED,0x5E0005ED,0xFC9001A9,0xFEAC03FE,0xF4B803FE,0xF6500000,0xB2500003,0x90500002,0x7C640002,0x7A3C000B,0xFE7001C3,0xE6380001,0x82340152,0x6C0001B5, 
-0x19805EA,0x8805EA,0x8805EA,0x8805EA,0x8805EA,0xB26C01A5,0xB26C01A5,0xB26C01A5,0x726C01A5,0x726C01A5,0x5E6C01A5,0xA6500152,0xA6500152,0xA6500152,0x78500005,0x78500005,0x605C004C,0x66500152,0x66500152,0x5A500029,0x50500154,0xC805EA,0xC805EA,0xC805EA,0x842001A5,0x842001A5,0x5E4401A5,0x78040153,0x78040153,0x5E180002,0x50280154,0x19805EA, -0x19805EA,0x560001FD,0x4E00020D,0x420005ED,0xFE680164,0xF88003C2,0x8805EA,0xF6500000,0xB0500002,0x90500002,0x8654000C,0x72500001,0xFC4C0123,0xE6380001,0x7C440152,0x5E180002,0x12005EA,0xA40152,0xA40152,0xA40152,0xA40152,0x908C0001,0x908C0001,0x908C0001,0x6A8C0001,0x6A8C0001,0x5E880001,0xF40152,0xF40152,0xF40152,0x745C0001,0x745C0001, -0x5E700000,0x1F00152,0x1F00152,0x5E200000,0x50000154,0xF40152,0xF40152,0xF40152,0x745C0001,0x745C0001,0x5E700000,0x1F00152,0x1F00152,0x5E200000,0x50000154,0x1F00152,0x1F00152,0x5E200000,0x50000154,0x50000154,0xF680003D,0xF4980071,0xA40152,0xF6500000,0xA8580000,0x8A580001,0x7E600000,0x724C0000,0xF2600055,0xE6380000,0x15C0152,0x5E200000, -0x15C0152,0xDC01A5,0xB4C00001,0x88C00001,0x7ABC0002,0x14401A5,0x96880000,0x7AA40001,0x27F801A5,0x7A4C0001,0x6C0001A5,0x14401A5,0x96880000,0x7AA40001,0x27F801A5,0x7A4C0001,0x6C0001A5,0x27F801A5,0x7A4C0001,0x6C0001A5,0x6C0001A5,0x14401A5,0x96880000,0x7AA40001,0x27F801A5,0x7A4C0001,0x6C0001A5,0x27F801A5,0x7A4C0001,0x6C0001A5,0x6C0001A5,0x27F801A5, -0x7A4C0001,0x6C0001A5,0x6C0001A5,0x6C0001A5,0xFAA000A2,0xE801A5,0xFECC00C1,0xF2540000,0xB8440000,0x92480000,0x7A6C0001,0x7A200001,0xF6880091,0xE8380000,0x80700000,0x6C0001A5,0x1D001A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x6C01A5,0x7E500000,0x7E500000,0x7E500000,0x7E500000,0x7E500000, -0x7E500000,0x52500000,0x52500000,0x52500000,0x44500000,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0x60180000,0x60180000,0x60180000,0x44340000,0x14401A5,0x14401A5,0x14401A5,0x4200001D,0x360001A5,0xF8600080,0x6C01A5,0x6C01A5,0xF2500000,0xB4500000,0x94500000,0x94500000,0x6E500000,0xFE3C0041,0xDE380001,0x62480000,0x60180000, -0xE401A5,}; -static const uint32_t g_etc1_to_bc7_m6_table107[] = { -0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x1200000, -0x1200000,0x1200000,0x1200000,0x2E000001,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0xA640000,0xA640000,0xA640000,0x28C0000,0xCC0000,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0x980001,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000, -0xE40000,0x1D00000,0x1D00000,0x1D00000,0x4C000000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0x1D00000,0x1D00000,0x1D00000,0x4C000000,0x1D00000,0x1D00000,0x1D00000,0x4C000000,0x4C000000,0xA40000,0x980001,0x980001,0xB40000,0xC00000,0xD00000,0xD00000,0x3000000,0xB40000,0xC00000,0x1480000,0x1D00000, -0x1480000,0xCC0001,0xCC0001,0xCC0001,0xCC0001,0x3300000,0x3300000,0x3300000,0x1DFC0000,0x1DFC0000,0x66000000,0x3300000,0x3300000,0x3300000,0x1DFC0000,0x1DFC0000,0x66000000,0x1DFC0000,0x1DFC0000,0x66000000,0x66000000,0x3300000,0x3300000,0x3300000,0x1DFC0000,0x1DFC0000,0x66000000,0x1DFC0000,0x1DFC0000,0x66000000,0x66000000,0x1DFC0000, 
-0x1DFC0000,0x66000000,0x66000000,0x66000000,0xF00000,0xDC0000,0xCC0001,0x1180000,0x1580000,0x1B40000,0x1F80000,0x31F80000,0x3000000,0x3300000,0x1B40000,0x66000000,0x1B40000,0x1080000,0x1880000,0x49F80000,0x82000001,0x1880000,0x49F80000,0x82000001,0x49F80000,0x82000001,0x82000001,0x1880000,0x49F80000,0x82000001,0x49F80000,0x82000001, -0x82000001,0x49F80000,0x82000001,0x82000001,0x82000001,0x1880000,0x49F80000,0x82000001,0x49F80000,0x82000001,0x82000001,0x49F80000,0x82000001,0x82000001,0x82000001,0x49F80000,0x82000001,0x82000001,0x82000001,0x82000001,0x14C0000,0x5180000,0x5180000,0x1BC0000,0x31FC0000,0x65F80000,0x82000001,0x82000001,0x1680000,0x1F00000,0x7DCC0000,0x82000001, -0x19FC0000,0xB40F1A,0xF69805EA,0x9E9805EA,0x829805EB,0xE47C034A,0xA68001C3,0x8684028A,0x907C034A,0x807C01BE,0x747C034A,0xDA6005EA,0xAE600153,0x8870020D,0x965C01B2,0x82600006,0x766401BD,0x826005EA,0x7A54020F,0x70580286,0x666005ED,0x10C0F1A,0xB63005EB,0x846005EB,0xA814034A,0x843801B6,0x7448034A,0x980005F1,0x820C0154,0x721801C5,0x682805ED,0xBF80F1A, -0x780006AE,0x6C0004B6,0x64000795,0x58000F1E,0xFE8C046E,0xFCA80ADA,0xFEAC0BDF,0xFE600000,0xBA600003,0x98600002,0x84680023,0x7E5C0025,0xFE700403,0xEE480002,0x8854015B,0x721801C5,0x17C0F1A,0xD005EA,0xD8B40152,0x98B40152,0x82B40153,0xC49C01A6,0x9E980001,0x84A4002A,0x889801A6,0x7E98004B,0x749801A6,0x13405EA,0xA8680153,0x828C0153,0x9C4C01A5,0x82640002, -0x747401A5,0x1FF805EA,0x820C0153,0x740001A9,0x660005ED,0x13405EA,0xA8680153,0x828C0153,0x9C4C01A5,0x82640002,0x747401A5,0x1FF805EA,0x820C0153,0x740001A9,0x660005ED,0x1FF805EA,0x820C0153,0x740001A9,0x660005ED,0x660005ED,0xFEA001C8,0xFAC40412,0xFCC803FE,0xFE600000,0xBA600003,0x98600002,0x84740002,0x824C000B,0xFC8401E2,0xEE480001,0x8A440152,0x740001A9, -0x1B805EA,0x9805EA,0x9805EA,0x9805EA,0x9805EA,0xBA7C01A5,0xBA7C01A5,0xBA7C01A5,0x7A7C01A5,0x7A7C01A5,0x667C01A5,0xAE600152,0xAE600152,0xAE600152,0x80600005,0x80600005,0x686C004C,0x6E600152,0x6E600152,0x62600029,0x58600154,0xE005EA,0xE005EA,0xE005EA,0x8C3001A5,0x8C3001A5,0x665401A5,0x80140153,0x80140153,0x66280002,0x58380154,0x1CC05EA, -0x1CC05EA,0x600001D5,0x560001C8,0x4A0005ED,0xFA7C0191,0xFE8C03CE,0x9805EA,0xFE600000,0xB8600002,0x98600002,0x8E64000C,0x7A600001,0xFC600152,0xEE480001,0x84540152,0x66280002,0x14005EA,0xB40152,0xB40152,0xB40152,0xB40152,0x989C0001,0x989C0001,0x989C0001,0x729C0001,0x729C0001,0x66980001,0x10C0152,0x10C0152,0x10C0152,0x7C6C0001,0x7C6C0001, -0x66800000,0xBF80152,0xBF80152,0x66300000,0x58000154,0x10C0152,0x10C0152,0x10C0152,0x7C6C0001,0x7C6C0001,0x66800000,0xBF80152,0xBF80152,0x66300000,0x58000154,0xBF80152,0xBF80152,0x66300000,0x58000154,0x58000154,0xFE90003D,0xFCA80071,0xB40152,0xFE600000,0xB0680000,0x92680001,0x86700000,0x7A5C0000,0xFA700055,0xEE480000,0x17C0152,0x66300000, -0x17C0152,0xEC01A5,0xBCD00001,0x90D00001,0x82CC0002,0x15C01A5,0x9E980000,0x82B40001,0x33F801A5,0x825C0001,0x740001A5,0x15C01A5,0x9E980000,0x82B40001,0x33F801A5,0x825C0001,0x740001A5,0x33F801A5,0x825C0001,0x740001A5,0x740001A5,0x15C01A5,0x9E980000,0x82B40001,0x33F801A5,0x825C0001,0x740001A5,0x33F801A5,0x825C0001,0x740001A5,0x740001A5,0x33F801A5, -0x825C0001,0x740001A5,0x740001A5,0x740001A5,0xFCB000A4,0xFC01A5,0xF8E000CA,0xFA640000,0xC0540000,0x9A580000,0x827C0001,0x82300001,0xFE980091,0xF0480000,0x88800000,0x740001A5,0x1F001A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x7C01A5,0x86600000,0x86600000,0x86600000,0x86600000,0x86600000, 
-0x86600000,0x5A600000,0x5A600000,0x5A600000,0x4C600000,0xB801A5,0xB801A5,0xB801A5,0xB801A5,0xB801A5,0xB801A5,0x68280000,0x68280000,0x68280000,0x4C440000,0x17401A5,0x17401A5,0x17401A5,0x4A000008,0x3E0001A5,0xFE6C0088,0x7C01A5,0x7C01A5,0xFA600000,0xBC600000,0x9C600000,0x9C600000,0x76600000,0xFA50004A,0xE6480001,0x6A580000,0x68280000, -0x10801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table108[] = { -0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0x1580000, -0x1580000,0x1580000,0x1580000,0x38000000,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x4780000,0x4780000,0x4780000,0xA80000,0xF00000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0xAC0000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000, -0x1000000,0x5F80000,0x5F80000,0x5F80000,0x54000001,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x1000000,0x5F80000,0x5F80000,0x5F80000,0x54000001,0x5F80000,0x5F80000,0x5F80000,0x54000001,0x54000001,0xB80000,0xAC0000,0xAC0000,0xC80000,0xD80000,0x2E80000,0x2E80000,0x1200000,0xC80000,0xD80000,0x16C0000,0x5F80000, -0x16C0000,0xE00000,0xE00000,0xE00000,0xE00000,0x14C0000,0x14C0000,0x14C0000,0x2BF80000,0x2BF80000,0x6E000001,0x14C0000,0x14C0000,0x14C0000,0x2BF80000,0x2BF80000,0x6E000001,0x2BF80000,0x2BF80000,0x6E000001,0x6E000001,0x14C0000,0x14C0000,0x14C0000,0x2BF80000,0x2BF80000,0x6E000001,0x2BF80000,0x2BF80000,0x6E000001,0x6E000001,0x2BF80000, -0x2BF80000,0x6E000001,0x6E000001,0x6E000001,0x1040000,0xEEC0000,0xE00000,0x1300000,0x1780000,0x1DC0000,0xFFC0000,0x3DF80000,0x1180000,0x14C0000,0x1DC0000,0x6E000001,0x1DC0000,0x1180001,0x1A40000,0x57F80000,0x8C000000,0x1A40000,0x57F80000,0x8C000000,0x57F80000,0x8C000000,0x8C000000,0x1A40000,0x57F80000,0x8C000000,0x57F80000,0x8C000000, -0x8C000000,0x57F80000,0x8C000000,0x8C000000,0x8C000000,0x1A40000,0x57F80000,0x8C000000,0x57F80000,0x8C000000,0x8C000000,0x57F80000,0x8C000000,0x8C000000,0x8C000000,0x57F80000,0x8C000000,0x8C000000,0x8C000000,0x8C000000,0x3600000,0x12C0000,0x12C0000,0x1D80000,0x3FFC0000,0x71F40000,0x8C000000,0x8C000000,0x1800000,0xDFC0000,0x85FC0000,0x8C000000, -0x29FC0000,0xC40F1E,0xFCAC05ED,0xA8A805ED,0x8CA805ED,0xEC90034A,0xB09401C5,0x90980286,0x9890034A,0x8A8C01BD,0x7E90034A,0xE07405EB,0xB6740154,0x9284020F,0x9E6C01B6,0x8A740006,0x7E7801BE,0x8C7005EB,0x8468020D,0x7868028A,0x707005EB,0x3240F1A,0xC04005EB,0x8C7405EA,0xB224034A,0x8E4C01B2,0x7E58034A,0xA40C05EB,0x8C1C0153,0x7C2C01C3,0x703C05EA,0x17FC0F1A, -0x8400065A,0x7800043A,0x6C00071A,0x62000F1A,0xFEA004CC,0xF6BC0B32,0xF8C00C1E,0xFE74000E,0xC4700004,0xA0740004,0x8E7C0025,0x88700023,0xFE800477,0xFA580000,0x90640159,0x7C2C01C3,0x1A40F1A,0xE005ED,0xE0C80154,0xA0C80154,0x8CC40154,0xCEAC01A5,0xA6AC0002,0x8CB40029,0x90AC01A5,0x86A8004C,0x7EAC01A5,0x15005EA,0xB2780153,0x8C9C0152,0xA46001A5,0x8A740005, -0x7E8401A5,0x2DF805EA,0x8C180152,0x7E0401A5,0x700005EA,0x15005EA,0xB2780153,0x8C9C0152,0xA46001A5,0x8A740005,0x7E8401A5,0x2DF805EA,0x8C180152,0x7E0401A5,0x700005EA,0x2DF805EA,0x8C180152,0x7E0401A5,0x700005EA,0x700005EA,0xFEB401E6,0xF2D4043D,0xF6DC0428,0xFE780005,0xC4700003,0xA0740003,0x8C840001,0x8A5C000C,0xFE980202,0xFA580000,0x92580152,0x7E0401A5, 
-0x1E005EA,0xA805ED,0xA805ED,0xA805ED,0xA805ED,0xC09001A5,0xC09001A5,0xC09001A5,0x829001A5,0x829001A5,0x6E9001A6,0xB6740153,0xB6740153,0xB6740153,0x8A740002,0x8A740002,0x707C004B,0x76740153,0x76740153,0x6A70002A,0x62700153,0xFC05EA,0xFC05EA,0xFC05EA,0x964001A5,0x964001A5,0x6E6801A6,0x88280153,0x88280153,0x703C0001,0x62480152,0x3F805EA, -0x3F805EA,0x6C0001B2,0x6000019A,0x540005EA,0xFE9001AE,0xFAA403EA,0xA805ED,0xFE740005,0xBE740002,0x9E740002,0x9674000B,0x82700002,0xFE74016D,0xFA580000,0x8C680153,0x703C0001,0x16805EA,0xC40154,0xC40154,0xC40154,0xC40154,0xA4AC0000,0xA4AC0000,0xA4AC0000,0x7CAC0000,0x7CAC0000,0x6EAC0001,0x3240152,0x3240152,0x3240152,0x867C0001,0x867C0001, -0x6E940001,0x17FC0152,0x17FC0152,0x6E440001,0x62000152,0x3240152,0x3240152,0x3240152,0x867C0001,0x867C0001,0x6E940001,0x17FC0152,0x17FC0152,0x6E440001,0x62000152,0x17FC0152,0x17FC0152,0x6E440001,0x62000152,0x62000152,0xFEA0004A,0xF6BC0080,0xC40154,0xFE780001,0xBA780000,0x9C780000,0x8E840000,0x846C0000,0xF8880062,0xFA580000,0x1A40152,0x6E440001, -0x1A40152,0xFC01A5,0xC6E00000,0x9AE00000,0x8CE00000,0x17801A5,0xA8A80000,0x8CC40000,0x3FFC01A5,0x8C680000,0x7E0001A5,0x17801A5,0xA8A80000,0x8CC40000,0x3FFC01A5,0x8C680000,0x7E0001A5,0x3FFC01A5,0x8C680000,0x7E0001A5,0x7E0001A5,0x17801A5,0xA8A80000,0x8CC40000,0x3FFC01A5,0x8C680000,0x7E0001A5,0x3FFC01A5,0x8C680000,0x7E0001A5,0x7E0001A5,0x3FFC01A5, -0x8C680000,0x7E0001A5,0x7E0001A5,0x7E0001A5,0xFAC800B5,0x10C01A5,0xF2F400DD,0xFC7C0002,0xCA640000,0xA26C0000,0x8C8C0000,0x8C3C0000,0xFEAC00A4,0xFA580000,0x90940000,0x7E0001A5,0xDFC01A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x9001A5,0x8E740001,0x8E740001,0x8E740001,0x8E740001,0x8E740001, -0x8E740001,0x62740001,0x62740001,0x62740001,0x54700002,0xD401A5,0xD401A5,0xD401A5,0xD401A5,0xD401A5,0xD401A5,0x703C0000,0x703C0000,0x703C0000,0x54580001,0x1AC01A5,0x1AC01A5,0x1AC01A5,0x54000001,0x460001A5,0xFA840091,0x9001A5,0x9001A5,0xFE740001,0xC2740001,0xA2740001,0xA2740001,0x7E740001,0xF8680055,0xF8580000,0x726C0001,0x703C0000, -0x12C01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table109[] = { -0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0x1880000, -0x1880000,0x1880000,0x1880000,0x40000000,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0x800001,0xC880000,0xC880000,0xC880000,0xC00000,0x1140000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000, -0x1180000,0x11F80000,0x11F80000,0x11F80000,0x5C000001,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x11F80000,0x11F80000,0x11F80000,0x5C000001,0x11F80000,0x11F80000,0x11F80000,0x5C000001,0x5C000001,0xC80000,0xBC0000,0xBC0000,0x4D80000,0xEC0000,0x1000000,0x1000000,0x13C0000,0x4D80000,0xEC0000,0x1900000,0x11F80000, -0x1900000,0xF00000,0xF00000,0xF00000,0xF00000,0x1640000,0x1640000,0x1640000,0x37F80000,0x37F80000,0x76000001,0x1640000,0x1640000,0x1640000,0x37F80000,0x37F80000,0x76000001,0x37F80000,0x37F80000,0x76000001,0x76000001,0x1640000,0x1640000,0x1640000,0x37F80000,0x37F80000,0x76000001,0x37F80000,0x37F80000,0x76000001,0x76000001,0x37F80000, 
-0x37F80000,0x76000001,0x76000001,0x76000001,0x1180000,0x1000000,0xF00000,0x3440000,0x1940000,0x1FC0000,0x1DF80000,0x47FC0000,0x12C0000,0x1640000,0x1FC0000,0x76000001,0x1FC0000,0x1280001,0x1BC0000,0x61FC0000,0x94000000,0x1BC0000,0x61FC0000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x1BC0000,0x61FC0000,0x94000000,0x61FC0000,0x94000000, -0x94000000,0x61FC0000,0x94000000,0x94000000,0x94000000,0x1BC0000,0x61FC0000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x94000000,0x61FC0000,0x94000000,0x94000000,0x94000000,0x94000000,0x3740000,0x73C0000,0x73C0000,0x1F40000,0x4DFC0000,0x7BF40000,0x94000000,0x94000000,0x1940000,0x1DFC0000,0x8FD00000,0x94000000, -0x39FC0000,0xD40F1E,0xFEBC05F1,0xB0B805ED,0x94B805ED,0xF4A0034A,0xB8A401C5,0x98A80286,0xA0A0034A,0x929C01BD,0x86A0034A,0xE88405EB,0xBE840154,0x9A94020F,0xA67C01B6,0x92840006,0x868801BE,0x948005EB,0x8C78020D,0x8078028A,0x788005EB,0x33C0F1A,0xC85005EB,0x948405EA,0xBA34034A,0x965C01B2,0x8668034A,0xAC1C05EB,0x942C0153,0x843C01C3,0x784C05EA,0x23FC0F1A, -0x8E000629,0x820003F2,0x760006D7,0x6A000F1A,0xFEB40537,0xFECC0B32,0xFECC0C36,0xFE880026,0xCC800004,0xA8840004,0x968C0025,0x90800023,0xFE9804C6,0xFC6C0006,0x98740159,0x843C01C3,0x1C80F1A,0xF005ED,0xE8D80154,0xA8D80154,0x94D40154,0xD6BC01A5,0xAEBC0002,0x94C40029,0x98BC01A5,0x8EB8004C,0x86BC01A5,0x16805EA,0xBA880153,0x94AC0152,0xAC7001A5,0x92840005, -0x869401A5,0x39F805EA,0x94280152,0x861401A5,0x780005EA,0x16805EA,0xBA880153,0x94AC0152,0xAC7001A5,0x92840005,0x869401A5,0x39F805EA,0x94280152,0x861401A5,0x780005EA,0x39F805EA,0x94280152,0x861401A5,0x780005EA,0x780005EA,0xFAC80226,0xFAE4043D,0xFEEC0428,0xFC900012,0xCC800003,0xA8840003,0x94940001,0x926C000C,0xFEAC0244,0xFE6C0002,0x9A680152,0x861401A5, -0x3FC05EA,0xB805ED,0xB805ED,0xB805ED,0xB805ED,0xC8A001A5,0xC8A001A5,0xC8A001A5,0x8AA001A5,0x8AA001A5,0x76A001A6,0xBE840153,0xBE840153,0xBE840153,0x92840002,0x92840002,0x788C004B,0x7E840153,0x7E840153,0x7280002A,0x6A800153,0x11405EA,0x11405EA,0x11405EA,0x9E5001A5,0x9E5001A5,0x767801A6,0x90380153,0x90380153,0x784C0001,0x6A580152,0xFF805EA, -0xFF805EA,0x760001A6,0x6800017E,0x5C0005EA,0xFEA001CB,0xFEAC040A,0xB805ED,0xFE88000D,0xC6840002,0xA6840002,0x9E84000B,0x8A800002,0xFE8001A3,0xFA6C0002,0x94780153,0x784C0001,0x18C05EA,0xD40154,0xD40154,0xD40154,0xD40154,0xACBC0000,0xACBC0000,0xACBC0000,0x84BC0000,0x84BC0000,0x76BC0001,0x33C0152,0x33C0152,0x33C0152,0x8E8C0001,0x8E8C0001, -0x76A40001,0x23FC0152,0x23FC0152,0x76540001,0x6A000152,0x33C0152,0x33C0152,0x33C0152,0x8E8C0001,0x8E8C0001,0x76A40001,0x23FC0152,0x23FC0152,0x76540001,0x6A000152,0x23FC0152,0x23FC0152,0x76540001,0x6A000152,0x6A000152,0xFAB40055,0xFECC0080,0xD40154,0xFE880004,0xC2880000,0xA4880000,0x96940000,0x8C7C0000,0xFE94006A,0xFA6C0001,0x1C80152,0x76540001, -0x1C80152,0x10C01A5,0xCEF00000,0xA2F00000,0x94F00000,0x19001A5,0xB0B80000,0x94D40000,0x4BFC01A5,0x94780000,0x860001A5,0x19001A5,0xB0B80000,0x94D40000,0x4BFC01A5,0x94780000,0x860001A5,0x4BFC01A5,0x94780000,0x860001A5,0x860001A5,0x19001A5,0xB0B80000,0x94D40000,0x4BFC01A5,0x94780000,0x860001A5,0x4BFC01A5,0x94780000,0x860001A5,0x860001A5,0x4BFC01A5, -0x94780000,0x860001A5,0x860001A5,0x860001A5,0xF2E000C8,0x12001A5,0xFB0400DD,0xFE940005,0xD2740000,0xAA7C0000,0x949C0000,0x944C0000,0xF4C800B5,0xFE6C0001,0x98A40000,0x860001A5,0x1DF801A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0xA001A5,0x96840001,0x96840001,0x96840001,0x96840001,0x96840001, 
-0x96840001,0x6A840001,0x6A840001,0x6A840001,0x5C800002,0xEC01A5,0xEC01A5,0xEC01A5,0xEC01A5,0xEC01A5,0xEC01A5,0x784C0000,0x784C0000,0x784C0000,0x5C680001,0x1DC01A5,0x1DC01A5,0x1DC01A5,0x5C100001,0x4E0001A5,0xF29400A2,0xA001A5,0xA001A5,0xFE840002,0xCA840001,0xAA840001,0xAA840001,0x86840001,0xFE740059,0xF86C0001,0x7A7C0001,0x784C0000, -0x15001A5,}; -static const uint32_t g_etc1_to_bc7_m6_table110[] = { -0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0x1B80000, -0x1B80000,0x1B80000,0x1B80000,0x48000000,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x900001,0x9C0000,0x9C0000,0x9C0000,0xD80000,0x1340000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0xCC0000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000, -0x1300000,0x1DF40000,0x1DF40000,0x1DF40000,0x64000001,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1DF40000,0x1DF40000,0x1DF40000,0x64000001,0x1DF40000,0x1DF40000,0x1DF40000,0x64000001,0x64000001,0x4D80000,0xCC0000,0xCC0000,0xEC0000,0x1000000,0x1140000,0x1140000,0x3540000,0xEC0000,0x1000000,0x1B00000,0x1DF40000, -0x1B00000,0x1000000,0x1000000,0x1000000,0x1000000,0x17C0000,0x17C0000,0x17C0000,0x43F80000,0x43F80000,0x7E000001,0x17C0000,0x17C0000,0x17C0000,0x43F80000,0x43F80000,0x7E000001,0x43F80000,0x43F80000,0x7E000001,0x7E000001,0x17C0000,0x17C0000,0x17C0000,0x43F80000,0x43F80000,0x7E000001,0x43F80000,0x43F80000,0x7E000001,0x7E000001,0x43F80000, -0x43F80000,0x7E000001,0x7E000001,0x7E000001,0x3280000,0x1100000,0x1000000,0x15C0000,0x3AC0000,0x11FC0000,0x29FC0000,0x53F80000,0x1400000,0x17C0000,0x11FC0000,0x7E000001,0x11FC0000,0x1380001,0x3D00000,0x6DFC0000,0x9C000000,0x3D00000,0x6DFC0000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x3D00000,0x6DFC0000,0x9C000000,0x6DFC0000,0x9C000000, -0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x3D00000,0x6DFC0000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x6DFC0000,0x9C000000,0x9C000000,0x9C000000,0x9C000000,0x3880000,0xF4C0000,0xF4C0000,0xFFC0000,0x5BFC0000,0x85F40000,0x9C000000,0x9C000000,0x1AC0000,0x2FFC0000,0x97E00000,0x9C000000, -0x47FC0000,0xE40F1E,0xFCCC0606,0xB8C805ED,0x9CC805ED,0xFCB0034A,0xC0B401C5,0xA0B80286,0xA8B0034A,0x9AAC01BD,0x8EB0034A,0xF09405EB,0xC6940154,0xA2A4020F,0xAE8C01B6,0x9A940006,0x8E9801BE,0x9C9005EB,0x9488020D,0x8888028A,0x809005EB,0x1540F1A,0xD06005EB,0x9C9405EA,0xC244034A,0x9E6C01B2,0x8E78034A,0xB42C05EB,0x9C3C0153,0x8C4C01C3,0x805C05EA,0x2FFC0F1A, -0x98000606,0x8A00039E,0x7E000686,0x72000F1A,0xFEC405AE,0xF6DC0B8E,0xF8E00C65,0xFE9C0054,0xD4900004,0xB0940004,0x9E9C0025,0x98900023,0xFCB00557,0xFE800014,0xA0840159,0x8C4C01C3,0x1E80F1A,0x10005ED,0xF0E80154,0xB0E80154,0x9CE40154,0xDECC01A5,0xB6CC0002,0x9CD40029,0xA0CC01A5,0x96C8004C,0x8ECC01A5,0x18005EA,0xC2980153,0x9CBC0152,0xB48001A5,0x9A940005, -0x8EA401A5,0x45F805EA,0x9C380152,0x8E2401A5,0x800005EA,0x18005EA,0xC2980153,0x9CBC0152,0xB48001A5,0x9A940005,0x8EA401A5,0x45F805EA,0x9C380152,0x8E2401A5,0x800005EA,0x45F805EA,0x9C380152,0x8E2401A5,0x800005EA,0x800005EA,0xFED8024B,0xF4F80465,0xF6FC0455,0xFEA40029,0xD4900003,0xB0940003,0x9CA40001,0x9A7C000C,0xFCC00269,0xFE840008,0xA2780152,0x8E2401A5, 
-0x13FC05EA,0xC805ED,0xC805ED,0xC805ED,0xC805ED,0xD0B001A5,0xD0B001A5,0xD0B001A5,0x92B001A5,0x92B001A5,0x7EB001A6,0xC6940153,0xC6940153,0xC6940153,0x9A940002,0x9A940002,0x809C004B,0x86940153,0x86940153,0x7A90002A,0x72900153,0x12C05EA,0x12C05EA,0x12C05EA,0xA66001A5,0xA66001A5,0x7E8801A6,0x98480153,0x98480153,0x805C0001,0x72680152,0x1BF805EA, -0x1BF805EA,0x7E0801A6,0x72000162,0x640005EA,0xFAB40206,0xFAC40411,0xC805ED,0xFE98001A,0xCE940002,0xAE940002,0xA694000B,0x92900002,0xFE9401BA,0xFC7C000B,0x9C880153,0x805C0001,0x1AC05EA,0xE40154,0xE40154,0xE40154,0xE40154,0xB4CC0000,0xB4CC0000,0xB4CC0000,0x8CCC0000,0x8CCC0000,0x7ECC0001,0x1540152,0x1540152,0x1540152,0x969C0001,0x969C0001, -0x7EB40001,0x2FFC0152,0x2FFC0152,0x7E640001,0x72000152,0x1540152,0x1540152,0x1540152,0x969C0001,0x969C0001,0x7EB40001,0x2FFC0152,0x2FFC0152,0x7E640001,0x72000152,0x2FFC0152,0x2FFC0152,0x7E640001,0x72000152,0x72000152,0xF6C80062,0xF6DC0091,0xE40154,0xFCA00008,0xCA980000,0xAC980000,0x9EA40000,0x948C0000,0xFAAC0071,0xFE800002,0x1E80152,0x7E640001, -0x1E80152,0x11C01A5,0xD7000000,0xAB000000,0x9D000000,0x1A801A5,0xB8C80000,0x9CE40000,0x57FC01A5,0x9C880000,0x8E0001A5,0x1A801A5,0xB8C80000,0x9CE40000,0x57FC01A5,0x9C880000,0x8E0001A5,0x57FC01A5,0x9C880000,0x8E0001A5,0x8E0001A5,0x1A801A5,0xB8C80000,0x9CE40000,0x57FC01A5,0x9C880000,0x8E0001A5,0x57FC01A5,0x9C880000,0x8E0001A5,0x8E0001A5,0x57FC01A5, -0x9C880000,0x8E0001A5,0x8E0001A5,0x8E0001A5,0xFAF000C8,0x13001A5,0xF31400F4,0xFCAC000D,0xDA840000,0xB28C0000,0x9CAC0000,0x9C5C0000,0xFCD800B5,0xFE880002,0xA0B40000,0x8E0001A5,0x2BFC01A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0xB001A5,0x9E940001,0x9E940001,0x9E940001,0x9E940001,0x9E940001, -0x9E940001,0x72940001,0x72940001,0x72940001,0x64900002,0x10401A5,0x10401A5,0x10401A5,0x10401A5,0x10401A5,0x10401A5,0x805C0000,0x805C0000,0x805C0000,0x64780001,0x5FC01A5,0x5FC01A5,0x5FC01A5,0x64200001,0x560001A5,0xFAA400A2,0xB001A5,0xB001A5,0xF894000A,0xD2940001,0xB2940001,0xB2940001,0x8E940001,0xFA880064,0xFC7C0002,0x828C0001,0x805C0000, -0x17001A5,}; -static const uint32_t g_etc1_to_bc7_m6_table111[] = { -0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0x1E80000, -0x1E80000,0x1E80000,0x1E80000,0x50000000,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xAC0000,0xAC0000,0xAC0000,0xF00000,0x1580000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0x3440000,0x3440000,0x3440000,0x3440000,0x3440000, -0x3440000,0x27FC0000,0x27FC0000,0x27FC0000,0x6C000001,0x3440000,0x3440000,0x3440000,0x3440000,0x3440000,0x3440000,0x27FC0000,0x27FC0000,0x27FC0000,0x6C000001,0x27FC0000,0x27FC0000,0x27FC0000,0x6C000001,0x6C000001,0xCE80000,0xDC0000,0xDC0000,0x1000000,0x1140000,0x12C0000,0x12C0000,0x1700000,0x1000000,0x1140000,0x1D40000,0x27FC0000, -0x1D40000,0x1100000,0x1100000,0x1100000,0x1100000,0x1940000,0x1940000,0x1940000,0x4FF80000,0x4FF80000,0x86000001,0x1940000,0x1940000,0x1940000,0x4FF80000,0x4FF80000,0x86000001,0x4FF80000,0x4FF80000,0x86000001,0x86000001,0x1940000,0x1940000,0x1940000,0x4FF80000,0x4FF80000,0x86000001,0x4FF80000,0x4FF80000,0x86000001,0x86000001,0x4FF80000, 
-0x4FF80000,0x86000001,0x86000001,0x86000001,0x13C0000,0x9200000,0x1100000,0x3700000,0x1C80000,0x1FFC0000,0x37FC0000,0x5DFC0000,0x1540000,0x1940000,0x1FFC0000,0x86000001,0x1FFC0000,0x1480001,0x3E80000,0x79FC0000,0xA4000000,0x3E80000,0x79FC0000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0x3E80000,0x79FC0000,0xA4000000,0x79FC0000,0xA4000000, -0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0x3E80000,0x79FC0000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0xA4000000,0x39C0000,0x1600000,0x1600000,0x23FC0000,0x69F80000,0x8FF40000,0xA4000000,0xA4000000,0x1C00000,0x41FC0000,0x9FF00000,0xA4000000, -0x57FC0000,0xF40F1E,0xFEDC061E,0xC0D805ED,0xA4D805ED,0xFCC00356,0xC8C401C5,0xA8C80286,0xB0C0034A,0xA2BC01BD,0x96C0034A,0xF8A405EB,0xCEA40154,0xAAB4020F,0xB69C01B6,0xA2A40006,0x96A801BE,0xA4A005EB,0x9C98020D,0x9098028A,0x88A005EB,0x16C0F1A,0xD87005EB,0xA4A405EA,0xCA54034A,0xA67C01B2,0x9688034A,0xBC3C05EB,0xA44C0153,0x945C01C3,0x886C05EA,0x3BFC0F1A, -0xA20005F1,0x94000376,0x8400065A,0x7A000F1A,0xFED80614,0xFEEC0B8E,0xFEEC0C81,0xFEB00093,0xDCA00004,0xB8A40004,0xA6AC0025,0xA0A00023,0xFEBC05A3,0xFE94003E,0xA8940159,0x945C01C3,0x7FC0F1A,0x11005ED,0xF8F80154,0xB8F80154,0xA4F40154,0xE6DC01A5,0xBEDC0002,0xA4E40029,0xA8DC01A5,0x9ED8004C,0x96DC01A5,0x19805EA,0xCAA80153,0xA4CC0152,0xBC9001A5,0xA2A40005, -0x96B401A5,0x51F805EA,0xA4480152,0x963401A5,0x880005EA,0x19805EA,0xCAA80153,0xA4CC0152,0xBC9001A5,0xA2A40005,0x96B401A5,0x51F805EA,0xA4480152,0x963401A5,0x880005EA,0x51F805EA,0xA4480152,0x963401A5,0x880005EA,0x880005EA,0xFEEC028B,0xFD080465,0xFF0C0455,0xFCBC0048,0xDCA00003,0xB8A40003,0xA4B40001,0xA28C000C,0xFCD80289,0xFE98001E,0xAA880152,0x963401A5, -0x21FC05EA,0xD805ED,0xD805ED,0xD805ED,0xD805ED,0xD8C001A5,0xD8C001A5,0xD8C001A5,0x9AC001A5,0x9AC001A5,0x86C001A6,0xCEA40153,0xCEA40153,0xCEA40153,0xA2A40002,0xA2A40002,0x88AC004B,0x8EA40153,0x8EA40153,0x82A0002A,0x7AA00153,0x14405EA,0x14405EA,0x14405EA,0xAE7001A5,0xAE7001A5,0x869801A6,0xA0580153,0xA0580153,0x886C0001,0x7A780152,0x27F805EA, -0x27F805EA,0x861801A6,0x7A000156,0x6C0005EA,0xFAC40225,0xFECC0439,0xD805ED,0xFEAC0032,0xD6A40002,0xB6A40002,0xAEA4000B,0x9AA00002,0xFAAC01E2,0xFE900015,0xA4980153,0x886C0001,0x1D005EA,0xF40154,0xF40154,0xF40154,0xF40154,0xBCDC0000,0xBCDC0000,0xBCDC0000,0x94DC0000,0x94DC0000,0x86DC0001,0x16C0152,0x16C0152,0x16C0152,0x9EAC0001,0x9EAC0001, -0x86C40001,0x3BFC0152,0x3BFC0152,0x86740001,0x7A000152,0x16C0152,0x16C0152,0x16C0152,0x9EAC0001,0x9EAC0001,0x86C40001,0x3BFC0152,0x3BFC0152,0x86740001,0x7A000152,0x3BFC0152,0x3BFC0152,0x86740001,0x7A000152,0x7A000152,0xFED80062,0xFEEC0091,0xF40154,0xFEB4000D,0xD2A80000,0xB4A80000,0xA6B40000,0x9C9C0000,0xFCC00080,0xFE980005,0x7FC0152,0x86740001, -0x7FC0152,0x12C01A5,0xDF100000,0xB3100000,0xA5100000,0x1C001A5,0xC0D80000,0xA4F40000,0x63FC01A5,0xA4980000,0x960001A5,0x1C001A5,0xC0D80000,0xA4F40000,0x63FC01A5,0xA4980000,0x960001A5,0x63FC01A5,0xA4980000,0x960001A5,0x960001A5,0x1C001A5,0xC0D80000,0xA4F40000,0x63FC01A5,0xA4980000,0x960001A5,0x63FC01A5,0xA4980000,0x960001A5,0x960001A5,0x63FC01A5, -0xA4980000,0x960001A5,0x960001A5,0x960001A5,0xF70400DD,0x14001A5,0xFB2400F4,0xFEC00014,0xE2940000,0xBA9C0000,0xA4BC0000,0xA46C0000,0xFCEC00CA,0xFEA0000A,0xA8C40000,0x960001A5,0x3BFC01A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xC001A5,0xA6A40001,0xA6A40001,0xA6A40001,0xA6A40001,0xA6A40001, 
-0xA6A40001,0x7AA40001,0x7AA40001,0x7AA40001,0x6CA00002,0x11C01A5,0x11C01A5,0x11C01A5,0x11C01A5,0x11C01A5,0x11C01A5,0x886C0000,0x886C0000,0x886C0000,0x6C880001,0x11FC01A5,0x11FC01A5,0x11FC01A5,0x6C300001,0x5E0001A5,0xF2B400B5,0xC001A5,0xC001A5,0xFCA8000D,0xDAA40001,0xBAA40001,0xBAA40001,0x96A40001,0xFC9C0071,0xFC900005,0x8A9C0001,0x886C0000, -0x19401A5,}; -static const uint32_t g_etc1_to_bc7_m6_table112[] = { -0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0xBF80000, -0xBF80000,0xBF80000,0xBF80000,0x58000001,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xC00000,0xC00000,0xC00000,0x10C0000,0x17C0000,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0xEC0001,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000, -0x1600000,0x35FC0000,0x35FC0000,0x35FC0000,0x76000000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x35FC0000,0x35FC0000,0x35FC0000,0x76000000,0x35FC0000,0x35FC0000,0x35FC0000,0x76000000,0x76000000,0x6FC0000,0xEC0001,0xEC0001,0x1140000,0x3280000,0x1440000,0x1440000,0x1900000,0x1140000,0x3280000,0x1F80000,0x35FC0000, -0x1F80000,0x1200001,0x1200001,0x1200001,0x1200001,0x1B00000,0x1B00000,0x1B00000,0x5DF40000,0x5DF40000,0x90000000,0x1B00000,0x1B00000,0x1B00000,0x5DF40000,0x5DF40000,0x90000000,0x5DF40000,0x5DF40000,0x90000000,0x90000000,0x1B00000,0x1B00000,0x1B00000,0x5DF40000,0x5DF40000,0x90000000,0x5DF40000,0x5DF40000,0x90000000,0x90000000,0x5DF40000, -0x5DF40000,0x90000000,0x90000000,0x90000000,0x1500000,0x3340000,0x1200001,0x3880000,0x1E80000,0x31FC0000,0x47F80000,0x69FC0000,0x16C0000,0x1B00000,0x31FC0000,0x90000000,0x31FC0000,0x15C0000,0x9FC0000,0x87FC0000,0xAC000001,0x9FC0000,0x87FC0000,0xAC000001,0x87FC0000,0xAC000001,0xAC000001,0x9FC0000,0x87FC0000,0xAC000001,0x87FC0000,0xAC000001, -0xAC000001,0x87FC0000,0xAC000001,0xAC000001,0xAC000001,0x9FC0000,0x87FC0000,0xAC000001,0x87FC0000,0xAC000001,0xAC000001,0x87FC0000,0xAC000001,0xAC000001,0xAC000001,0x87FC0000,0xAC000001,0xAC000001,0xAC000001,0xAC000001,0x1B40000,0x1740000,0x1740000,0x39FC0000,0x77FC0000,0x99FC0000,0xAC000001,0xAC000001,0x1D80000,0x53FC0000,0xA9E40000,0xAC000001, -0x67FC0000,0x1080F1A,0xFCF0065A,0xC8EC05EA,0xACEC05EB,0xFED40376,0xD0D401C3,0xB0D8028A,0xBAD0034A,0xAAD001BE,0x9ED0034A,0xFCB805F1,0xD8B40153,0xB2C4020D,0xC0B001B2,0xACB40006,0xA0B801BD,0xACB405EA,0xA4A8020F,0x9AAC0286,0x90B405ED,0x1880F1A,0xE08405EB,0xAEB405EB,0xD268034A,0xAE8C01B6,0x9E9C034A,0xC64805EB,0xAC600154,0x9C6C01C5,0x927C05ED,0x49F80F1A, -0xAC0805EB,0x9C000356,0x9000061E,0x82000F1E,0xFEE4068A,0xF9000BEA,0xFB040CAA,0xFEC400EA,0xE4B40003,0xC2B40002,0xAEBC0023,0xA8B00025,0xFED40612,0xFEB0008D,0xB2A8015B,0x9C6C01C5,0x19FC0F1A,0x12405EA,0xFD080156,0xC3080152,0xAD080153,0xEEF001A6,0xC8EC0001,0xAEF8002A,0xB2EC01A6,0xA8EC004B,0x9EEC01A6,0x3B005EA,0xD2BC0153,0xACE00153,0xC6A001A5,0xACB80002, -0x9EC801A5,0x5DFC05EA,0xAC600153,0x9E4801A5,0x900005ED,0x3B005EA,0xD2BC0153,0xACE00153,0xC6A001A5,0xACB80002,0x9EC801A5,0x5DFC05EA,0xAC600153,0x9E4801A5,0x900005ED,0x5DFC05EA,0xAC600153,0x9E4801A5,0x900005ED,0x900005ED,0xFF0002B2,0xF71C0492,0xF9200483,0xFED40072,0xE4B40003,0xC2B40002,0xAEC80002,0xACA0000B,0xFCEC02D4,0xFEB80048,0xB4980152,0x9E4801A5, 
-0x33FC05EA,0xEC05EA,0xEC05EA,0xEC05EA,0xEC05EA,0xE4D001A5,0xE4D001A5,0xE4D001A5,0xA4D001A5,0xA4D001A5,0x90D001A5,0xD8B40152,0xD8B40152,0xD8B40152,0xAAB40005,0xAAB40005,0x92C0004C,0x98B40152,0x98B40152,0x8CB40029,0x82B40154,0x35C05EA,0x35C05EA,0x35C05EA,0xB68401A5,0xB68401A5,0x90A801A5,0xAA680153,0xAA680153,0x907C0002,0x828C0154,0x33FC05EA, -0x33FC05EA,0x902801A5,0x820C0154,0x740005ED,0xFED80248,0xFAE4043D,0xEC05EA,0xFEBC0054,0xE2B40002,0xC2B40002,0xB8B8000C,0xA4B40001,0xFEBC020C,0xFEA40031,0xAEA80152,0x907C0002,0x1F405EA,0x1080152,0x1080152,0x1080152,0x1080152,0xC2F00001,0xC2F00001,0xC2F00001,0x9CF00001,0x9CF00001,0x90EC0001,0x1880152,0x1880152,0x1880152,0xA6C00001,0xA6C00001, -0x90D40000,0x49F80152,0x49F80152,0x90840000,0x82000154,0x1880152,0x1880152,0x1880152,0xA6C00001,0xA6C00001,0x90D40000,0x49F80152,0x49F80152,0x90840000,0x82000154,0x49F80152,0x49F80152,0x90840000,0x82000154,0x82000154,0xFAEC0071,0xF90000A2,0x1080152,0xF6CC0019,0xDABC0000,0xBCBC0001,0xB0C40000,0xA4B00000,0xFED00088,0xFEB0000D,0x19FC0152,0x90840000, -0x19FC0152,0x14001A5,0xE7240001,0xBB240001,0xAD200002,0x1D801A5,0xC8EC0000,0xAD080001,0x71F801A5,0xACB00001,0x9E0001A5,0x1D801A5,0xC8EC0000,0xAD080001,0x71F801A5,0xACB00001,0x9E0001A5,0x71F801A5,0xACB00001,0x9E0001A5,0x9E0001A5,0x1D801A5,0xC8EC0000,0xAD080001,0x71F801A5,0xACB00001,0x9E0001A5,0x71F801A5,0xACB00001,0x9E0001A5,0x9E0001A5,0x71F801A5, -0xACB00001,0x9E0001A5,0x9E0001A5,0x9E0001A5,0xFF1400E1,0x15401A5,0xF5380109,0xFED80028,0xEAA80000,0xC4AC0000,0xACD00001,0xAC840001,0xF70800DD,0xFEC00012,0xB2D40000,0x9E0001A5,0x4BFC01A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xD001A5,0xB0B40000,0xB0B40000,0xB0B40000,0xB0B40000,0xB0B40000, -0xB0B40000,0x84B40000,0x84B40000,0x84B40000,0x76B40000,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x927C0000,0x927C0000,0x927C0000,0x76980000,0x1FF801A5,0x1FF801A5,0x1FF801A5,0x763C0000,0x680001A5,0xFCC800B5,0xD001A5,0xD001A5,0xFCB80014,0xE6B40000,0xC6B40000,0xC6B40000,0xA0B40000,0xF8B00080,0xFEA0000A,0x94AC0000,0x927C0000, -0x1B801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table113[] = { -0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x17F80000, -0x17F80000,0x17F80000,0x17F80000,0x60000001,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xD00000,0xD00000,0xD00000,0x1240000,0x1A00000,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0xFC0001,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000, -0x1780000,0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0x41FC0000,0x41FC0000,0x41FC0000,0x7E000000,0x7E000000,0xF0C0000,0xFC0001,0xFC0001,0x1280000,0x33C0000,0x1580000,0x1580000,0x1AC0000,0x1280000,0x33C0000,0xFFC0000,0x41FC0000, -0xFFC0000,0x1300001,0x1300001,0x1300001,0x1300001,0x3C40000,0x3C40000,0x3C40000,0x67FC0000,0x67FC0000,0x98000000,0x3C40000,0x3C40000,0x3C40000,0x67FC0000,0x67FC0000,0x98000000,0x67FC0000,0x67FC0000,0x98000000,0x98000000,0x3C40000,0x3C40000,0x3C40000,0x67FC0000,0x67FC0000,0x98000000,0x67FC0000,0x67FC0000,0x98000000,0x98000000,0x67FC0000, 
-0x67FC0000,0x98000000,0x98000000,0x98000000,0x1640000,0xB440000,0x1300001,0x1A00000,0x5FC0000,0x3FFC0000,0x55F80000,0x75F80000,0x1800000,0x3C40000,0x3FFC0000,0x98000000,0x3FFC0000,0x16C0000,0x23FC0000,0x93FC0000,0xB4000001,0x23FC0000,0x93FC0000,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0x23FC0000,0x93FC0000,0xB4000001,0x93FC0000,0xB4000001, -0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0x23FC0000,0x93FC0000,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0x93FC0000,0xB4000001,0xB4000001,0xB4000001,0xB4000001,0x1C80000,0x1840000,0x1840000,0x4DFC0000,0x85FC0000,0xA5F00000,0xB4000001,0xB4000001,0x1F00000,0x65FC0000,0xB1F40000,0xB4000001, -0x77FC0000,0x1180F1A,0xFF000686,0xD0FC05EA,0xB4FC05EB,0xFEE8039E,0xD8E401C3,0xB8E8028A,0xC2E0034A,0xB2E001BE,0xA6E0034A,0xFCCC0606,0xE0C40153,0xBAD4020D,0xC8C001B2,0xB4C40006,0xA8C801BD,0xB4C405EA,0xACB8020F,0xA2BC0286,0x98C405ED,0x1A00F1A,0xE89405EB,0xB6C405EB,0xDA78034A,0xB69C01B6,0xA6AC034A,0xCE5805EB,0xB4700154,0xA47C01C5,0x9A8C05ED,0x55F80F1A, -0xB41805EB,0xA600034A,0x98000606,0x8A000F1E,0xFEF806F7,0xFF0C0BFA,0xFF0C0CEA,0xFED80150,0xECC40003,0xCAC40002,0xB6CC0023,0xB0C00025,0xFEE006BA,0xFEC000D3,0xBAB8015B,0xA47C01C5,0x27FC0F1A,0x13405EA,0xFF180162,0xCB180152,0xB5180153,0xF70001A6,0xD0FC0001,0xB708002A,0xBAFC01A6,0xB0FC004B,0xA6FC01A6,0x1C805EA,0xDACC0153,0xB4F00153,0xCEB001A5,0xB4C80002, -0xA6D801A5,0x69FC05EA,0xB4700153,0xA65801A5,0x980005ED,0x1C805EA,0xDACC0153,0xB4F00153,0xCEB001A5,0xB4C80002,0xA6D801A5,0x69FC05EA,0xB4700153,0xA65801A5,0x980005ED,0x69FC05EA,0xB4700153,0xA65801A5,0x980005ED,0x980005ED,0xFF1402D5,0xFF2C0492,0xFF2C048B,0xFCE800A5,0xECC40003,0xCAC40002,0xB6D80002,0xB4B0000B,0xFEF802F6,0xFEC80065,0xBCA80152,0xA65801A5, -0x41FC05EA,0xFC05EA,0xFC05EA,0xFC05EA,0xFC05EA,0xECE001A5,0xECE001A5,0xECE001A5,0xACE001A5,0xACE001A5,0x98E001A5,0xE0C40152,0xE0C40152,0xE0C40152,0xB2C40005,0xB2C40005,0x9AD0004C,0xA0C40152,0xA0C40152,0x94C40029,0x8AC40154,0x37405EA,0x37405EA,0x37405EA,0xBE9401A5,0xBE9401A5,0x98B801A5,0xB2780153,0xB2780153,0x988C0002,0x8A9C0154,0x3FFC05EA, -0x3FFC05EA,0x983801A5,0x8A1C0154,0x7C0005ED,0xFEE80269,0xF4F80466,0xFC05EA,0xFED40071,0xEAC40002,0xCAC40002,0xC0C8000C,0xACC40001,0xFED00229,0xFEBC0048,0xB6B80152,0x988C0002,0xDFC05EA,0x1180152,0x1180152,0x1180152,0x1180152,0xCB000001,0xCB000001,0xCB000001,0xA5000001,0xA5000001,0x98FC0001,0x1A00152,0x1A00152,0x1A00152,0xAED00001,0xAED00001, -0x98E40000,0x55F80152,0x55F80152,0x98940000,0x8A000154,0x1A00152,0x1A00152,0x1A00152,0xAED00001,0xAED00001,0x98E40000,0x55F80152,0x55F80152,0x98940000,0x8A000154,0x55F80152,0x55F80152,0x98940000,0x8A000154,0x8A000154,0xF7000080,0xFF0C00AA,0x1180152,0xFEDC0019,0xE2CC0000,0xC4CC0001,0xB8D40000,0xACC00000,0xF8EC0091,0xFEC40012,0x27FC0152,0x98940000, -0x27FC0152,0x15001A5,0xEF340001,0xC3340001,0xB5300002,0x1F001A5,0xD0FC0000,0xB5180001,0x7DF801A5,0xB4C00001,0xA60001A5,0x1F001A5,0xD0FC0000,0xB5180001,0x7DF801A5,0xB4C00001,0xA60001A5,0x7DF801A5,0xB4C00001,0xA60001A5,0xA60001A5,0x1F001A5,0xD0FC0000,0xB5180001,0x7DF801A5,0xB4C00001,0xA60001A5,0x7DF801A5,0xB4C00001,0xA60001A5,0xA60001A5,0x7DF801A5, -0xB4C00001,0xA60001A5,0xA60001A5,0xA60001A5,0xFB2C00F2,0x16401A5,0xFD480109,0xFEF40034,0xF2B80000,0xCCBC0000,0xB4E00001,0xB4940001,0xFF1800DD,0xFED80022,0xBAE40000,0xA60001A5,0x5BFC01A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xE001A5,0xB8C40000,0xB8C40000,0xB8C40000,0xB8C40000,0xB8C40000, 
-0xB8C40000,0x8CC40000,0x8CC40000,0x8CC40000,0x7EC40000,0x14C01A5,0x14C01A5,0x14C01A5,0x14C01A5,0x14C01A5,0x14C01A5,0x9A8C0000,0x9A8C0000,0x9A8C0000,0x7EA80000,0x2BF801A5,0x2BF801A5,0x2BF801A5,0x7E4C0000,0x700001A5,0xF4D800C8,0xE001A5,0xE001A5,0xFECC0019,0xEEC40000,0xCEC40000,0xCEC40000,0xA8C40000,0xFEBC0088,0xFEB4000D,0x9CBC0000,0x9A8C0000, -0x1DC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table114[] = { -0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x21FC0000, -0x21FC0000,0x21FC0000,0x21FC0000,0x68000001,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x8E00000,0x8E00000,0x8E00000,0x13C0000,0x1C00000,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x10C0001,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000, -0x1900000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x86000000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x86000000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x86000000,0x86000000,0x1200000,0x10C0001,0x10C0001,0x5380000,0x5500000,0x1700000,0x1700000,0x1C40000,0x5380000,0x5500000,0x1FF80000,0x4DFC0000, -0x1FF80000,0x1400001,0x1400001,0x1400001,0x1400001,0x3DC0000,0x3DC0000,0x3DC0000,0x73FC0000,0x73FC0000,0xA0000000,0x3DC0000,0x3DC0000,0x3DC0000,0x73FC0000,0x73FC0000,0xA0000000,0x73FC0000,0x73FC0000,0xA0000000,0xA0000000,0x3DC0000,0x3DC0000,0x3DC0000,0x73FC0000,0x73FC0000,0xA0000000,0x73FC0000,0x73FC0000,0xA0000000,0xA0000000,0x73FC0000, -0x73FC0000,0xA0000000,0xA0000000,0xA0000000,0x5740000,0x1580000,0x1400001,0x3B40000,0x19FC0000,0x4FFC0000,0x61FC0000,0x7FFC0000,0x1940000,0x3DC0000,0x4FFC0000,0xA0000000,0x4FFC0000,0x17C0000,0x3BFC0000,0x9FF80000,0xBC000001,0x3BFC0000,0x9FF80000,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0x3BFC0000,0x9FF80000,0xBC000001,0x9FF80000,0xBC000001, -0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0x3BFC0000,0x9FF80000,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0x9FF80000,0xBC000001,0xBC000001,0xBC000001,0xBC000001,0x1DC0000,0x3940000,0x3940000,0x61FC0000,0x93F80000,0xAFF00000,0xBC000001,0xBC000001,0xDFC0000,0x75FC0000,0xBBC80000,0xBC000001, -0x85FC0000,0x1280F1A,0xFF1006D7,0xD90C05EA,0xBD0C05EB,0xFEF803F2,0xE0F401C3,0xC0F8028A,0xCAF0034A,0xBAF001BE,0xAEF0034A,0xFEE00629,0xE8D40153,0xC2E4020D,0xD0D001B2,0xBCD40006,0xB0D801BD,0xBCD405EA,0xB4C8020F,0xAACC0286,0xA0D405ED,0x1B80F1A,0xF0A405EB,0xBED405EB,0xE288034A,0xBEAC01B6,0xAEBC034A,0xD66805EB,0xBC800154,0xAC8C01C5,0xA29C05ED,0x61F80F1A, -0xBC2805EB,0xAE0C034A,0xA00005F1,0x92000F1E,0xFF0C0766,0xF9200C4A,0xFB240CF3,0xFEEC01C5,0xF4D40003,0xD2D40002,0xBEDC0023,0xB8D00025,0xFEF806FA,0xFED4013A,0xC2C8015B,0xAC8C01C5,0x37FC0F1A,0x14405EA,0xFD2C017E,0xD3280152,0xBD280153,0xFF1001A6,0xD90C0001,0xBF18002A,0xC30C01A6,0xB90C004B,0xAF0C01A6,0x1E005EA,0xE2DC0153,0xBD000153,0xD6C001A5,0xBCD80002, -0xAEE801A5,0x75FC05EA,0xBC800153,0xAE6801A5,0xA00005ED,0x1E005EA,0xE2DC0153,0xBD000153,0xD6C001A5,0xBCD80002,0xAEE801A5,0x75FC05EA,0xBC800153,0xAE6801A5,0xA00005ED,0x75FC05EA,0xBC800153,0xAE6801A5,0xA00005ED,0xA00005ED,0xFF200312,0xF73C04BE,0xF94004B2,0xFEF800DE,0xF4D40003,0xD2D40002,0xBEE80002,0xBCC0000B,0xFD140322,0xFEE000A1,0xC4B80152,0xAE6801A5, 
-0x51FC05EA,0x10C05EA,0x10C05EA,0x10C05EA,0x10C05EA,0xF4F001A5,0xF4F001A5,0xF4F001A5,0xB4F001A5,0xB4F001A5,0xA0F001A5,0xE8D40152,0xE8D40152,0xE8D40152,0xBAD40005,0xBAD40005,0xA2E0004C,0xA8D40152,0xA8D40152,0x9CD40029,0x92D40154,0x38C05EA,0x38C05EA,0x38C05EA,0xC6A401A5,0xC6A401A5,0xA0C801A5,0xBA880153,0xBA880153,0xA09C0002,0x92AC0154,0x4BFC05EA, -0x4BFC05EA,0xA04801A5,0x922C0154,0x840005ED,0xFEF402A9,0xFD080466,0x10C05EA,0xFCE400A5,0xF2D40002,0xD2D40002,0xC8D8000C,0xB4D40001,0xFEE40266,0xFED00062,0xBEC80152,0xA09C0002,0x1DF805EA,0x1280152,0x1280152,0x1280152,0x1280152,0xD3100001,0xD3100001,0xD3100001,0xAD100001,0xAD100001,0xA10C0001,0x1B80152,0x1B80152,0x1B80152,0xB6E00001,0xB6E00001, -0xA0F40000,0x61F80152,0x61F80152,0xA0A40000,0x92000154,0x1B80152,0x1B80152,0x1B80152,0xB6E00001,0xB6E00001,0xA0F40000,0x61F80152,0x61F80152,0xA0A40000,0x92000154,0x61F80152,0x61F80152,0xA0A40000,0x92000154,0x92000154,0xFF100080,0xF92000B5,0x1280152,0xFCF40029,0xEADC0000,0xCCDC0001,0xC0E40000,0xB4D00000,0xFEF80095,0xFEDC0019,0x37FC0152,0xA0A40000, -0x37FC0152,0x16001A5,0xF7440001,0xCB440001,0xBD400002,0xDFC01A5,0xD90C0000,0xBD280001,0x89F801A5,0xBCD00001,0xAE0001A5,0xDFC01A5,0xD90C0000,0xBD280001,0x89F801A5,0xBCD00001,0xAE0001A5,0x89F801A5,0xBCD00001,0xAE0001A5,0xAE0001A5,0xDFC01A5,0xD90C0000,0xBD280001,0x89F801A5,0xBCD00001,0xAE0001A5,0x89F801A5,0xBCD00001,0xAE0001A5,0xAE0001A5,0x89F801A5, -0xBCD00001,0xAE0001A5,0xAE0001A5,0xAE0001A5,0xF7400109,0x17801A5,0xF5580122,0xFD100048,0xFAC80000,0xD4CC0000,0xBCF00001,0xBCA40001,0xFF2C00F4,0xFEF0003A,0xC2F40000,0xAE0001A5,0x69FC01A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xF001A5,0xC0D40000,0xC0D40000,0xC0D40000,0xC0D40000,0xC0D40000, -0xC0D40000,0x94D40000,0x94D40000,0x94D40000,0x86D40000,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0xA29C0000,0xA29C0000,0xA29C0000,0x86B80000,0x37F801A5,0x37F801A5,0x37F801A5,0x865C0000,0x780001A5,0xFCE800C8,0xF001A5,0xF001A5,0xFED80028,0xF6D40000,0xD6D40000,0xD6D40000,0xB0D40000,0xFCD40091,0xFAC80019,0xA4CC0000,0xA29C0000, -0x1FC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table115[] = { -0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x2DFC0000, -0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xF40000,0xF40000,0xF40000,0x3500000,0x1E40000,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x11C0001,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000, -0x1A80000,0x59FC0000,0x59FC0000,0x59FC0000,0x8E000000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x59FC0000,0x59FC0000,0x59FC0000,0x8E000000,0x59FC0000,0x59FC0000,0x59FC0000,0x8E000000,0x8E000000,0x1300000,0x11C0001,0x11C0001,0x14C0000,0x5640000,0x1840000,0x1840000,0x1E00000,0x14C0000,0x5640000,0x2DFC0000,0x59FC0000, -0x2DFC0000,0x1500001,0x1500001,0x1500001,0x1500001,0x3F40000,0x3F40000,0x3F40000,0x7FFC0000,0x7FFC0000,0xA8000000,0x3F40000,0x3F40000,0x3F40000,0x7FFC0000,0x7FFC0000,0xA8000000,0x7FFC0000,0x7FFC0000,0xA8000000,0xA8000000,0x3F40000,0x3F40000,0x3F40000,0x7FFC0000,0x7FFC0000,0xA8000000,0x7FFC0000,0x7FFC0000,0xA8000000,0xA8000000,0x7FFC0000, 
-0x7FFC0000,0xA8000000,0xA8000000,0xA8000000,0x1880000,0x1680000,0x1500001,0x1CC0000,0x2DFC0000,0x5FF80000,0x6FFC0000,0x8BF80000,0x1A80000,0x3F40000,0x5FF80000,0xA8000000,0x5FF80000,0x18C0000,0x53FC0000,0xABF80000,0xC4000001,0x53FC0000,0xABF80000,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0x53FC0000,0xABF80000,0xC4000001,0xABF80000,0xC4000001, -0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xC4000001,0x53FC0000,0xABF80000,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xC4000001,0xABF80000,0xC4000001,0xC4000001,0xC4000001,0xC4000001,0x1F00000,0xBA40000,0xBA40000,0x73FC0000,0x9FFC0000,0xB9F00000,0xC4000001,0xC4000001,0x2BFC0000,0x87FC0000,0xC3D80000,0xC4000001, -0x95FC0000,0x1380F1A,0xFF24071A,0xE11C05EA,0xC51C05EB,0xFF0C043A,0xE90401C3,0xC908028A,0xD300034A,0xC30001BE,0xB700034A,0xFEF4065A,0xF0E40153,0xCAF4020D,0xD8E001B2,0xC4E40006,0xB8E801BD,0xC4E405EA,0xBCD8020F,0xB2DC0286,0xA8E405ED,0x1D00F1A,0xF8B405EB,0xC6E405EB,0xEA98034A,0xC6BC01B6,0xB6CC034A,0xDE7805EB,0xC4900154,0xB49C01C5,0xAAAC05ED,0x6DF80F1A, -0xC43805EB,0xB61C034A,0xA80005ED,0x9A000F1E,0xFF2007D6,0xFF2C0C62,0xFF2C0D3B,0xFF000242,0xFCE40003,0xDAE40002,0xC6EC0023,0xC0E00025,0xFF100782,0xFEE801CC,0xCAD8015B,0xB49C01C5,0x45FC0F1A,0x15405EA,0xFF3C019A,0xDB380152,0xC5380153,0xFF2401B2,0xE11C0001,0xC728002A,0xCB1C01A6,0xC11C004B,0xB71C01A6,0x1F805EA,0xEAEC0153,0xC5100153,0xDED001A5,0xC4E80002, -0xB6F801A5,0x81FC05EA,0xC4900153,0xB67801A5,0xA80005ED,0x1F805EA,0xEAEC0153,0xC5100153,0xDED001A5,0xC4E80002,0xB6F801A5,0x81FC05EA,0xC4900153,0xB67801A5,0xA80005ED,0x81FC05EA,0xC4900153,0xB67801A5,0xA80005ED,0xA80005ED,0xFF340333,0xFF4C04BE,0xFF4C04BE,0xFF140109,0xFCE40003,0xDAE40002,0xC6F80002,0xC4D0000B,0xFF240356,0xFD0000DD,0xCCC80152,0xB67801A5, -0x5FFC05EA,0x11C05EA,0x11C05EA,0x11C05EA,0x11C05EA,0xFD0001A5,0xFD0001A5,0xFD0001A5,0xBD0001A5,0xBD0001A5,0xA90001A5,0xF0E40152,0xF0E40152,0xF0E40152,0xC2E40005,0xC2E40005,0xAAF0004C,0xB0E40152,0xB0E40152,0xA4E40029,0x9AE40154,0x3A405EA,0x3A405EA,0x3A405EA,0xCEB401A5,0xCEB401A5,0xA8D801A5,0xC2980153,0xC2980153,0xA8AC0002,0x9ABC0154,0x57FC05EA, -0x57FC05EA,0xA85801A5,0x9A3C0154,0x8C0005ED,0xFF0402D2,0xF5180491,0x11C05EA,0xFEF800C9,0xFAE40002,0xDAE40002,0xD0E8000C,0xBCE40001,0xFEF80289,0xFEE00099,0xC6D80152,0xA8AC0002,0x2BFC05EA,0x1380152,0x1380152,0x1380152,0x1380152,0xDB200001,0xDB200001,0xDB200001,0xB5200001,0xB5200001,0xA91C0001,0x1D00152,0x1D00152,0x1D00152,0xBEF00001,0xBEF00001, -0xA9040000,0x6DF80152,0x6DF80152,0xA8B40000,0x9A000154,0x1D00152,0x1D00152,0x1D00152,0xBEF00001,0xBEF00001,0xA9040000,0x6DF80152,0x6DF80152,0xA8B40000,0x9A000154,0x6DF80152,0x6DF80152,0xA8B40000,0x9A000154,0x9A000154,0xFF200091,0xFF2C00C1,0x1380152,0xFD040034,0xF2EC0000,0xD4EC0001,0xC8F40000,0xBCE00000,0xFD1000A2,0xFCF00029,0x45FC0152,0xA8B40000, -0x45FC0152,0x17001A5,0xFF540001,0xD3540001,0xC5500002,0x25FC01A5,0xE11C0000,0xC5380001,0x95F801A5,0xC4E00001,0xB60001A5,0x25FC01A5,0xE11C0000,0xC5380001,0x95F801A5,0xC4E00001,0xB60001A5,0x95F801A5,0xC4E00001,0xB60001A5,0xB60001A5,0x25FC01A5,0xE11C0000,0xC5380001,0x95F801A5,0xC4E00001,0xB60001A5,0x95F801A5,0xC4E00001,0xB60001A5,0xB60001A5,0x95F801A5, -0xC4E00001,0xB60001A5,0xB60001A5,0xB60001A5,0xFF500109,0x18801A5,0xFD680122,0xFF200061,0xFCE40002,0xDCDC0000,0xC5000001,0xC4B40001,0xF5480109,0xFF100048,0xCB040000,0xB60001A5,0x79FC01A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0x10001A5,0xC8E40000,0xC8E40000,0xC8E40000,0xC8E40000,0xC8E40000, 
-0xC8E40000,0x9CE40000,0x9CE40000,0x9CE40000,0x8EE40000,0x17C01A5,0x17C01A5,0x17C01A5,0x17C01A5,0x17C01A5,0x17C01A5,0xAAAC0000,0xAAAC0000,0xAAAC0000,0x8EC80000,0x43F801A5,0x43F801A5,0x43F801A5,0x8E6C0000,0x800001A5,0xF4F800DD,0x10001A5,0x10001A5,0xFAEC0034,0xFEE40000,0xDEE40000,0xDEE40000,0xB8E40000,0xFCE400A2,0xFED80022,0xACDC0000,0xAAAC0000, -0x11FC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table116[] = { -0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000, -0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xB040000,0xB040000,0xB040000,0x16C0000,0x7FC0000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1300000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000, -0x1C40000,0x67F80000,0x67F80000,0x67F80000,0x96000001,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x67F80000,0x67F80000,0x67F80000,0x96000001,0x67F80000,0x67F80000,0x67F80000,0x96000001,0x96000001,0x1440000,0x1300000,0x1300000,0x3600000,0x17C0000,0x19C0000,0x19C0000,0x3FC0000,0x3600000,0x17C0000,0x3FF80000,0x67F80000, -0x3FF80000,0x1640000,0x1640000,0x1640000,0x1640000,0x15FC0000,0x15FC0000,0x15FC0000,0x8DFC0000,0x8DFC0000,0xB0000001,0x15FC0000,0x15FC0000,0x15FC0000,0x8DFC0000,0x8DFC0000,0xB0000001,0x8DFC0000,0x8DFC0000,0xB0000001,0xB0000001,0x15FC0000,0x15FC0000,0x15FC0000,0x8DFC0000,0x8DFC0000,0xB0000001,0x8DFC0000,0x8DFC0000,0xB0000001,0xB0000001,0x8DFC0000, -0x8DFC0000,0xB0000001,0xB0000001,0xB0000001,0x59C0000,0x17C0000,0x1640000,0x1E40000,0x43FC0000,0x6FFC0000,0x7FF80000,0x97F80000,0x3BC0000,0x15FC0000,0x6FFC0000,0xB0000001,0x6FFC0000,0x19C0001,0x6FFC0000,0xB9F80000,0xCE000000,0x6FFC0000,0xB9F80000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0x6FFC0000,0xB9F80000,0xCE000000,0xB9F80000,0xCE000000, -0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0x6FFC0000,0xB9F80000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0xCE000000,0x15FC0000,0x5B80000,0x5B80000,0x8BFC0000,0xAFFC0000,0xC3F80000,0xCE000000,0xCE000000,0x4DFC0000,0x99FC0000,0xCDCC0000,0xCE000000, -0xA5FC0000,0x1480F1E,0xFF340795,0xEB2C05ED,0xCF2C05ED,0xFF2404B6,0xF31801C5,0xD31C0286,0xDB14034A,0xCD1001BD,0xC114034A,0xFF0C06AE,0xF8F80154,0xD508020F,0xE0F001B6,0xCCF80006,0xC0FC01BE,0xCEF405EB,0xC6EC020D,0xBAEC028A,0xB2F405EB,0x3E80F1A,0xFECC05F1,0xCEF805EA,0xF4A8034A,0xD0D001B2,0xC0DC034A,0xE69005EB,0xCEA00153,0xBEB001C3,0xB2C005EA,0x79FC0F1A, -0xCE4805EA,0xC02C034A,0xB21005EA,0xA4000F1A,0xFF340865,0xFB440CAA,0xFD480D3E,0xFF1402EF,0xFEFC0019,0xE2F80004,0xD1000025,0xCAF40023,0xFF240839,0xFF000256,0xD2E80159,0xBEB001C3,0x57FC0F1A,0x16405ED,0xFF5001C8,0xE34C0154,0xCF480154,0xFF3C01D5,0xE9300002,0xCF380029,0xD33001A5,0xC92C004C,0xC13001A5,0x19FC05EA,0xF4FC0153,0xCF200152,0xE6E401A5,0xCCF80005, -0xC10801A5,0x8FF805EA,0xCE9C0152,0xC08801A5,0xB20005EA,0x19FC05EA,0xF4FC0153,0xCF200152,0xE6E401A5,0xCCF80005,0xC10801A5,0x8FF805EA,0xCE9C0152,0xC08801A5,0xB20005EA,0x8FF805EA,0xCE9C0152,0xC08801A5,0xB20005EA,0xB20005EA,0xFF500376,0xF96004ED,0xFB6404E4,0xFF2C016D,0xFEFC0010,0xE2F80003,0xCF080001,0xCCE0000C,0xFD40039E,0xFF180122,0xD4DC0152,0xC08801A5, 
-0x71FC05EA,0x12C05ED,0x12C05ED,0x12C05ED,0x12C05ED,0xFD1401A9,0xFD1401A9,0xFD1401A9,0xC51401A5,0xC51401A5,0xB11401A6,0xF8F80153,0xF8F80153,0xF8F80153,0xCCF80002,0xCCF80002,0xB300004B,0xB8F80153,0xB8F80153,0xACF4002A,0xA4F40153,0x1C005EA,0x1C005EA,0x1C005EA,0xD8C401A5,0xD8C401A5,0xB0EC01A6,0xCAAC0153,0xCAAC0153,0xB2C00001,0xA4CC0152,0x65F805EA, -0x65F805EA,0xB06C01A6,0xA4480152,0x960005EA,0xFF2002FE,0xFD280492,0x12C05ED,0xFF080103,0xFEF80003,0xE0F80002,0xD8F8000B,0xC4F40002,0xFF0C02D3,0xFEF400CA,0xCEEC0153,0xB2C00001,0x3DF805EA,0x1480154,0x1480154,0x1480154,0x1480154,0xE7300000,0xE7300000,0xE7300000,0xBF300000,0xBF300000,0xB1300001,0x3E80152,0x3E80152,0x3E80152,0xC9000001,0xC9000001, -0xB1180001,0x79FC0152,0x79FC0152,0xB0C80001,0xA4000152,0x3E80152,0x3E80152,0x3E80152,0xC9000001,0xC9000001,0xB1180001,0x79FC0152,0x79FC0152,0xB0C80001,0xA4000152,0x79FC0152,0x79FC0152,0xB0C80001,0xA4000152,0xA4000152,0xFB3400A4,0xFB4400C8,0x1480154,0xFF180041,0xFCFC0000,0xDEFC0000,0xD1080000,0xC6F00000,0xF92800B5,0xFF04003A,0x57FC0152,0xB0C80001, -0x57FC0152,0x18001A5,0xFF680008,0xDD640000,0xCF640000,0x41FC01A5,0xEB2C0000,0xCF480000,0xA1FC01A5,0xCEEC0000,0xC00001A5,0x41FC01A5,0xEB2C0000,0xCF480000,0xA1FC01A5,0xCEEC0000,0xC00001A5,0xA1FC01A5,0xCEEC0000,0xC00001A5,0xC00001A5,0x41FC01A5,0xEB2C0000,0xCF480000,0xA1FC01A5,0xCEEC0000,0xC00001A5,0xA1FC01A5,0xCEEC0000,0xC00001A5,0xC00001A5,0xA1FC01A5, -0xCEEC0000,0xC00001A5,0xC00001A5,0xC00001A5,0xFD680120,0x19C01A5,0xF77C0139,0xFF400071,0xFEFC0010,0xE4F00000,0xCF100000,0xCEC00000,0xFF5C0109,0xFF2C0064,0xD3180000,0xC00001A5,0x89FC01A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0x11401A5,0xD0F80001,0xD0F80001,0xD0F80001,0xD0F80001,0xD0F80001, -0xD0F80001,0xA4F80001,0xA4F80001,0xA4F80001,0x96F40002,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0xB2C00000,0xB2C00000,0xB2C00000,0x96DC0001,0x51F801A5,0x51F801A5,0x51F801A5,0x96840001,0x880001A5,0xFF0C00DD,0x11401A5,0x11401A5,0xFD00003D,0xFEF80002,0xE4F80001,0xE4F80001,0xC0F80001,0xFEF400AA,0xFAF00032,0xB4F00001,0xB2C00000, -0x21FC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table117[] = { -0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x1840000,0x47FC0000, -0x47FC0000,0x47FC0000,0x47FC0000,0x82000000,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1040001,0x1180000,0x1180000,0x1180000,0x1840000,0x17FC0000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000, -0x1DC0000,0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x73F80000,0x73F80000,0x73F80000,0x9E000001,0x9E000001,0x3540000,0x1400000,0x1400000,0x1740000,0x1900000,0x1B40000,0x1B40000,0x17FC0000,0x1740000,0x1900000,0x4DFC0000,0x73F80000, -0x4DFC0000,0x1740000,0x1740000,0x1740000,0x1740000,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x99FC0000,0x99FC0000,0xB8000001,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x99FC0000,0x99FC0000,0xB8000001,0x99FC0000,0x99FC0000,0xB8000001,0xB8000001,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x99FC0000,0x99FC0000,0xB8000001,0x99FC0000,0x99FC0000,0xB8000001,0xB8000001,0x99FC0000, 
-0x99FC0000,0xB8000001,0xB8000001,0xB8000001,0x1B00000,0x18C0000,0x1740000,0x3F80000,0x57FC0000,0x7FF80000,0x8BFC0000,0xA1FC0000,0x5D00000,0x2FFC0000,0x7FF80000,0xB8000001,0x7FF80000,0x1AC0001,0x87FC0000,0xC5F80000,0xD6000000,0x87FC0000,0xC5F80000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0x87FC0000,0xC5F80000,0xD6000000,0xC5F80000,0xD6000000, -0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0x87FC0000,0xC5F80000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0xC5F80000,0xD6000000,0xD6000000,0xD6000000,0xD6000000,0x3DFC0000,0xDC80000,0xDC80000,0x9DFC0000,0xBDF80000,0xCDF80000,0xD6000000,0xD6000000,0x6BFC0000,0xABFC0000,0xD5DC0000,0xD6000000, -0xB5FC0000,0x1580F1E,0xFF4407E9,0xF33C05ED,0xD73C05ED,0xFF300542,0xFB2801C5,0xDB2C0286,0xE324034A,0xD52001BD,0xC924034A,0xFF240716,0xFF080155,0xDD18020F,0xE90001B6,0xD5080006,0xC90C01BE,0xD70405EB,0xCEFC020D,0xC2FC028A,0xBB0405EB,0x7FC0F1A,0xFEE40611,0xD70805EA,0xFCB8034A,0xD8E001B2,0xC8EC034A,0xEEA005EB,0xD6B00153,0xC6C001C3,0xBAD005EA,0x85FC0F1A, -0xD65805EA,0xC83C034A,0xBA2005EA,0xAC000F1A,0xFF4408FE,0xFF4C0CEA,0xF5580D89,0xFF2C03A6,0xFF10005D,0xEB080004,0xD9100025,0xD3040023,0xFF340886,0xFF180302,0xDAF80159,0xC6C001C3,0x65FC0F1A,0x17405ED,0xFF60020D,0xEB5C0154,0xD7580154,0xFF5001FD,0xF1400002,0xD7480029,0xDB4001A5,0xD13C004C,0xC94001A5,0x31FC05EA,0xFD0C0153,0xD7300152,0xEEF401A5,0xD5080005, -0xC91801A5,0x9BF805EA,0xD6AC0152,0xC89801A5,0xBA0005EA,0x31FC05EA,0xFD0C0153,0xD7300152,0xEEF401A5,0xD5080005,0xC91801A5,0x9BF805EA,0xD6AC0152,0xC89801A5,0xBA0005EA,0x9BF805EA,0xD6AC0152,0xC89801A5,0xBA0005EA,0xBA0005EA,0xFF5803C9,0xFF6C04F5,0xFF6C0504,0xFF4001AA,0xFF18003B,0xEB080003,0xD7180001,0xD4F0000C,0xFF5403C9,0xFF2C017D,0xDCEC0152,0xC89801A5, -0x7FFC05EA,0x13C05ED,0x13C05ED,0x13C05ED,0x13C05ED,0xFF2401B5,0xFF2401B5,0xFF2401B5,0xCD2401A5,0xCD2401A5,0xB92401A6,0xFD080155,0xFD080155,0xFD080155,0xD5080002,0xD5080002,0xBB10004B,0xC1080153,0xC1080153,0xB504002A,0xAD040153,0x1D805EA,0x1D805EA,0x1D805EA,0xE0D401A5,0xE0D401A5,0xB8FC01A6,0xD2BC0153,0xD2BC0153,0xBAD00001,0xACDC0152,0x71F805EA, -0x71F805EA,0xB87C01A6,0xAC580152,0x9E0005EA,0xFF2C032B,0xF53804C1,0x13C05ED,0xFF1C0141,0xFF0C0013,0xE9080002,0xE108000B,0xCD040002,0xFF2002F9,0xFF100109,0xD6FC0153,0xBAD00001,0x4BFC05EA,0x1580154,0x1580154,0x1580154,0x1580154,0xEF400000,0xEF400000,0xEF400000,0xC7400000,0xC7400000,0xB9400001,0x7FC0152,0x7FC0152,0x7FC0152,0xD1100001,0xD1100001, -0xB9280001,0x85FC0152,0x85FC0152,0xB8D80001,0xAC000152,0x7FC0152,0x7FC0152,0x7FC0152,0xD1100001,0xD1100001,0xB9280001,0x85FC0152,0x85FC0152,0xB8D80001,0xAC000152,0x85FC0152,0x85FC0152,0xB8D80001,0xAC000152,0xAC000152,0xF74800B5,0xF35400DD,0x1580154,0xFD300055,0xFD140002,0xE70C0000,0xD9180000,0xCF000000,0xFF3400B9,0xFD200048,0x65FC0152,0xB8D80001, -0x65FC0152,0x19001A5,0xFF78001D,0xE5740000,0xD7740000,0x59FC01A5,0xF33C0000,0xD7580000,0xADFC01A5,0xD6FC0000,0xC80001A5,0x59FC01A5,0xF33C0000,0xD7580000,0xADFC01A5,0xD6FC0000,0xC80001A5,0xADFC01A5,0xD6FC0000,0xC80001A5,0xC80001A5,0x59FC01A5,0xF33C0000,0xD7580000,0xADFC01A5,0xD6FC0000,0xC80001A5,0xADFC01A5,0xD6FC0000,0xC80001A5,0xC80001A5,0xADFC01A5, -0xD6FC0000,0xC80001A5,0xC80001A5,0xC80001A5,0xFF780122,0x1AC01A5,0xFF8C0139,0xFF580091,0xFF240020,0xED000000,0xD7200000,0xD6D00000,0xFF700120,0xFF40007D,0xDB280000,0xC80001A5,0x99FC01A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0x12401A5,0xD9080001,0xD9080001,0xD9080001,0xD9080001,0xD9080001, 
-0xD9080001,0xAD080001,0xAD080001,0xAD080001,0x9F040002,0x1B001A5,0x1B001A5,0x1B001A5,0x1B001A5,0x1B001A5,0x1B001A5,0xBAD00000,0xBAD00000,0xBAD00000,0x9EEC0001,0x5DF401A5,0x5DF401A5,0x5DF401A5,0x9E940001,0x900001A5,0xF71C00F2,0x12401A5,0x12401A5,0xFF10004A,0xFD08000A,0xED080001,0xED080001,0xC9080001,0xFD0C00B5,0xFF00003D,0xBD000001,0xBAD00000, -0x31FC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table118[] = { -0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x53FC0000, -0x53FC0000,0x53FC0000,0x53FC0000,0x8A000000,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1140001,0x1280000,0x1280000,0x1280000,0x19C0000,0x25FC0000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000, -0x1F40000,0x7FF80000,0x7FF80000,0x7FF80000,0xA6000001,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x1F40000,0x7FF80000,0x7FF80000,0x7FF80000,0xA6000001,0x7FF80000,0x7FF80000,0x7FF80000,0xA6000001,0xA6000001,0xB640000,0x1500000,0x1500000,0x7840000,0x1A40000,0x1C80000,0x1C80000,0x2BFC0000,0x7840000,0x1A40000,0x5DF80000,0x7FF80000, -0x5DF80000,0x1840000,0x1840000,0x1840000,0x1840000,0x47FC0000,0x47FC0000,0x47FC0000,0xA5F80000,0xA5F80000,0xC0000001,0x47FC0000,0x47FC0000,0x47FC0000,0xA5F80000,0xA5F80000,0xC0000001,0xA5F80000,0xA5F80000,0xC0000001,0xC0000001,0x47FC0000,0x47FC0000,0x47FC0000,0xA5F80000,0xA5F80000,0xC0000001,0xA5F80000,0xA5F80000,0xC0000001,0xC0000001,0xA5F80000, -0xA5F80000,0xC0000001,0xC0000001,0xC0000001,0x1C40000,0x79C0000,0x1840000,0x1BFC0000,0x6BFC0000,0x8DFC0000,0x99FC0000,0xADF80000,0x5E40000,0x47FC0000,0x8DFC0000,0xC0000001,0x8DFC0000,0x1BC0001,0x9FFC0000,0xD1F80000,0xDE000000,0x9FFC0000,0xD1F80000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0x9FFC0000,0xD1F80000,0xDE000000,0xD1F80000,0xDE000000, -0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0x9FFC0000,0xD1F80000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0xD1F80000,0xDE000000,0xDE000000,0xDE000000,0xDE000000,0x63FC0000,0x1DC0000,0x1DC0000,0xB1FC0000,0xC9FC0000,0xD7F80000,0xDE000000,0xDE000000,0x89FC0000,0xBBFC0000,0xDDEC0000,0xDE000000, -0xC3FC0000,0x1680F1E,0xFF5C0861,0xFB4C05ED,0xDF4C05ED,0xFF4405BA,0xFF3801D5,0xE33C0286,0xEB34034A,0xDD3001BD,0xD134034A,0xFF300786,0xFF1C017C,0xE528020F,0xF11001B6,0xDD180006,0xD11C01BE,0xDF1405EB,0xD70C020D,0xCB0C028A,0xC31405EB,0x1FFC0F1A,0xFEFC0651,0xDF1805EA,0xFED4035A,0xE0F001B2,0xD0FC034A,0xF6B005EB,0xDEC00153,0xCED001C3,0xC2E005EA,0x91FC0F1A, -0xDE6805EA,0xD04C034A,0xC23005EA,0xB4000F1A,0xFF58097A,0xFB640D0E,0xFD680D89,0xFF400463,0xFF2400D1,0xF3180004,0xE1200025,0xDB140023,0xFF440933,0xFF3003D3,0xE3080159,0xCED001C3,0x75FC0F1A,0x18405ED,0xFF740248,0xF36C0154,0xDF680154,0xFF5C0239,0xF9500002,0xDF580029,0xE35001A5,0xD94C004C,0xD15001A5,0x49FC05EA,0xFF280163,0xDF400152,0xF70401A5,0xDD180005, -0xD12801A5,0xA7F805EA,0xDEBC0152,0xD0A801A5,0xC20005EA,0x49FC05EA,0xFF280163,0xDF400152,0xF70401A5,0xDD180005,0xD12801A5,0xA7F805EA,0xDEBC0152,0xD0A801A5,0xC20005EA,0xA7F805EA,0xDEBC0152,0xD0A801A5,0xC20005EA,0xC20005EA,0xFD7403F6,0xF980051D,0xFB840515,0xFF540209,0xFF30007D,0xF3180003,0xDF280001,0xDD00000C,0xFF5C040A,0xFF4801C4,0xE4FC0152,0xD0A801A5, 
-0x8FFC05EA,0x14C05ED,0x14C05ED,0x14C05ED,0x14C05ED,0xFD3801D5,0xFD3801D5,0xFD3801D5,0xD53401A5,0xD53401A5,0xC13401A6,0xFF1C0163,0xFF1C0163,0xFF1C0163,0xDD180002,0xDD180002,0xC320004B,0xC9180153,0xC9180153,0xBD14002A,0xB5140153,0x1F005EA,0x1F005EA,0x1F005EA,0xE8E401A5,0xE8E401A5,0xC10C01A6,0xDACC0153,0xDACC0153,0xC2E00001,0xB4EC0152,0x7DF805EA, -0x7DF805EA,0xC08C01A6,0xB4680152,0xA60005EA,0xFF3C035D,0xFD4804C1,0x14C05ED,0xFF2C0182,0xFF200033,0xF1180002,0xE918000B,0xD5140002,0xFF340322,0xFF1C014E,0xDF0C0153,0xC2E00001,0x5BFC05EA,0x1680154,0x1680154,0x1680154,0x1680154,0xF7500000,0xF7500000,0xF7500000,0xCF500000,0xCF500000,0xC1500001,0x1FFC0152,0x1FFC0152,0x1FFC0152,0xD9200001,0xD9200001, -0xC1380001,0x91FC0152,0x91FC0152,0xC0E80001,0xB4000152,0x1FFC0152,0x1FFC0152,0x1FFC0152,0xD9200001,0xD9200001,0xC1380001,0x91FC0152,0x91FC0152,0xC0E80001,0xB4000152,0x91FC0152,0x91FC0152,0xC0E80001,0xB4000152,0xB4000152,0xFF5800B5,0xFB6400DD,0x1680154,0xFF440062,0xFD28000A,0xEF1C0000,0xE1280000,0xD7100000,0xFD4C00C8,0xFF300059,0x75FC0152,0xC0E80001, -0x75FC0152,0x1A001A5,0xFF8C0034,0xED840000,0xDF840000,0x71FC01A5,0xFB4C0000,0xDF680000,0xB9FC01A5,0xDF0C0000,0xD00001A5,0x71FC01A5,0xFB4C0000,0xDF680000,0xB9FC01A5,0xDF0C0000,0xD00001A5,0xB9FC01A5,0xDF0C0000,0xD00001A5,0xD00001A5,0x71FC01A5,0xFB4C0000,0xDF680000,0xB9FC01A5,0xDF0C0000,0xD00001A5,0xB9FC01A5,0xDF0C0000,0xD00001A5,0xD00001A5,0xB9FC01A5, -0xDF0C0000,0xD00001A5,0xD00001A5,0xD00001A5,0xFD900139,0x1BC01A5,0xF79C0154,0xFF6C00AA,0xFF3C003A,0xF5100000,0xDF300000,0xDEE00000,0xF7880139,0xFF6400A2,0xE3380000,0xD00001A5,0xA7FC01A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0x13401A5,0xE1180001,0xE1180001,0xE1180001,0xE1180001,0xE1180001, -0xE1180001,0xB5180001,0xB5180001,0xB5180001,0xA7140002,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0xC2E00000,0xC2E00000,0xC2E00000,0xA6FC0001,0x67FC01A5,0x67FC01A5,0x67FC01A5,0xA6A40001,0x980001A5,0xFF2C00F2,0x13401A5,0x13401A5,0xFF200059,0xFD1C0012,0xF5180001,0xF5180001,0xD1180001,0xFD1C00C8,0xFF140048,0xC5100001,0xC2E00000, -0x3FFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table119[] = { -0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x5FF80000, -0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x5380000,0x5380000,0x5380000,0x1B40000,0x35FC0000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0x1600000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000, -0xFFC0000,0x8BF80000,0x8BF80000,0x8BF80000,0xAE000001,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0xFFC0000,0x8BF80000,0x8BF80000,0x8BF80000,0xAE000001,0x8BF80000,0x8BF80000,0x8BF80000,0xAE000001,0xAE000001,0x1780000,0x1600000,0x1600000,0x3980000,0x1B80000,0x3DC0000,0x3DC0000,0x3DFC0000,0x3980000,0x1B80000,0x6BFC0000,0x8BF80000, -0x6BFC0000,0x1940000,0x1940000,0x1940000,0x1940000,0x5FFC0000,0x5FFC0000,0x5FFC0000,0xB1F80000,0xB1F80000,0xC8000001,0x5FFC0000,0x5FFC0000,0x5FFC0000,0xB1F80000,0xB1F80000,0xC8000001,0xB1F80000,0xB1F80000,0xC8000001,0xC8000001,0x5FFC0000,0x5FFC0000,0x5FFC0000,0xB1F80000,0xB1F80000,0xC8000001,0xB1F80000,0xB1F80000,0xC8000001,0xC8000001,0xB1F80000, 
-0xB1F80000,0xC8000001,0xC8000001,0xC8000001,0x3D40000,0xFAC0000,0x1940000,0x39FC0000,0x7DFC0000,0x9DF80000,0xA7F80000,0xB7FC0000,0x5F80000,0x5FFC0000,0x9DF80000,0xC8000001,0x9DF80000,0x1CC0001,0xB7FC0000,0xDDF40000,0xE6000000,0xB7FC0000,0xDDF40000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xB7FC0000,0xDDF40000,0xE6000000,0xDDF40000,0xE6000000, -0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0xB7FC0000,0xDDF40000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0xE6000000,0x8BFC0000,0x1EC0000,0x1EC0000,0xC5FC0000,0xD7FC0000,0xE1FC0000,0xE6000000,0xE6000000,0xA7FC0000,0xCDFC0000,0xE5FC0000,0xE6000000, -0xD3FC0000,0x1780F1E,0xFF6808D9,0xFF6005F1,0xE75C05ED,0xFF5C0662,0xFF480215,0xEB4C0286,0xF344034A,0xE54001BD,0xD944034A,0xFF4407F7,0xFF3401DC,0xED38020F,0xF92001B6,0xE5280006,0xD92C01BE,0xE72405EB,0xDF1C020D,0xD31C028A,0xCB2405EB,0x37FC0F1A,0xFF1406B1,0xE72805EA,0xFEF00392,0xE90001B2,0xD90C034A,0xFEC005EB,0xE6D00153,0xD6E001C3,0xCAF005EA,0x9DFC0F1A, -0xE67805EA,0xD85C034A,0xCA4005EA,0xBC000F1A,0xFF6409F4,0xFF6C0D5E,0xF5780DD6,0xFF54052A,0xFF380175,0xFB280004,0xE9300025,0xE3240023,0xFF5C09AA,0xFF40048F,0xEB180159,0xD6E001C3,0x83FC0F1A,0x19405ED,0xFF8402A5,0xFB7C0154,0xE7780154,0xFF740279,0xFF600004,0xE7680029,0xEB6001A5,0xE15C004C,0xD96001A5,0x63FC05EA,0xFF40018B,0xE7500152,0xFF1401A5,0xE5280005, -0xD93801A5,0xB3F805EA,0xE6CC0152,0xD8B801A5,0xCA0005EA,0x63FC05EA,0xFF40018B,0xE7500152,0xFF1401A5,0xE5280005,0xD93801A5,0xB3F805EA,0xE6CC0152,0xD8B801A5,0xCA0005EA,0xB3F805EA,0xE6CC0152,0xD8B801A5,0xCA0005EA,0xCA0005EA,0xFF800435,0xFF8C052D,0xFF8C053D,0xFF6C026A,0xFF4800CE,0xFB280003,0xE7380001,0xE510000C,0xFF78042A,0xFF5C022E,0xED0C0152,0xD8B801A5, -0x9FF805EA,0x15C05ED,0x15C05ED,0x15C05ED,0x15C05ED,0xFF4801F1,0xFF4801F1,0xFF4801F1,0xDD4401A5,0xDD4401A5,0xC94401A6,0xFF30017D,0xFF30017D,0xFF30017D,0xE5280002,0xE5280002,0xCB30004B,0xD1280153,0xD1280153,0xC524002A,0xBD240153,0xDFC05EA,0xDFC05EA,0xDFC05EA,0xF0F401A5,0xF0F401A5,0xC91C01A6,0xE2DC0153,0xE2DC0153,0xCAF00001,0xBCFC0152,0x89F805EA, -0x89F805EA,0xC89C01A6,0xBC780152,0xAE0005EA,0xFF4C038A,0xF75C04EE,0x15C05ED,0xFF4401C3,0xFF300062,0xF9280002,0xF128000B,0xDD240002,0xFF3C0371,0xFF300182,0xE71C0153,0xCAF00001,0x69FC05EA,0x1780154,0x1780154,0x1780154,0x1780154,0xFF600000,0xFF600000,0xFF600000,0xD7600000,0xD7600000,0xC9600001,0x37FC0152,0x37FC0152,0x37FC0152,0xE1300001,0xE1300001, -0xC9480001,0x9DFC0152,0x9DFC0152,0xC8F80001,0xBC000152,0x37FC0152,0x37FC0152,0x37FC0152,0xE1300001,0xE1300001,0xC9480001,0x9DFC0152,0x9DFC0152,0xC8F80001,0xBC000152,0x9DFC0152,0x9DFC0152,0xC8F80001,0xBC000152,0xBC000152,0xFB6C00C8,0xF37400F4,0x1780154,0xFD580071,0xFF400012,0xF72C0000,0xE9380000,0xDF200000,0xF56400DD,0xFF480064,0x83FC0152,0xC8F80001, -0x83FC0152,0x1B001A5,0xFF9C0061,0xF5940000,0xE7940000,0x89FC01A5,0xFF640002,0xE7780000,0xC5FC01A5,0xE71C0000,0xD80001A5,0x89FC01A5,0xFF640002,0xE7780000,0xC5FC01A5,0xE71C0000,0xD80001A5,0xC5FC01A5,0xE71C0000,0xD80001A5,0xD80001A5,0x89FC01A5,0xFF640002,0xE7780000,0xC5FC01A5,0xE71C0000,0xD80001A5,0xC5FC01A5,0xE71C0000,0xD80001A5,0xD80001A5,0xC5FC01A5, -0xE71C0000,0xD80001A5,0xD80001A5,0xD80001A5,0xF7A40152,0x1CC01A5,0xFFAC0154,0xFF8400D0,0xFF640062,0xFD200000,0xE7400000,0xE6F00000,0xFF980139,0xFD8000C8,0xEB480000,0xD80001A5,0xB7FC01A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0x14401A5,0xE9280001,0xE9280001,0xE9280001,0xE9280001,0xE9280001, 
-0xE9280001,0xBD280001,0xBD280001,0xBD280001,0xAF240002,0x1E001A5,0x1E001A5,0x1E001A5,0x1E001A5,0x1E001A5,0x1E001A5,0xCAF00000,0xCAF00000,0xCAF00000,0xAF0C0001,0x73FC01A5,0x73FC01A5,0x73FC01A5,0xAEB40001,0xA00001A5,0xF73C0109,0x14401A5,0x14401A5,0xFB340071,0xFB2C0022,0xFD280001,0xFD280001,0xD9280001,0xF93000DD,0xFF200061,0xCD200001,0xCAF00000, -0x4FFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table120[] = { -0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x6DF80000, -0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x14C0000,0x14C0000,0x14C0000,0x1D00000,0x45FC0000,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x1700001,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000, -0x2BFC0000,0x97FC0000,0x97FC0000,0x97FC0000,0xB8000000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x2BFC0000,0x97FC0000,0x97FC0000,0x97FC0000,0xB8000000,0x97FC0000,0x97FC0000,0x97FC0000,0xB8000000,0xB8000000,0xD880000,0x1700001,0x1700001,0x5AC0000,0x1D00000,0x1F80000,0x1F80000,0x53FC0000,0x5AC0000,0x1D00000,0x7DF80000,0x97FC0000, -0x7DF80000,0x1A40001,0x1A40001,0x1A40001,0x1A40001,0x7BFC0000,0x7BFC0000,0x7BFC0000,0xBFF80000,0xBFF80000,0xD2000000,0x7BFC0000,0x7BFC0000,0x7BFC0000,0xBFF80000,0xBFF80000,0xD2000000,0xBFF80000,0xBFF80000,0xD2000000,0xD2000000,0x7BFC0000,0x7BFC0000,0x7BFC0000,0xBFF80000,0xBFF80000,0xD2000000,0xBFF80000,0xBFF80000,0xD2000000,0xD2000000,0xBFF80000, -0xBFF80000,0xD2000000,0xD2000000,0xD2000000,0x7E80000,0x9C00000,0x1A40001,0x5BFC0000,0x93FC0000,0xADFC0000,0xB5FC0000,0xC3FC0000,0x29FC0000,0x7BFC0000,0xADFC0000,0xD2000000,0xADFC0000,0x1E00000,0xD3FC0000,0xE9FC0000,0xEE000001,0xD3FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xD3FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xEE000001, -0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0xD3FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0xEE000001,0xB7FC0000,0x17FC0000,0x17FC0000,0xDBFC0000,0xE7F80000,0xEDF40000,0xEE000001,0xEE000001,0xC9FC0000,0xE1FC0000,0xEFF00000,0xEE000001, -0xE3FC0000,0x18C0F1A,0xFF800983,0xFF70062A,0xEF7005EB,0xFF680746,0xFF6002AB,0xF35C028A,0xFD54034A,0xED5401BE,0xE154034A,0xFF5C0899,0xFF4C028C,0xF548020D,0xFF3801BA,0xEF380006,0xE33C01BD,0xEF3805EA,0xE72C020F,0xDD300286,0xD33805ED,0x53FC0F1A,0xFF38073B,0xF13805EB,0xFF140416,0xF11001B6,0xE120034A,0xFEE40615,0xEEE40154,0xDEF001C5,0xD50005ED,0xABF80F1A, -0xEE8C05EB,0xE070034A,0xD25405ED,0xC4000F1E,0xFF780A99,0xFD880D72,0xFD880DDA,0xFF6C0649,0xFF500261,0xFF3C001F,0xF1400023,0xEB340025,0xFF700A53,0xFF54059A,0xF52C015B,0xDEF001C5,0x95FC0F1A,0x1A805EA,0xFF980303,0xFF8C0162,0xEF8C0153,0xFF8C02DE,0xFF78003B,0xF17C002A,0xF57001A6,0xEB70004B,0xE17001A6,0x7DFC05EA,0xFF5801E4,0xEF640153,0xFF3801BA,0xEF3C0002, -0xE14C01A5,0xBFFC05EA,0xEEE40153,0xE0CC01A5,0xD20005ED,0x7DFC05EA,0xFF5801E4,0xEF640153,0xFF3801BA,0xEF3C0002,0xE14C01A5,0xBFFC05EA,0xEEE40153,0xE0CC01A5,0xD20005ED,0xBFFC05EA,0xEEE40153,0xE0CC01A5,0xD20005ED,0xD20005ED,0xFF94046D,0xFBA4054E,0xFBA4054B,0xFF8002E8,0xFF680146,0xFF400011,0xF14C0002,0xEF24000B,0xFD940481,0xFF7402C9,0xF71C0152,0xE0CC01A5, 
-0xAFFC05EA,0x17005EA,0x17005EA,0x17005EA,0x17005EA,0xFF5C0221,0xFF5C0221,0xFF5C0221,0xE75401A5,0xE75401A5,0xD35401A5,0xFF4401A8,0xFF4401A8,0xFF4401A8,0xED380005,0xED380005,0xD544004C,0xDB380152,0xDB380152,0xCF380029,0xC5380154,0x29FC05EA,0x29FC05EA,0x29FC05EA,0xF90801A5,0xF90801A5,0xD32C01A5,0xECEC0153,0xECEC0153,0xD3000002,0xC5100154,0x97F805EA, -0x97F805EA,0xD2AC01A5,0xC4900154,0xB60005ED,0xFD6403CC,0xFF6C04ED,0x17005EA,0xFF540211,0xFF4400AD,0xFF3C0006,0xFB3C000C,0xE7380001,0xFF58039E,0xFD4C01E2,0xF12C0152,0xD3000002,0x7BFC05EA,0x18C0152,0x18C0152,0x18C0152,0x18C0152,0xFF740005,0xFF740005,0xFF740005,0xDF740001,0xDF740001,0xD3700001,0x53FC0152,0x53FC0152,0x53FC0152,0xE9440001,0xE9440001, -0xD3580000,0xABF80152,0xABF80152,0xD3080000,0xC4000154,0x53FC0152,0x53FC0152,0x53FC0152,0xE9440001,0xE9440001,0xD3580000,0xABF80152,0xABF80152,0xD3080000,0xC4000154,0xABF80152,0xABF80152,0xD3080000,0xC4000154,0xC4000154,0xF78000DD,0xFD8800F2,0x18C0152,0xFB700091,0xFF540028,0xFF400001,0xF3480000,0xE7340000,0xFD7400DD,0xFF5C007D,0x95FC0152,0xD3080000, -0x95FC0152,0x1C401A5,0xFFB00092,0xFDA80001,0xEFA40002,0xA5FC01A5,0xFF880020,0xEF8C0001,0xD3FC01A5,0xEF340001,0xE00001A5,0xA5FC01A5,0xFF880020,0xEF8C0001,0xD3FC01A5,0xEF340001,0xE00001A5,0xD3FC01A5,0xEF340001,0xE00001A5,0xE00001A5,0xA5FC01A5,0xFF880020,0xEF8C0001,0xD3FC01A5,0xEF340001,0xE00001A5,0xD3FC01A5,0xEF340001,0xE00001A5,0xE00001A5,0xD3FC01A5, -0xEF340001,0xE00001A5,0xE00001A5,0xE00001A5,0xFFB40154,0x1E001A5,0xF9C0016D,0xFFA000FA,0xFF7C0092,0xFF48000A,0xEF540001,0xEF080001,0xFFAC015A,0xFF9400E9,0xF5580000,0xE00001A5,0xC7FC01A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0x15401A5,0xF3380000,0xF3380000,0xF3380000,0xF3380000,0xF3380000, -0xF3380000,0xC7380000,0xC7380000,0xC7380000,0xB9380000,0x1F801A5,0x1F801A5,0x1F801A5,0x1F801A5,0x1F801A5,0x1F801A5,0xD5000000,0xD5000000,0xD5000000,0xB91C0000,0x81FC01A5,0x81FC01A5,0x81FC01A5,0xB8C00000,0xAA0001A5,0xFF4C010D,0x15401A5,0x15401A5,0xFD480080,0xFD400034,0xFF3C0005,0xFF3C0005,0xE3380000,0xFF3C00E9,0xFF3C0071,0xD7300000,0xD5000000, -0x5FFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table121[] = { -0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0x79F80000, -0x79F80000,0x79F80000,0x79F80000,0xA2000001,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x1480000,0x75C0000,0x75C0000,0x75C0000,0x1E80000,0x55FC0000,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x1800001,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000, -0x43FC0000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0x43FC0000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xC0000000,0xC0000000,0x19C0000,0x1800001,0x1800001,0x1C00000,0x1E40000,0x17FC0000,0x17FC0000,0x67FC0000,0x1C00000,0x1E40000,0x8BFC0000,0xA3FC0000, -0x8BFC0000,0x1B40001,0x1B40001,0x1B40001,0x1B40001,0x93FC0000,0x93FC0000,0x93FC0000,0xCBF80000,0xCBF80000,0xDA000000,0x93FC0000,0x93FC0000,0x93FC0000,0xCBF80000,0xCBF80000,0xDA000000,0xCBF80000,0xCBF80000,0xDA000000,0xDA000000,0x93FC0000,0x93FC0000,0x93FC0000,0xCBF80000,0xCBF80000,0xDA000000,0xCBF80000,0xCBF80000,0xDA000000,0xDA000000,0xCBF80000, 
-0xCBF80000,0xDA000000,0xDA000000,0xDA000000,0x3FC0000,0x1D40000,0x1B40001,0x79FC0000,0xA7FC0000,0xBDF80000,0xC3FC0000,0xCFF80000,0x51FC0000,0x93FC0000,0xBDF80000,0xDA000000,0xBDF80000,0x1F00000,0xEBFC0000,0xF5FC0000,0xF6000001,0xEBFC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xEBFC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF6000001, -0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xEBFC0000,0xF5FC0000,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xF5FC0000,0xF6000001,0xF6000001,0xF6000001,0xF6000001,0xDFFC0000,0x97FC0000,0x97FC0000,0xEFFC0000,0xF5F80000,0xF7F40000,0xF6000001,0xF6000001,0xE7FC0000,0xF1FC0000,0xF9C40000,0xF6000001, -0xF3FC0000,0x19C0F1A,0xFF8C0A17,0xFF84069F,0xF78005EB,0xFF8007FE,0xFF700371,0xFB6C028A,0xFF640362,0xF56401BE,0xE964034A,0xFF740939,0xFF60036B,0xFD58020D,0xFF4C021D,0xF7480006,0xEB4C01BD,0xF74805EA,0xEF3C020F,0xE5400286,0xDB4805ED,0x6BFC0F1A,0xFF4C07E9,0xF94805EB,0xFF2C04BE,0xF92001B6,0xE930034A,0xFF080675,0xF6F40154,0xE70001C5,0xDD1005ED,0xB7F80F1A, -0xF69C05EB,0xE880034A,0xDA6405ED,0xCC000F1E,0xFF8C0B22,0xF5980DDA,0xF79C0E27,0xFF800738,0xFF680366,0xFF540098,0xF9500023,0xF3440025,0xFF800AF5,0xFF7006AA,0xFD3C015B,0xE70001C5,0xA3FC0F1A,0x1B805EA,0xFFA8037E,0xFFA0019A,0xF79C0153,0xFF980356,0xFF8800A9,0xF98C002A,0xFD8001A6,0xF380004B,0xE98001A6,0x95FC05EA,0xFF7C0248,0xF7740153,0xFF580205,0xF74C0002, -0xE95C01A5,0xCBFC05EA,0xF6F40153,0xE8DC01A5,0xDA0005ED,0x95FC05EA,0xFF7C0248,0xF7740153,0xFF580205,0xF74C0002,0xE95C01A5,0xCBFC05EA,0xF6F40153,0xE8DC01A5,0xDA0005ED,0xCBFC05EA,0xF6F40153,0xE8DC01A5,0xDA0005ED,0xDA0005ED,0xFBAC04B5,0xFFAC057E,0xF5B8057E,0xFF940349,0xFF7C01D6,0xFF5C005E,0xF95C0002,0xF734000B,0xFFA404B5,0xFF900329,0xFF2C0152,0xE8DC01A5, -0xBFF805EA,0x18005EA,0x18005EA,0x18005EA,0x18005EA,0xFF6C0266,0xFF6C0266,0xFF6C0266,0xEF6401A5,0xEF6401A5,0xDB6401A5,0xFF5C01E8,0xFF5C01E8,0xFF5C01E8,0xF5480005,0xF5480005,0xDD54004C,0xE3480152,0xE3480152,0xD7480029,0xCD480154,0x41FC05EA,0x41FC05EA,0x41FC05EA,0xFF1C01A6,0xFF1C01A6,0xDB3C01A5,0xF4FC0153,0xF4FC0153,0xDB100002,0xCD200154,0xA1FC05EA, -0xA1FC05EA,0xDABC01A5,0xCCA00154,0xBE0005ED,0xFF7403FE,0xF77C051E,0x18005EA,0xFF68026D,0xFF5800F9,0xFF50002D,0xFF4C000D,0xEF480001,0xFF6403EA,0xFF5C022E,0xF93C0152,0xDB100002,0x89FC05EA,0x19C0152,0x19C0152,0x19C0152,0x19C0152,0xFD880019,0xFD880019,0xFD880019,0xE7840001,0xE7840001,0xDB800001,0x6BFC0152,0x6BFC0152,0x6BFC0152,0xF1540001,0xF1540001, -0xDB680000,0xB7F80152,0xB7F80152,0xDB180000,0xCC000154,0x6BFC0152,0x6BFC0152,0x6BFC0152,0xF1540001,0xF1540001,0xDB680000,0xB7F80152,0xB7F80152,0xDB180000,0xCC000154,0xB7F80152,0xB7F80152,0xDB180000,0xCC000154,0xCC000154,0xFF9000DD,0xF5980109,0x19C0152,0xFD8400A2,0xFF70003D,0xFF58000A,0xFB580000,0xEF440000,0xFD8800F2,0xFF740095,0xA3FC0152,0xDB180000, -0xA3FC0152,0x1D401A5,0xFFC400C1,0xFFB80011,0xF7B40002,0xBDFC01A5,0xFFA00050,0xF79C0001,0xDFF801A5,0xF7440001,0xE80001A5,0xBDFC01A5,0xFFA00050,0xF79C0001,0xDFF801A5,0xF7440001,0xE80001A5,0xDFF801A5,0xF7440001,0xE80001A5,0xE80001A5,0xBDFC01A5,0xFFA00050,0xF79C0001,0xDFF801A5,0xF7440001,0xE80001A5,0xDFF801A5,0xF7440001,0xE80001A5,0xE80001A5,0xDFF801A5, -0xF7440001,0xE80001A5,0xE80001A5,0xE80001A5,0xFDCC016D,0x1F001A5,0xFFCC0179,0xFDBC0122,0xFFA800C8,0xFF70003A,0xF7640001,0xF7180001,0xF7CC016D,0xFFB80120,0xFD680000,0xE80001A5,0xD7FC01A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0x16401A5,0xFB480000,0xFB480000,0xFB480000,0xFB480000,0xFB480000, 
-0xFB480000,0xCF480000,0xCF480000,0xCF480000,0xC1480000,0x15FC01A5,0x15FC01A5,0x15FC01A5,0x15FC01A5,0x15FC01A5,0x15FC01A5,0xDD100000,0xDD100000,0xDD100000,0xC12C0000,0x8DFC01A5,0x8DFC01A5,0x8DFC01A5,0xC0D00000,0xB20001A5,0xF9600120,0x16401A5,0x16401A5,0xFF580091,0xFF500041,0xFF4C000D,0xFF4C000D,0xEB480000,0xFD5400F2,0xFD4C0082,0xDF400000,0xDD100000, -0x6FFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table122[] = { -0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x3FC0000,0x85F80000, -0x85F80000,0x85F80000,0x85F80000,0xAA000001,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0xF6C0000,0xF6C0000,0xF6C0000,0x3FC0000,0x63FC0000,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x1900001,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000, -0x5BFC0000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xC8000000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0x5BFC0000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xC8000000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xC8000000,0xC8000000,0x1AC0000,0x1900001,0x1900001,0x1D40000,0x1F80000,0x35FC0000,0x35FC0000,0x7BFC0000,0x1D40000,0x1F80000,0x9BFC0000,0xAFFC0000, -0x9BFC0000,0x1C40001,0x1C40001,0x1C40001,0x1C40001,0xABFC0000,0xABFC0000,0xABFC0000,0xD7F80000,0xD7F80000,0xE2000000,0xABFC0000,0xABFC0000,0xABFC0000,0xD7F80000,0xD7F80000,0xE2000000,0xD7F80000,0xD7F80000,0xE2000000,0xE2000000,0xABFC0000,0xABFC0000,0xABFC0000,0xD7F80000,0xD7F80000,0xE2000000,0xD7F80000,0xD7F80000,0xE2000000,0xE2000000,0xD7F80000, -0xD7F80000,0xE2000000,0xE2000000,0xE2000000,0x3BFC0000,0x1E40000,0x1C40001,0x97FC0000,0xBBFC0000,0xCBFC0000,0xD1FC0000,0xD9FC0000,0x77FC0000,0xABFC0000,0xCBFC0000,0xE2000000,0xCBFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1AC0EC7,0xFF9C0A97,0xFF940717,0xFF9005EA,0xFF9808B1,0xFF84044E,0xFF8002A5,0xFF7C03A7,0xFD7401B9,0xF174032D,0xFF8C09AC,0xFF700455,0xFF680234,0xFF6402A2,0xFF580005,0xF35C01A4,0xFF5805B3,0xF74C01FE,0xED500261,0xE35805B4,0x83FC0EC7,0xFF640876,0xFF5805ED,0xFF4C057F,0xFF3801B9,0xF140032D,0xFF2C06C8,0xFF040153,0xEF1001A4,0xE32005B5,0xC3F80EC7, -0xFEAC05EA,0xF090032D,0xE27405B4,0xD4000EC9,0xFFA00B5D,0xFDA80D89,0xFDA80DDA,0xFF8807F3,0xFF7C0465,0xFF680153,0xFF600030,0xFB54001C,0xFF980B2F,0xFF800755,0xFF50016C,0xEF1001A4,0xB3FC0EC7,0x1C805B3,0xFFBC03B6,0xFFAC01FB,0xFFAC0152,0xFFB00389,0xFFA00122,0xFF98002D,0xFF94019B,0xFB900042,0xF1900189,0xAFFC05B3,0xFF9402A5,0xFF840152,0xFF70024E,0xFF5C0001, -0xF16C0188,0xD7FC05B3,0xFF040152,0xF0EC0188,0xE20005B4,0xAFFC05B3,0xFF9402A5,0xFF840152,0xFF70024E,0xFF5C0001,0xF16C0188,0xD7FC05B3,0xFF040152,0xF0EC0188,0xE20005B4,0xD7FC05B3,0xFF040152,0xF0EC0188,0xE20005B4,0xE20005B4,0xFFBC04B5,0xFBC4054B,0xFBC4054B,0xFFB003A2,0xFF900261,0xFF7C00D1,0xFF6C0004,0xFD480009,0xFFB404BE,0xFFA40388,0xFF50016B,0xF0EC0188, 
-0xCDFC05B3,0x19005EA,0x19005EA,0x19005EA,0x19005EA,0xFF8002A5,0xFF8002A5,0xFF8002A5,0xF77401A5,0xF77401A5,0xE37401A5,0xFF680234,0xFF680234,0xFF680234,0xFD580005,0xFD580005,0xE564004C,0xEB580152,0xEB580152,0xDF580029,0xD5580154,0x59FC05EA,0x59FC05EA,0x59FC05EA,0xFF3801B9,0xFF3801B9,0xE34C01A5,0xFD0C0153,0xFD0C0153,0xE3200002,0xD5300154,0xADFC05EA, -0xADFC05EA,0xE2CC01A5,0xD4B00154,0xC60005ED,0xFF84042D,0xFF8C051E,0x19005EA,0xFF7802C9,0xFF6C0155,0xFF64006D,0xFF600030,0xF7580001,0xFF780411,0xFF740289,0xFF500153,0xE3200002,0x99FC05EA,0x1AC0152,0x1AC0152,0x1AC0152,0x1AC0152,0xFF98002D,0xFF98002D,0xFF98002D,0xEF940001,0xEF940001,0xE3900001,0x83FC0152,0x83FC0152,0x83FC0152,0xF9640001,0xF9640001, -0xE3780000,0xC3F80152,0xC3F80152,0xE3280000,0xD4000154,0x83FC0152,0x83FC0152,0x83FC0152,0xF9640001,0xF9640001,0xE3780000,0xC3F80152,0xC3F80152,0xE3280000,0xD4000154,0xC3F80152,0xC3F80152,0xE3280000,0xD4000154,0xD4000154,0xFFA000F4,0xFDA80109,0x1AC0152,0xFF9800B5,0xFF840059,0xFF740019,0xFF6C0004,0xF7540000,0xFF940104,0xFB9000B5,0xB3FC0152,0xE3280000, -0xB3FC0152,0x1E0018A,0xFFDC00F2,0xFFCC0049,0xFFC40001,0xD5FC0188,0xFFB80089,0xFFAC0000,0xEBF80188,0xFF540000,0xF0000188,0xD5FC0188,0xFFB80089,0xFFAC0000,0xEBF80188,0xFF540000,0xF0000188,0xEBF80188,0xFF540000,0xF0000188,0xF0000188,0xD5FC0188,0xFFB80089,0xFFAC0000,0xEBF80188,0xFF540000,0xF0000188,0xEBF80188,0xFF540000,0xF0000188,0xF0000188,0xEBF80188, -0xFF540000,0xF0000188,0xF0000188,0xF0000188,0xF3E0016D,0x27FC0188,0xF9E0016D,0xFDD40139,0xFFBC00E9,0xFF980074,0xFF740000,0xFF280000,0xFDD80154,0xFFCC0122,0xFF90000D,0xF0000188,0xE5FC0188,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0x17401A5,0xFD580004,0xFD580004,0xFD580004,0xFD580004,0xFD580004, -0xFD580004,0xD7580000,0xD7580000,0xD7580000,0xC9580000,0x2FFC01A5,0x2FFC01A5,0x2FFC01A5,0x2FFC01A5,0x2FFC01A5,0x2FFC01A5,0xE5200000,0xE5200000,0xE5200000,0xC93C0000,0x99FC01A5,0x99FC01A5,0x99FC01A5,0xC8E00000,0xBA0001A5,0xFF6C0128,0x17401A5,0x17401A5,0xFF6800A4,0xFF640055,0xFF5C001D,0xFF5C001D,0xF3580000,0xF9680109,0xFD6000A2,0xE7500000,0xE5200000, -0x7FF801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table123[] = { -0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x91F80000, -0x91F80000,0x91F80000,0x91F80000,0xB2000001,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1800000,0x1800000,0x1800000,0x1BFC0000,0x73FC0000,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x1A00001,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000, -0x75FC0000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xD0000000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0x75FC0000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xD0000000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xD0000000,0xD0000000,0x7BC0000,0x1A00001,0x1A00001,0x5E40000,0x1FFC0000,0x53FC0000,0x53FC0000,0x8FFC0000,0x5E40000,0x1FFC0000,0xA9FC0000,0xBBFC0000, -0xA9FC0000,0x1D40001,0x1D40001,0x1D40001,0x1D40001,0xC3FC0000,0xC3FC0000,0xC3FC0000,0xE1FC0000,0xE1FC0000,0xEA000000,0xC3FC0000,0xC3FC0000,0xC3FC0000,0xE1FC0000,0xE1FC0000,0xEA000000,0xE1FC0000,0xE1FC0000,0xEA000000,0xEA000000,0xC3FC0000,0xC3FC0000,0xC3FC0000,0xE1FC0000,0xE1FC0000,0xEA000000,0xE1FC0000,0xE1FC0000,0xEA000000,0xEA000000,0xE1FC0000, 
-0xE1FC0000,0xEA000000,0xEA000000,0xEA000000,0x75FC0000,0x3F40000,0x1D40001,0xB5FC0000,0xCFFC0000,0xDBFC0000,0xDFF80000,0xE5F40000,0x9FFC0000,0xC3FC0000,0xDBFC0000,0xEA000000,0xDBFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1B80C63,0xFFB0094E,0xFFA006CB,0xFFA005EA,0xFFA4076D,0xFF940436,0xFF8C0301,0xFF880313,0xFF8401A5,0xF584026D,0xFF980818,0xFF88041D,0xFF800284,0xFF7C0222,0xFF6C001D,0xF56C00F0,0xFF6C0438,0xFB600192,0xF1640165,0xE9680428,0x95FC0C63,0xFF7C07BE,0xFF7005ED,0xFF6404BF,0xFF4C01F6,0xF554026D,0xFF380594,0xFF280163,0xF32400D8,0xE9340428,0xCBFC0C63, -0xFEE005EA,0xF4B4026D,0xE88C0428,0xDA000C65,0xFFA809C9,0xFFAC0B85,0xF5B80BDB,0xFF9806E9,0xFF8C041A,0xFF7C0175,0xFF78007A,0xFF680001,0xFFA409B3,0xFF90067F,0xFF640186,0xF32400D8,0xBFF80C63,0x1D0042B,0xFFC402DB,0xFFC001BE,0xFFBC0152,0xFFC4028B,0xFFB400FA,0xFFB00055,0xFFA800F1,0xFDA0000E,0xF5A000C9,0xBDFC0428,0xFFAC022D,0xFF9C0152,0xFF88018E,0xFF780005, -0xF58000C8,0xDFF80428,0xFF340152,0xF50C00C8,0xE8000428,0xBDFC0428,0xFFAC022D,0xFF9C0152,0xFF88018E,0xFF780005,0xF58000C8,0xDFF80428,0xFF340152,0xF50C00C8,0xE8000428,0xDFF80428,0xFF340152,0xF50C00C8,0xE8000428,0xE8000428,0xFDCC0378,0xFFCC03D3,0xFFCC03E3,0xFFC002B6,0xFFA801D5,0xFF9000C1,0xFF880019,0xFF640000,0xFFC4037B,0xFDBC02B6,0xFF700162,0xF50C00C8, -0xD7FC0428,0x1A005EA,0x1A005EA,0x1A005EA,0x1A005EA,0xFF8C0301,0xFF8C0301,0xFF8C0301,0xFF8401A5,0xFF8401A5,0xEB8401A5,0xFF800284,0xFF800284,0xFF800284,0xFF6C001D,0xFF6C001D,0xED74004C,0xF3680152,0xF3680152,0xE7680029,0xDD680154,0x71FC05EA,0x71FC05EA,0x71FC05EA,0xFF4C01F6,0xFF4C01F6,0xEB5C01A5,0xFF280163,0xFF280163,0xEB300002,0xDD400154,0xB9FC05EA, -0xB9FC05EA,0xEADC01A5,0xDCC00154,0xCE0005ED,0xFB980484,0xF79C0551,0x1A005EA,0xFF900321,0xFF8001C1,0xFF7800C2,0xFF78007A,0xFF680001,0xFF900452,0xFF8002F4,0xFF64016D,0xEB300002,0xA7FC05EA,0x1BC0152,0x1BC0152,0x1BC0152,0x1BC0152,0xFFB00055,0xFFB00055,0xFFB00055,0xF7A40001,0xF7A40001,0xEBA00001,0x9BFC0152,0x9BFC0152,0x9BFC0152,0xFF780005,0xFF780005, -0xEB880000,0xCFF80152,0xCFF80152,0xEB380000,0xDC000154,0x9BFC0152,0x9BFC0152,0x9BFC0152,0xFF780005,0xFF780005,0xEB880000,0xCFF80152,0xCFF80152,0xEB380000,0xDC000154,0xCFF80152,0xCFF80152,0xEB380000,0xDC000154,0xDC000154,0xFBB40109,0xF5B80122,0x1BC0152,0xFFA400DA,0xFF98007D,0xFF8C0041,0xFF880019,0xFF640000,0xFDB00109,0xFFA400C8,0xC1FC0152,0xEB380000, -0xC1FC0152,0x1E800CA,0xFFE0007D,0xFFD80025,0xFFD40001,0xE3FC00C8,0xFFCC0049,0xFFC00001,0xF1F800C8,0xFF840000,0xF40000C8,0xE3FC00C8,0xFFCC0049,0xFFC00001,0xF1F800C8,0xFF840000,0xF40000C8,0xF1F800C8,0xFF840000,0xF40000C8,0xF40000C8,0xE3FC00C8,0xFFCC0049,0xFFC00001,0xF1F800C8,0xFF840000,0xF40000C8,0xF1F800C8,0xFF840000,0xF40000C8,0xF40000C8,0xF1F800C8, -0xFF840000,0xF40000C8,0xF40000C8,0xF40000C8,0xF7E800B5,0x67FC00C8,0xFDE800B5,0xFFD8009D,0xFFD00075,0xFFC0003D,0xFF9C0000,0xFF640000,0xFFDC00B4,0xFFD80095,0xFFB00008,0xF40000C8,0xEDFC00C8,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0x18401A5,0xFF6C000D,0xFF6C000D,0xFF6C000D,0xFF6C000D,0xFF6C000D, 
-0xFF6C000D,0xDF680000,0xDF680000,0xDF680000,0xD1680000,0x47FC01A5,0x47FC01A5,0x47FC01A5,0x47FC01A5,0x47FC01A5,0x47FC01A5,0xED300000,0xED300000,0xED300000,0xD14C0000,0xA5F801A5,0xA5F801A5,0xA5F801A5,0xD0F00000,0xC20001A5,0xF9800139,0x18401A5,0x18401A5,0xFB7C00C8,0xFF780071,0xFF700034,0xFF700034,0xFB680000,0xFF74010D,0xFD7400B5,0xEF600000,0xED300000, -0x8DFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table124[] = { -0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000, -0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1940000,0x1940000,0x1940000,0x37FC0000,0x83FC0000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000, -0x8FFC0000,0xC9F80000,0xC9F80000,0xC9F80000,0xD8000001,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0x8FFC0000,0xC9F80000,0xC9F80000,0xC9F80000,0xD8000001,0xC9F80000,0xC9F80000,0xC9F80000,0xD8000001,0xD8000001,0x1D00000,0x1B40000,0x1B40000,0x1FC0000,0x4BFC0000,0x75FC0000,0x75FC0000,0xA5FC0000,0x1FC0000,0x4BFC0000,0xBBFC0000,0xC9F80000, -0xBBFC0000,0x1E80000,0x1E80000,0x1E80000,0x1E80000,0xDFFC0000,0xDFFC0000,0xDFFC0000,0xEFFC0000,0xEFFC0000,0xF2000001,0xDFFC0000,0xDFFC0000,0xDFFC0000,0xEFFC0000,0xEFFC0000,0xF2000001,0xEFFC0000,0xEFFC0000,0xF2000001,0xF2000001,0xDFFC0000,0xDFFC0000,0xDFFC0000,0xEFFC0000,0xEFFC0000,0xF2000001,0xEFFC0000,0xEFFC0000,0xF2000001,0xF2000001,0xEFFC0000, -0xEFFC0000,0xF2000001,0xF2000001,0xF2000001,0xB5FC0000,0x57FC0000,0x1E80000,0xD7FC0000,0xE5FC0000,0xEBFC0000,0xEDFC0000,0xF1F80000,0xCBFC0000,0xDFFC0000,0xEBFC0000,0xF2000001,0xEBFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1C40A26,0xFFBC0819,0xFFB4067D,0xFFB005ED,0xFFB00662,0xFFA8043D,0xFFA40362,0xFFA002BE,0xFF9801C9,0xF99801E2,0xFFA406D7,0xFF9C0415,0xFF9802FB,0xFF8801E3,0xFF84007A,0xFB840073,0xFF84031B,0xFD780159,0xF77000A2,0xEF7802D3,0xA9FC0A26,0xFF940717,0xFF8C05EA,0xFF7C0432,0xFF70024D,0xF96801E2,0xFF640489,0xFF4C01A8,0xF93C003F,0xEF4802D2,0xD5F80A26, -0xFF1405EA,0xF8D801E2,0xEEAC02D2,0xE2000A26,0xFFBC082A,0xFBC40972,0xFBC409B6,0xFFAC061E,0xFFA003F3,0xFF9001DC,0xFF8C00FA,0xFF7C0029,0xFFB407F7,0xFFA405EA,0xFF7C01C3,0xF93C003F,0xC9FC0A26,0x1DC02D5,0xFFD40228,0xFFCC0194,0xFFCC0154,0xFFD001B5,0xFFC400E4,0xFFC40080,0xFFB8007D,0xFFB40001,0xF9B4003D,0xCFFC02D2,0xFFC001C8,0xFFB80152,0xFFAC00FE,0xFF940025, -0xF994003E,0xE7FC02D2,0xFF6C0152,0xF934003D,0xEE0002D2,0xCFFC02D2,0xFFC001C8,0xFFB80152,0xFFAC00FE,0xFF940025,0xF994003E,0xE7FC02D2,0xFF6C0152,0xF934003D,0xEE0002D2,0xE7FC02D2,0xFF6C0152,0xF934003D,0xEE0002D2,0xEE0002D2,0xFFD80263,0xF7DC02A9,0xF7DC02B4,0xFDD00209,0xFFC0016A,0xFFAC00B1,0xFFA8003D,0xFF8C0019,0xFDD8026D,0xFFC801D6,0xFF98015B,0xF934003D, 
-0xE1FC02D2,0x1B005ED,0x1B005ED,0x1B005ED,0x1B005ED,0xFFA40362,0xFFA40362,0xFFA40362,0xFF9801C9,0xFF9801C9,0xF39801A6,0xFF9802FB,0xFF9802FB,0xFF9802FB,0xFF84007A,0xFF84007A,0xF584004B,0xFB7C0153,0xFB7C0153,0xEF78002A,0xE7780153,0x8DFC05EA,0x8DFC05EA,0x8DFC05EA,0xFF70024D,0xFF70024D,0xF37001A6,0xFF4C01A8,0xFF4C01A8,0xF5440001,0xE7500152,0xC7FC05EA, -0xC7FC05EA,0xF2F001A6,0xE6CC0152,0xD80005EA,0xFFAC04B2,0xFFAC055A,0x1B005ED,0xFFA403A1,0xFF940252,0xFF900163,0xFF8C00FA,0xFF7C0029,0xFFA00491,0xFF980365,0xFF7C01B3,0xF5440001,0xB9FC05EA,0x1CC0154,0x1CC0154,0x1CC0154,0x1CC0154,0xFFC40080,0xFFC40080,0xFFC40080,0xFFB40001,0xFFB40001,0xF3B40001,0xB7FC0152,0xB7FC0152,0xB7FC0152,0xFF940025,0xFF940025, -0xF39C0001,0xDDF40152,0xDDF40152,0xF34C0001,0xE6000152,0xB7FC0152,0xB7FC0152,0xB7FC0152,0xFF940025,0xFF940025,0xF39C0001,0xDDF40152,0xDDF40152,0xF34C0001,0xE6000152,0xDDF40152,0xDDF40152,0xF34C0001,0xE6000152,0xE6000152,0xFDC80120,0xFFCC0120,0x1CC0154,0xFDC000F4,0xFFB000B4,0xFFA8006A,0xFFA8003D,0xFF8C0019,0xFFC40120,0xFBC000F2,0xD3FC0152,0xF34C0001, -0xD3FC0152,0x1F4003D,0xFFEC0028,0xFFEC000D,0xFFE80000,0xEFFC003D,0xFFE40014,0xFFDC0000,0xF7F8003D,0xFFB80000,0xF800003D,0xEFFC003D,0xFFE40014,0xFFDC0000,0xF7F8003D,0xFFB80000,0xF800003D,0xF7F8003D,0xFFB80000,0xF800003D,0xF800003D,0xEFFC003D,0xFFE40014,0xFFDC0000,0xF7F8003D,0xFFB80000,0xF800003D,0xF7F8003D,0xFFB80000,0xF800003D,0xF800003D,0xF7F8003D, -0xFFB80000,0xF800003D,0xF800003D,0xF800003D,0xFBF00034,0xA7FC003D,0xF3F4003D,0xFFEC0029,0xFFE80020,0xFFD40011,0xFFC80000,0xFFA80000,0xFFF00032,0xFFEC0032,0xFFD00001,0xF800003D,0xF5FC003D,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0x19801A5,0xFF800022,0xFF800022,0xFF800022,0xFF800022,0xFF800022, -0xFF800022,0xE77C0001,0xE77C0001,0xE77C0001,0xD9780002,0x63FC01A5,0x63FC01A5,0x63FC01A5,0x63FC01A5,0x63FC01A5,0x63FC01A5,0xF5440000,0xF5440000,0xF5440000,0xD9600001,0xB3F801A5,0xB3F801A5,0xB3F801A5,0xD9080001,0xCA0001A5,0xFF8C0151,0x19801A5,0x19801A5,0xFF9000DD,0xFF8C0091,0xFF880055,0xFF880055,0xFD7C0005,0xFD8C0120,0xFF8000DA,0xF7740001,0xF5440000, -0x9FF801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table125[] = { -0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0xA9FC0000, -0xA9FC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1880001,0x1A40000,0x1A40000,0x1A40000,0x4FFC0000,0x93FC0000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0x1C40000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000, -0xA9FC0000,0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xA9FC0000,0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0xD5F80000,0xD5F80000,0xD5F80000,0xE0000001,0xE0000001,0x9E00000,0x1C40000,0x1C40000,0x35FC0000,0x73FC0000,0x93FC0000,0x93FC0000,0xB9FC0000,0x35FC0000,0x73FC0000,0xC9FC0000,0xD5F80000, -0xC9FC0000,0x1F80000,0x1F80000,0x1F80000,0x1F80000,0xF7FC0000,0xF7FC0000,0xF7FC0000,0xFBFC0000,0xFBFC0000,0xFA000001,0xF7FC0000,0xF7FC0000,0xF7FC0000,0xFBFC0000,0xFBFC0000,0xFA000001,0xFBFC0000,0xFBFC0000,0xFA000001,0xFA000001,0xF7FC0000,0xF7FC0000,0xF7FC0000,0xFBFC0000,0xFBFC0000,0xFA000001,0xFBFC0000,0xFBFC0000,0xFA000001,0xFA000001,0xFBFC0000, 
-0xFBFC0000,0xFA000001,0xFA000001,0xFA000001,0xEDFC0000,0xD7FC0000,0x1F80000,0xF5FC0000,0xF9FC0000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xF3FC0000,0xF7FC0000,0xFBFC0000,0xFA000001,0xFBFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1D0088E,0xFFC80749,0xFFC00651,0xFFC005ED,0xFFC405B2,0xFFBC0463,0xFFB403DD,0xFFAC02BE,0xFFAC0215,0xFDA801AA,0xFFBC05EF,0xFFB0042C,0xFFA40383,0xFFA001F3,0xFF9C010A,0xFD94004F,0xFF94028E,0xFF900163,0xFB84003E,0xF58801FF,0xBBFC088E,0xFFAC069F,0xFFA405EA,0xFF9403FA,0xFF8802BD,0xFD7C01AA,0xFF7C0401,0xFF640218,0xFD540005,0xF55C01FE,0xDDFC088E, -0xFF4805EA,0xFCF801AA,0xF4C401FE,0xE800088E,0xFFCC0739,0xFFCC07F2,0xFFCC0846,0xFFC005B1,0xFFB003FD,0xFFA4024E,0xFFA4019A,0xFF940096,0xFFC4071A,0xFFB40585,0xFF94023D,0xFD540005,0xD5FC088E,0x1E801FD,0xFDE401B5,0xFFE0016D,0xFFDC0154,0xFFDC0149,0xFFD400EA,0xFFD000B4,0xFFCC0069,0xFFC80028,0xFDC40005,0xDFFC01FD,0xFFD80188,0xFFCC0154,0xFFB800D2,0xFFB80059, -0xFDA80006,0xEFFC01FD,0xFF9C0152,0xFD540005,0xF40001FE,0xDFFC01FD,0xFFD80188,0xFFCC0154,0xFFB800D2,0xFFB80059,0xFDA80006,0xEFFC01FD,0xFF9C0152,0xFD540005,0xF40001FE,0xEFFC01FD,0xFF9C0152,0xFD540005,0xF40001FE,0xF40001FE,0xFFE401C5,0xFDE801D9,0xFDE801E8,0xFFDC0179,0xFFD40132,0xFFC800CE,0xFFBC007D,0xFFA80049,0xFFE001CA,0xFFDC0172,0xFFB80156,0xFD540005, -0xEBFC01FD,0x1C005ED,0x1C005ED,0x1C005ED,0x1C005ED,0xFFB403DD,0xFFB403DD,0xFFB403DD,0xFFAC0215,0xFFAC0215,0xFBA801A6,0xFFA40383,0xFFA40383,0xFFA40383,0xFF9C010A,0xFF9C010A,0xFD94004B,0xFF900163,0xFF900163,0xF788002A,0xEF880153,0xA5FC05EA,0xA5FC05EA,0xA5FC05EA,0xFF8802BD,0xFF8802BD,0xFB8001A6,0xFF640218,0xFF640218,0xFD540001,0xEF600152,0xD3FC05EA, -0xD3FC05EA,0xFB0001A6,0xEEDC0152,0xE00005EA,0xFFBC04E6,0xF9C00581,0x1C005ED,0xFFB40402,0xFFAC02EA,0xFFA401FD,0xFFA4019A,0xFF940096,0xFFB404BE,0xFFB003CE,0xFF940234,0xFD540001,0xC7FC05EA,0x1DC0154,0x1DC0154,0x1DC0154,0x1DC0154,0xFFD000B4,0xFFD000B4,0xFFD000B4,0xFFC80028,0xFFC80028,0xFBC40001,0xCFFC0152,0xCFFC0152,0xCFFC0152,0xFFB80059,0xFFB80059, -0xFBAC0001,0xE7FC0152,0xE7FC0152,0xFB5C0001,0xEE000152,0xCFFC0152,0xCFFC0152,0xCFFC0152,0xFFB80059,0xFFB80059,0xFBAC0001,0xE7FC0152,0xE7FC0152,0xFB5C0001,0xEE000152,0xE7FC0152,0xE7FC0152,0xFB5C0001,0xEE000152,0xEE000152,0xFFD80122,0xF7DC0139,0x1DC0154,0xFDD80109,0xFFC800DA,0xFFBC00A9,0xFFBC007D,0xFFA80049,0xFFD00132,0xFBD40109,0xE1FC0152,0xFB5C0001, -0xE1FC0152,0x1FC0005,0xFFF80004,0xFFF80001,0xFFF80000,0xFBFC0005,0xFFF80002,0xFFF40000,0xFDF80005,0xFFEC0000,0xFC000005,0xFBFC0005,0xFFF80002,0xFFF40000,0xFDF80005,0xFFEC0000,0xFC000005,0xFDF80005,0xFFEC0000,0xFC000005,0xFC000005,0xFBFC0005,0xFFF80002,0xFFF40000,0xFDF80005,0xFFEC0000,0xFC000005,0xFDF80005,0xFFEC0000,0xFC000005,0xFC000005,0xFDF80005, -0xFFEC0000,0xFC000005,0xFC000005,0xFC000005,0xFFF80004,0xE7FC0005,0xF7FC0005,0xFBFC0005,0xFFF80002,0xFFF00001,0xFFF00000,0xFFE40000,0xF9FC0005,0xFBFC0005,0xFFF00000,0xFC000005,0xFDF80005,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0x1A801A5,0xFF900049,0xFF900049,0xFF900049,0xFF900049,0xFF900049, 
-0xFF900049,0xEF8C0001,0xEF8C0001,0xEF8C0001,0xE1880002,0x7BFC01A5,0x7BFC01A5,0x7BFC01A5,0x7BFC01A5,0x7BFC01A5,0x7BFC01A5,0xFD540000,0xFD540000,0xFD540000,0xE1700001,0xBFF801A5,0xBFF801A5,0xBFF801A5,0xE1180001,0xD20001A5,0xFBA40152,0x1A801A5,0x1A801A5,0xFFA000F2,0xFD9C00B5,0xFF980071,0xFF980071,0xFD900019,0xFD9C0139,0xFF9400E9,0xFF840001,0xFD540000, -0xADFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table126[] = { -0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0x69FC0000,0xB5FC0000, -0xB5FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x1980001,0x3B40000,0x3B40000,0x3B40000,0x69FC0000,0xA1FC0000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0x1D40000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000, -0xC1FC0000,0xE1F80000,0xE1F80000,0xE1F80000,0xE8000001,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xE1F80000,0xE1F80000,0xE1F80000,0xE8000001,0xE1F80000,0xE1F80000,0xE1F80000,0xE8000001,0xE8000001,0x1F40000,0x1D40000,0x1D40000,0x6DFC0000,0x9BFC0000,0xB1FC0000,0xB1FC0000,0xCDFC0000,0x6DFC0000,0x9BFC0000,0xD9FC0000,0xE1F80000, -0xD9FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1DC0693,0xFFD405C6,0xFFD0052E,0xFFD004EE,0xFFD0049F,0xFFC403C6,0xFFC40362,0xFFC00295,0xFFB8021E,0xFFB801A5,0xFFC4048F,0xFFBC037F,0xFFBC0306,0xFFAC01D4,0xFFAC012B,0xFFA4006A,0xFFAC01E3,0xFFA0010E,0xFF980011,0xF798012A,0xC9FC0691,0xFFB80566,0xFFB804ED,0xFFAC0367,0xFFA0029E,0xFF9401A5,0xFF940316,0xFF8801D1,0xFF700008,0xF7700129,0xE5F80691, -0xFF6C04ED,0xFF2001A5,0xF6EC0129,0xEC000691,0xFFD805AB,0xF5D8064D,0xF5D80672,0xFFCC0496,0xFFC00366,0xFFB8022B,0xFFB401AE,0xFFA800C3,0xFFD00576,0xFFC4045F,0xFFB001FA,0xFF700008,0xDFF80691,0x1F00126,0xFFEC0105,0xFFEC00EA,0xFFE800DD,0xFFE800C6,0xFFE000A1,0xFFE00088,0xFFD8004C,0xFFD80028,0xFFD40000,0xEBFC0126,0xFFE400F1,0xFFDC00DD,0xFFD80088,0xFFCC0050, -0xFFC00000,0xF5FC0126,0xFFB800DD,0xFF7C0000,0xF6000129,0xEBFC0126,0xFFE400F1,0xFFDC00DD,0xFFD80088,0xFFCC0050,0xFFC00000,0xF5FC0126,0xFFB800DD,0xFF7C0000,0xF6000129,0xF5FC0126,0xFFB800DD,0xFF7C0000,0xF6000129,0xF6000129,0xFDF0010C,0xFFEC0110,0xFFEC0121,0xFFEC00F3,0xFFDC00C6,0xFFD80092,0xFFD00061,0xFFC00049,0xFFEC010B,0xFFE800E5,0xFFD000DE,0xFF7C0000, 
-0xF3FC0126,0x1D004EE,0x1D004EE,0x1D004EE,0x1D004EE,0xFFC40362,0xFFC40362,0xFFC40362,0xFFB8021E,0xFFB8021E,0xFFB801A5,0xFFBC0306,0xFFBC0306,0xFFBC0306,0xFFAC012B,0xFFAC012B,0xFFA4006A,0xFFA0010E,0xFFA0010E,0xFD98000E,0xF59800DE,0xB7FC04ED,0xB7FC04ED,0xB7FC04ED,0xFFA0029E,0xFFA0029E,0xFF9401A5,0xFF8801D1,0xFF8801D1,0xFF700008,0xF57400DD,0xDDF404ED, -0xDDF404ED,0xFF2001A5,0xF4F400DD,0xE60004ED,0xFFCC042E,0xFFCC0492,0x1D004EE,0xFFC40382,0xFFC002BD,0xFFB401EE,0xFFB401AE,0xFFA800C3,0xFBC8042D,0xFFBC0366,0xFFA801F5,0xFF700008,0xD3FC04ED,0x1E800DD,0x1E800DD,0x1E800DD,0x1E800DD,0xFFE00088,0xFFE00088,0xFFE00088,0xFFD80028,0xFFD80028,0xFFD40000,0xDFFC00DD,0xDFFC00DD,0xDFFC00DD,0xFFCC0050,0xFFCC0050, -0xFFC00000,0xEFFC00DD,0xEFFC00DD,0xFF7C0000,0xF40000DD,0xDFFC00DD,0xDFFC00DD,0xDFFC00DD,0xFFCC0050,0xFFCC0050,0xFFC00000,0xEFFC00DD,0xEFFC00DD,0xFF7C0000,0xF40000DD,0xEFFC00DD,0xEFFC00DD,0xFF7C0000,0xF40000DD,0xF40000DD,0xF9E800C8,0xFDE800C8,0x1E800DD,0xFFDC00B4,0xFFDC0095,0xFFD80082,0xFFD00061,0xFFC00049,0xF7E800C8,0xFFDC00AA,0xEBFC00DD,0xFF7C0000, -0xEBFC00DD,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0x1B801A5,0xFFA4006A,0xFFA4006A,0xFFA4006A,0xFFA4006A,0xFFA4006A, -0xFFA4006A,0xF79C0001,0xF79C0001,0xF79C0001,0xE9980002,0x93FC01A5,0x93FC01A5,0x93FC01A5,0x93FC01A5,0x93FC01A5,0x93FC01A5,0xFF700008,0xFF700008,0xFF700008,0xE9800001,0xCBF801A5,0xCBF801A5,0xCBF801A5,0xE9280001,0xDA0001A5,0xF3B4016D,0x1B801A5,0x1B801A5,0xFFAC0115,0xFFA800DA,0xFFAC00A2,0xFFAC00A2,0xFFA0003A,0xF9B00152,0xFBAC0109,0xFF98000D,0xFF700008, -0xBDF801A5,}; -static const uint32_t g_etc1_to_bc7_m6_table127[] = { -0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0xC1FC0000, -0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0xBC40000,0xBC40000,0xBC40000,0x81FC0000,0xB1FC0000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000, -0xD9FC0000,0xEDF80000,0xEDF80000,0xEDF80000,0xF0000001,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xEDF80000,0xEDF80000,0xEDF80000,0xF0000001,0xEDF80000,0xEDF80000,0xEDF80000,0xF0000001,0xF0000001,0x37FC0000,0x1E40000,0x1E40000,0xA7FC0000,0xC1FC0000,0xCFFC0000,0xCFFC0000,0xE1FC0000,0xA7FC0000,0xC1FC0000,0xE7FC0000,0xEDF80000, -0xE7FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1E4049B,0xFFDC041B,0xFFD803C2,0xFFD8039E,0xFFDC0373,0xFFD002EE,0xFFD002AE,0xFFCC0231,0xFFCC01F1,0xFFC801A5,0xFFD0032F,0xFFD0028E,0xFFC40236,0xFFC00181,0xFFBC011B,0xFFBC00A2,0xFFB80127,0xFFAC00AA,0xFFAC0001,0xFBA80072,0xD5FC0499,0xFFCC03E2,0xFFC8039D,0xFFB802AF,0xFFB80236,0xFFAC01A5,0xFFAC0216,0xFFA00149,0xFF880028,0xFB840071,0xEBF80499, -0xFF90039D,0xFF5401A5,0xFB0C0071,0xF0000499,0xFFD8040B,0xF9E00465,0xF9E00482,0xFFD40343,0xFFCC0297,0xFFC401D1,0xFFBC0173,0xFFB800C2,0xFFD803EE,0xFFD0031A,0xFFB8015E,0xFF880028,0xE5FC0499,0x1F40072,0xFFF40062,0xFFF00059,0xFFF00055,0xFFF40052,0xFFEC003D,0xFFEC0034,0xFFE40020,0xFFE40010,0xFFE40000,0xF5FC0071,0xFFF0005D,0xFFE80055,0xFFE40030,0xFFE40020, -0xFFD80000,0xF9FC0071,0xFFD40055,0xFFAC0000,0xFA000071,0xF5FC0071,0xFFF0005D,0xFFE80055,0xFFE40030,0xFFE40020,0xFFD80000,0xF9FC0071,0xFFD40055,0xFFAC0000,0xFA000071,0xF9FC0071,0xFFD40055,0xFFAC0000,0xFA000071,0xFA000071,0xFFF40060,0xF3F40072,0xF3F40072,0xFFF0005A,0xFFF0004A,0xFFE40036,0xFFE40022,0xFFD8001D,0xFDF40060,0xFFF0005A,0xFFE00056,0xFFAC0000, -0xF9FC0071,0x1D8039E,0x1D8039E,0x1D8039E,0x1D8039E,0xFFD002AE,0xFFD002AE,0xFFD002AE,0xFFCC01F1,0xFFCC01F1,0xFFC801A5,0xFFC40236,0xFFC40236,0xFFC40236,0xFFBC011B,0xFFBC011B,0xFFBC00A2,0xFFAC00AA,0xFFAC00AA,0xFFAC0001,0xF9A80056,0xC9FC039D,0xC9FC039D,0xC9FC039D,0xFFB80236,0xFFB80236,0xFFAC01A5,0xFFA00149,0xFFA00149,0xFF880028,0xF9880055,0xE5F8039D, -0xE5F8039D,0xFF5401A5,0xF9140055,0xEC00039D,0xFFD8032A,0xF5D80379,0x1D8039E,0xFFCC02A5,0xFFC4022E,0xFFBC01AB,0xFFBC0173,0xFFB800C2,0xFFD00305,0xFFD0028A,0xFFB8015A,0xFF880028,0xDFF8039D,0x1F00055,0x1F00055,0x1F00055,0x1F00055,0xFFEC0034,0xFFEC0034,0xFFEC0034,0xFFE40010,0xFFE40010,0xFFE40000,0xEBFC0055,0xEBFC0055,0xEBFC0055,0xFFE40020,0xFFE40020, -0xFFD80000,0xF5FC0055,0xF5FC0055,0xFFAC0000,0xF8000055,0xEBFC0055,0xEBFC0055,0xEBFC0055,0xFFE40020,0xFFE40020,0xFFD80000,0xF5FC0055,0xF5FC0055,0xFFAC0000,0xF8000055,0xF5FC0055,0xF5FC0055,0xFFAC0000,0xF8000055,0xF8000055,0xFDF00048,0xFFEC0050,0x1F00055,0xFBF00048,0xFDEC003D,0xFFE4002D,0xFFE40022,0xFFD8001D,0xFBF00048,0xFFE80041,0xF3FC0055,0xFFAC0000, -0xF3FC0055,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0x1C801A5,0xFFBC00A2,0xFFBC00A2,0xFFBC00A2,0xFFBC00A2,0xFFBC00A2, 
-0xFFBC00A2,0xFFAC0001,0xFFAC0001,0xFFAC0001,0xF1A80002,0xABFC01A5,0xABFC01A5,0xABFC01A5,0xABFC01A5,0xABFC01A5,0xABFC01A5,0xFF880028,0xFF880028,0xFF880028,0xF1900001,0xD7F801A5,0xD7F801A5,0xD7F801A5,0xF1380001,0xE20001A5,0xFBC4016D,0x1C801A5,0x1C801A5,0xFFBC0132,0xFFBC00FA,0xFFBC00CA,0xFFBC00CA,0xFFB4006A,0xFFBC015A,0xFFBC0122,0xFFB00032,0xFF880028, -0xCBFC01A5,}; -static const uint32_t g_etc1_to_bc7_m6_table128[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x240000,0x240000,0x240000,0x240000,0x340000,0x340000,0x340000,0x680000,0x680000,0x10000001,0x340000,0x340000,0x340000,0x680000,0x680000,0x10000001,0x680000,0x680000,0x10000001,0x10000001,0x340000,0x340000,0x340000,0x680000,0x680000,0x10000001,0x680000,0x680000,0x10000001,0x10000001,0x680000, -0x680000,0x10000001,0x10000001,0x10000001,0x2280000,0xA240000,0x240000,0x300000,0x3C0000,0x4C0000,0x540000,0x800000,0x2C0000,0x340000,0x4C0000,0x10000001,0x4C0000,0x780000,0x2B00000,0x1680000,0x3A000001,0x2B00000,0x1680000,0x3A000001,0x1680000,0x3A000001,0x3A000001,0x2B00000,0x1680000,0x3A000001,0x1680000,0x3A000001, -0x3A000001,0x1680000,0x3A000001,0x3A000001,0x3A000001,0x2B00000,0x1680000,0x3A000001,0x1680000,0x3A000001,0x3A000001,0x1680000,0x3A000001,0x3A000001,0x3A000001,0x1680000,0x3A000001,0x3A000001,0x3A000001,0x3A000001,0x2940000,0x800000,0x800000,0xC80000,0x1240000,0xBF40000,0x3A000001,0x3A000001,0xA40000,0xE00000,0x2FF00000,0x3A000001, -0xFC0000,0x280964,0xAA0C00D8,0x560C00D8,0x3A0C00D9,0x76000372,0x52000061,0x3A000002,0x38000374,0x32000151,0x26000372,0x4E000768,0x460002BA,0x34000153,0x340004A5,0x2E00024E,0x2400040A,0x26000768,0x2600049D,0x2000059E,0x1800076B,0x380964,0x3A000463,0x2E000274,0x2E000596,0x2800031F,0x22000484,0x220007ED,0x24000543,0x1C000607,0x180007AB,0x700964, -0x1C000694,0x1C00070C,0x16000864,0x12000964,0xE4000231,0xF41805B2,0xF820054D,0x74000239,0x4A000252,0x3C00023D,0x3000019B,0x2A000306,0x90000413,0x5E000317,0x2E0003AB,0x1C000607,0x500964,0x340768,0xA01400A4,0x541000A4,0x3A1000A5,0x76000372,0x52000061,0x3A000002,0x38000374,0x32000151,0x26000372,0x4C0768,0x460002BA,0x34000153,0x340004A5,0x2E00024E, -0x2400040A,0x980768,0x2600049D,0x2000059E,0x1800076B,0x4C0768,0x460002BA,0x34000153,0x340004A5,0x2E00024E,0x2400040A,0x980768,0x2600049D,0x2000059E,0x1800076B,0x980768,0x2600049D,0x2000059E,0x1800076B,0x1800076B,0xE4000231,0xF82004F2,0xFC280405,0x74000239,0x4A000252,0x3C00023D,0x3000019B,0x2A000306,0x900003C2,0x5E0002F3,0x2E0003A2,0x2000059E, -0x6C0768,0xC00D8,0xC00D8,0xC00D8,0xC00D8,0x36000000,0x36000000,0x36000000,0x1A000000,0x1A000000,0x10000001,0x1A0000A2,0x1A0000A2,0x1A0000A2,0x1200003D,0x1200003D,0xE000022,0xC0000A2,0xC0000A2,0xA000062,0x80000A2,0x1000D8,0x1000D8,0x1000D8,0x12000061,0x12000061,0xC000039,0xC0000B2,0xC0000B2,0xA000072,0x80000AB,0x2000D8, -0x2000D8,0xA000093,0x60000C3,0x40000DB,0x76000032,0xF8000004,0xC00D8,0x32000041,0x1E00003D,0x1A000041,0x16000034,0x12000050,0x3400006B,0x2600005D,0x100000A3,0xA000072,0x1800D8,0x1000A4,0x1000A4,0x1000A4,0x1000A4,0x36000000,0x36000000,0x36000000,0x1A000000,0x1A000000,0x10000001,0x21800A2,0x21800A2,0x21800A2,0x1200003D,0x1200003D, 
-0xE000022,0x3000A2,0x3000A2,0xA000062,0x80000A2,0x21800A2,0x21800A2,0x21800A2,0x1200003D,0x1200003D,0xE000022,0x3000A2,0x3000A2,0xA000062,0x80000A2,0x3000A2,0x3000A2,0xA000062,0x80000A2,0x80000A2,0x76000032,0xF8000004,0x1000A4,0x32000041,0x1E00003D,0x1A000041,0x16000034,0x12000050,0x42000061,0x26000059,0x2400A2,0xA000062, -0x2400A2,0x4C0374,0x90240000,0x50240000,0x3A240001,0x740372,0x52000061,0x3A000002,0xE80372,0x32000151,0x26000372,0x740372,0x52000061,0x3A000002,0xE80372,0x32000151,0x26000372,0xE80372,0x32000151,0x26000372,0x26000372,0x740372,0x52000061,0x3A000002,0xE80372,0x32000151,0x26000372,0xE80372,0x32000151,0x26000372,0x26000372,0xE80372, -0x32000151,0x26000372,0x26000372,0x26000372,0xF4000164,0x540372,0xF8400188,0x7C000132,0x56000161,0x3C00013D,0x340000DA,0x320001BA,0xB20001D4,0x68000179,0x360000B9,0x26000372,0xA40372,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table129[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x340000,0x340000,0x340000,0x340000,0x4C0000,0x4C0000,0x4C0000,0x980000,0x980000,0x18000001,0x4C0000,0x4C0000,0x4C0000,0x980000,0x980000,0x18000001,0x980000,0x980000,0x18000001,0x18000001,0x4C0000,0x4C0000,0x4C0000,0x980000,0x980000,0x18000001,0x980000,0x980000,0x18000001,0x18000001,0x980000, -0x980000,0x18000001,0x18000001,0x18000001,0x3C0000,0x380000,0x340000,0x2440000,0x2540000,0x6C0000,0x7C0000,0xBC0000,0x400000,0x4C0000,0x6C0000,0x18000001,0x6C0000,0x880000,0xC80000,0x1980000,0x42000001,0xC80000,0x1980000,0x42000001,0x1980000,0x42000001,0x42000001,0xC80000,0x1980000,0x42000001,0x1980000,0x42000001, -0x42000001,0x1980000,0x42000001,0x42000001,0x42000001,0xC80000,0x1980000,0x42000001,0x1980000,0x42000001,0x42000001,0x1980000,0x42000001,0x42000001,0x42000001,0x1980000,0x42000001,0x42000001,0x42000001,0x42000001,0x2A80000,0x900000,0x900000,0xE40000,0x14C0000,0x15F40000,0x42000001,0x42000001,0xB80000,0x1000000,0x39C40000,0x42000001, -0x1200000,0x300C14,0xBE1001C4,0x601001C4,0x421001C5,0x8E000372,0x62000025,0x4400000A,0x44000374,0x38000109,0x2E000372,0x5C000933,0x5200036A,0x400001DB,0x3A00053D,0x34000266,0x2E000453,0x2E000934,0x2C0005A5,0x26000696,0x1E000933,0x440C14,0x460005BB,0x3A000354,0x340006A6,0x34000387,0x2800050C,0x280009FD,0x2C000686,0x2400073B,0x1E000997,0x880C14, -0x26000891,0x200008C2,0x1C000AAC,0x16000C14,0xFE000264,0xF8200772,0xFC280785,0x8800025D,0x5A000281,0x44000266,0x3C000195,0x36000351,0xB20004DE,0x68000393,0x3600045B,0x2400073B,0x600C14,0x400934,0xB21C0154,0x5E180154,0x42180155,0x8E000372,0x62000025,0x44040009,0x44000374,0x38000109,0x2E000372,0x5C0933,0x5200036A,0x400001DB,0x3A00053D,0x34000266, -0x2E000453,0xB80933,0x2C0005A5,0x26000696,0x1E000933,0x5C0933,0x5200036A,0x400001DB,0x3A00053D,0x34000266,0x2E000453,0xB80933,0x2C0005A5,0x26000696,0x1E000933,0xB80933,0x2C0005A5,0x26000696,0x1E000933,0x1E000933,0xFE000264,0xFE2C061E,0xF23405B4,0x8800025D,0x5A000281,0x44000266,0x3C000195,0x36000351,0xB2000465,0x68000362,0x36000452,0x26000696, 
-0x800933,0x1001C4,0x1001C4,0x1001C4,0x1001C4,0x4E000000,0x4E000000,0x4E000000,0x26000000,0x26000000,0x18000001,0x26000152,0x26000152,0x26000152,0x1E00007D,0x1E00007D,0x18000041,0x12000152,0x12000152,0x100000CA,0xC000152,0x21801C3,0x21801C3,0x21801C3,0x180000D1,0x180000D1,0x12000079,0x12000176,0x12000176,0x100000EE,0xC000162,0x3001C3, -0x3001C3,0xE000141,0xA00018B,0x80001C3,0x9200006A,0xFC080044,0x1001C4,0x50000089,0x3200007D,0x2600007D,0x2200006A,0x1A00009D,0x560000E9,0x320000BE,0x16000155,0x100000EE,0x2401C3,0x180154,0x180154,0x180154,0x180154,0x4E000000,0x4E000000,0x4E000000,0x26000000,0x26000000,0x18000001,0x2240152,0x2240152,0x2240152,0x1E00007D,0x1E00007D, -0x18000041,0x4C0152,0x4C0152,0x100000CA,0xC000152,0x2240152,0x2240152,0x2240152,0x1E00007D,0x1E00007D,0x18000041,0x4C0152,0x4C0152,0x100000CA,0xC000152,0x4C0152,0x4C0152,0x100000CA,0xC000152,0xC000152,0x9200006A,0xFC080034,0x180154,0x50000089,0x3200007D,0x2600007D,0x2200006A,0x1A00009D,0x560000D0,0x3C0000B4,0x340152,0x100000CA, -0x340152,0x5C0374,0x98340000,0x58340000,0x42340001,0x8C0372,0x62000025,0x420C0001,0x1180372,0x38000109,0x2E000372,0x8C0372,0x62000025,0x420C0001,0x1180372,0x38000109,0x2E000372,0x1180372,0x38000109,0x2E000372,0x2E000372,0x8C0372,0x62000025,0x420C0001,0x1180372,0x38000109,0x2E000372,0x1180372,0x38000109,0x2E000372,0x2E000372,0x1180372, -0x38000109,0x2E000372,0x2E000372,0x2E000372,0xFC080152,0x640372,0xFE4C0190,0x920000E9,0x6000010D,0x4A000104,0x3E000092,0x3600016D,0xD0000190,0x7A000128,0x4200007D,0x2E000372,0xC80372,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table130[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x440000,0x440000,0x440000,0x440000,0x640000,0x640000,0x640000,0xCC0000,0xCC0000,0x20000001,0x640000,0x640000,0x640000,0xCC0000,0xCC0000,0x20000001,0xCC0000,0xCC0000,0x20000001,0x20000001,0x640000,0x640000,0x640000,0xCC0000,0xCC0000,0x20000001,0xCC0000,0xCC0000,0x20000001,0x20000001,0xCC0000, -0xCC0000,0x20000001,0x20000001,0x20000001,0x64C0000,0x480000,0x440000,0x5C0000,0x700000,0x900000,0xA40000,0xF80000,0x540000,0x640000,0x900000,0x20000001,0x900000,0x980000,0xE00000,0x1CC0000,0x4A000001,0xE00000,0x1CC0000,0x4A000001,0x1CC0000,0x4A000001,0x4A000001,0xE00000,0x1CC0000,0x4A000001,0x1CC0000,0x4A000001, -0x4A000001,0x1CC0000,0x4A000001,0x4A000001,0x4A000001,0xE00000,0x1CC0000,0x4A000001,0x1CC0000,0x4A000001,0x4A000001,0x1CC0000,0x4A000001,0x4A000001,0x4A000001,0x1CC0000,0x4A000001,0x4A000001,0x4A000001,0x4A000001,0x2BC0000,0x8A00000,0x8A00000,0x1000000,0x1740000,0x1FF80000,0x4A000001,0x4A000001,0x2CC0000,0x11C0000,0x41D40000,0x4A000001, -0x1400000,0x380F44,0xCE180304,0x6A180304,0x4A180305,0xA6000372,0x6E000005,0x4C04003A,0x50000374,0x440000C1,0x36000372,0x70000B53,0x62000455,0x460002A3,0x460005E5,0x40000296,0x3400049B,0x36000B53,0x320006F5,0x2C0007BE,0x24000B53,0x500F44,0x4C000773,0x40000494,0x400007D6,0x3A000417,0x2E0005B4,0x2E000C75,0x32000816,0x2A00089F,0x22000BD4,0xA00F44, 
-0x2C000AD9,0x26000ABA,0x20000D5F,0x1A000F44,0xFE000344,0xFC2809B2,0xFE2C0A49,0x9E00029D,0x6C0002C1,0x4C0002A2,0x440001A6,0x3A0003B2,0xD00005C2,0x7C000403,0x3E000573,0x2A00089F,0x700F44,0x480B54,0xC2240244,0x68200244,0x4A200245,0xA6000372,0x6E000005,0x4E080032,0x50000374,0x440000C1,0x36000372,0x6C0B53,0x62000455,0x460002A3,0x460005E5,0x40000296, -0x3400049B,0xDC0B53,0x320006F5,0x2C0007BE,0x24000B53,0x6C0B53,0x62000455,0x460002A3,0x460005E5,0x40000296,0x3400049B,0xDC0B53,0x320006F5,0x2C0007BE,0x24000B53,0xDC0B53,0x320006F5,0x2C0007BE,0x24000B53,0x24000B53,0xFE000344,0xF43807D4,0xF8400788,0x9E00029D,0x6C0002C1,0x4C0002A2,0x440001A6,0x3A0003B2,0xD0000519,0x7C0003C3,0x3E000563,0x2C0007BE, -0x9C0B53,0x180304,0x180304,0x180304,0x180304,0x66000000,0x66000000,0x66000000,0x32000000,0x32000000,0x20000001,0x32000242,0x32000242,0x32000242,0x280000CD,0x280000CD,0x1E00006D,0x18000242,0x18000242,0x1600015A,0x10000242,0x200303,0x200303,0x200303,0x22000156,0x22000156,0x180000D1,0x18000282,0x18000282,0x1600019A,0xE000263,0x3C0303, -0x3C0303,0x10000213,0xE0002AE,0xA000303,0xC40000B4,0xFE0C00D8,0x180304,0x640000F5,0x460000DD,0x2E0000E1,0x2E0000B4,0x22000104,0x64000191,0x5000014A,0x1E000248,0x1600019A,0x2C0303,0x200244,0x200244,0x200244,0x200244,0x66000000,0x66000000,0x66000000,0x32000000,0x32000000,0x20000001,0x2300242,0x2300242,0x2300242,0x280000CD,0x280000CD, -0x1E00006D,0x640242,0x640242,0x1600015A,0x10000242,0x2300242,0x2300242,0x2300242,0x280000CD,0x280000CD,0x1E00006D,0x640242,0x640242,0x1600015A,0x10000242,0x640242,0x640242,0x1600015A,0x10000242,0x10000242,0xC40000B4,0xFE0C00B4,0x200244,0x640000F5,0x460000DD,0x2E0000E1,0x2E0000B4,0x22000104,0x74000164,0x50000131,0x480242,0x1600015A, -0x480242,0x6C0374,0xA0440000,0x60440000,0x4A440001,0xA40372,0x6E000005,0x4A1C0001,0x14C0372,0x440000C1,0x36000372,0xA40372,0x6E000005,0x4A1C0001,0x14C0372,0x440000C1,0x36000372,0x14C0372,0x440000C1,0x36000372,0x36000372,0xA40372,0x6E000005,0x4A1C0001,0x14C0372,0x440000C1,0x36000372,0x14C0372,0x440000C1,0x36000372,0x36000372,0x14C0372, -0x440000C1,0x36000372,0x36000372,0x36000372,0xFE140164,0x2740372,0xF86001A5,0xA60000A4,0x6C0000DD,0x540000B9,0x46000059,0x42000132,0xF200015A,0x8C0000F4,0x4C000041,0x36000372,0xE80372,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table131[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0x100000,0x100000,0x100000,0x100000,0x100000, -0x100000,0x200000,0x200000,0x200000,0x4000001,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x200000,0x200000,0x200000,0x4000001,0x200000,0x200000,0x200000,0x4000001,0x4000001,0xC0000,0xC0000,0xC0000,0x20C0000,0x40C0000,0x100000,0x100000,0x140000,0x20C0000,0x40C0000,0x180000,0x200000, -0x180000,0x540000,0x540000,0x540000,0x540000,0x7C0000,0x7C0000,0x7C0000,0xFC0000,0xFC0000,0x28000001,0x7C0000,0x7C0000,0x7C0000,0xFC0000,0xFC0000,0x28000001,0xFC0000,0xFC0000,0x28000001,0x28000001,0x7C0000,0x7C0000,0x7C0000,0xFC0000,0xFC0000,0x28000001,0xFC0000,0xFC0000,0x28000001,0x28000001,0xFC0000, 
-0xFC0000,0x28000001,0x28000001,0x28000001,0x2600000,0x4580000,0x540000,0x2700000,0x8C0000,0xB00000,0xCC0000,0x1340000,0x680000,0x7C0000,0xB00000,0x28000001,0xB00000,0xA80000,0xF80000,0x1FC0000,0x52000001,0xF80000,0x1FC0000,0x52000001,0x1FC0000,0x52000001,0x52000001,0xF80000,0x1FC0000,0x52000001,0x1FC0000,0x52000001, -0x52000001,0x1FC0000,0x52000001,0x52000001,0x52000001,0xF80000,0x1FC0000,0x52000001,0x1FC0000,0x52000001,0x52000001,0x1FC0000,0x52000001,0x52000001,0x52000001,0x1FC0000,0x52000001,0x52000001,0x52000001,0x52000001,0x4D00000,0xB40000,0xB40000,0x1180000,0x19C0000,0x29F80000,0x52000001,0x52000001,0xE40000,0x13C0000,0x49E40000,0x52000001, -0x1640000,0x401198,0xDE200408,0x74200408,0x52200409,0xB6080386,0x78080015,0x560C007E,0x5C040386,0x4C0400A9,0x3E040386,0x84000BE8,0x680003F9,0x5200029B,0x5200058D,0x460001EE,0x3A00043B,0x40000BE8,0x3E0006C9,0x320007A2,0x2A000BEB,0x601194,0x62000829,0x4C00052C,0x4C00083E,0x460003FF,0x3A0005A4,0x3A000D81,0x38000852,0x320008C3,0x28000CAC,0xC41194, -0x32000C15,0x2C000BBE,0x26000ED7,0x20001194,0xFE0C03FE,0xFE2C0B9E,0xF63C0C94,0xB40001F1,0x76000211,0x5A0001E1,0x4C000105,0x4200030B,0xF200058B,0x90000386,0x46000541,0x320008C3,0x8C1194,0x580BE8,0xCE300288,0x70300288,0x52300289,0xB20C0372,0x760C0005,0x54140042,0x5A0C0372,0x4C0400A5,0x3E0C0372,0x2800BE8,0x680003F9,0x5200029B,0x5200058D,0x460001EE, -0x3A00043B,0x1080BE8,0x3E0006C9,0x320007A2,0x2A000BEB,0x2800BE8,0x680003F9,0x5200029B,0x5200058D,0x460001EE,0x3A00043B,0x1080BE8,0x3E0006C9,0x320007A2,0x2A000BEB,0x1080BE8,0x3E0006C9,0x320007A2,0x2A000BEB,0x2A000BEB,0xFE1403AD,0xFA440844,0xFE4C0810,0xB40001F1,0x76000211,0x5A0001E1,0x4C000105,0x4200030B,0xF20004AA,0x90000335,0x46000531,0x320007A2, -0xB80BE8,0x200408,0x200408,0x200408,0x200408,0x76080014,0x76080014,0x76080014,0x3C040014,0x3C040014,0x28040015,0x48000288,0x48000288,0x48000288,0x3400009D,0x3400009D,0x28000032,0x22000288,0x22000288,0x20000151,0x1600028A,0x300408,0x300408,0x300408,0x2E000196,0x2E000196,0x220000C3,0x1E0002FE,0x1E0002FE,0x1C0001B6,0x160002CA,0x5C0408, -0x5C0408,0x1600029B,0x1400035E,0xE00040B,0xF6000082,0xF2140195,0x200408,0x820000C1,0x5A0000A9,0x440000A9,0x40000082,0x2E0000DA,0x960001AB,0x5A000144,0x2C000291,0x1C0001B6,0x400408,0x300288,0x300288,0x300288,0x300288,0x720C0000,0x720C0000,0x720C0000,0x3A0C0000,0x3A0C0000,0x280C0001,0x2440288,0x2440288,0x2440288,0x3400009D,0x3400009D, -0x28000032,0x8C0288,0x8C0288,0x20000151,0x1600028A,0x2440288,0x2440288,0x2440288,0x3400009D,0x3400009D,0x28000032,0x8C0288,0x8C0288,0x20000151,0x1600028A,0x8C0288,0x8C0288,0x20000151,0x1600028A,0x1600028A,0xF6000082,0xF82000DD,0x300288,0x820000C1,0x5A0000A9,0x440000A9,0x40000082,0x2E0000DA,0x9600015A,0x5A000120,0x640288,0x20000151, -0x640288,0x7C0374,0xA8540000,0x68540000,0x52540001,0xBC0372,0x78080001,0x522C0001,0x17C0372,0x4A000091,0x3E000372,0xBC0372,0x78080001,0x522C0001,0x17C0372,0x4A000091,0x3E000372,0x17C0372,0x4A000091,0x3E000372,0x3E000372,0xBC0372,0x78080001,0x522C0001,0x17C0372,0x4A000091,0x3E000372,0x17C0372,0x4A000091,0x3E000372,0x3E000372,0x17C0372, -0x4A000091,0x3E000372,0x3E000372,0x3E000372,0xFC30016D,0xA840372,0xFE6C01B1,0xBC000071,0x80000095,0x5E000080,0x5000002D,0x4A0000FA,0xF80C0152,0x9E0000B5,0x56000014,0x3E000372,0x10C0372,0x40014,0x40014,0x40014,0x40014,0x40014,0x40014,0x40014,0x40014,0x40014,0x40014,0x10000000,0x10000000,0x10000000,0x10000000,0x10000000, 
-0x10000000,0x8000000,0x8000000,0x8000000,0x4000001,0x80012,0x80012,0x80012,0x80012,0x80012,0x80012,0x6000005,0x6000005,0x6000005,0x4000005,0xC0012,0xC0012,0xC0012,0x400000A,0x2000012,0x58000000,0x40014,0x40014,0x28000000,0x1C000000,0x14000000,0x14000000,0xE000000,0x20000005,0x16000002,0xA000001,0x6000005, -0xC0012,}; -static const uint32_t g_etc1_to_bc7_m6_table132[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000, -0x2C0000,0x580000,0x580000,0x580000,0xE000000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x580000,0x580000,0x580000,0xE000000,0x580000,0x580000,0x580000,0xE000000,0xE000000,0x200000,0x1C0001,0x1C0001,0x6200000,0x240000,0x280000,0x280000,0x300000,0x6200000,0x240000,0x3C0000,0x580000, -0x3C0000,0x640001,0x640001,0x640001,0x640001,0x980000,0x980000,0x980000,0x1300000,0x1300000,0x32000000,0x980000,0x980000,0x980000,0x1300000,0x1300000,0x32000000,0x1300000,0x1300000,0x32000000,0x32000000,0x980000,0x980000,0x980000,0x1300000,0x1300000,0x32000000,0x1300000,0x1300000,0x32000000,0x32000000,0x1300000, -0x1300000,0x32000000,0x32000000,0x32000000,0x4740000,0x6C0000,0x640001,0x2880000,0xAC0000,0xD80000,0xF80000,0x1780000,0x800000,0x980000,0xD80000,0x32000000,0xD80000,0xB80001,0x1140000,0xFF80000,0x5C000000,0x1140000,0xFF80000,0x5C000000,0xFF80000,0x5C000000,0x5C000000,0x1140000,0xFF80000,0x5C000000,0xFF80000,0x5C000000, -0x5C000000,0xFF80000,0x5C000000,0x5C000000,0x5C000000,0x1140000,0xFF80000,0x5C000000,0xFF80000,0x5C000000,0x5C000000,0xFF80000,0x5C000000,0x5C000000,0x5C000000,0xFF80000,0x5C000000,0x5C000000,0x5C000000,0x5C000000,0xE80000,0xAC40000,0xAC40000,0x1380000,0x1C80000,0x35F00000,0x5C000000,0x5C000000,0xFC0000,0x15C0000,0x53D80000,0x5C000000, -0x18C0000,0x501423,0xEC2C055E,0x7E2C055E,0x5C2C055E,0xCA1003E3,0x86100072,0x6014011D,0x661003E3,0x560C00EA,0x461003E5,0xA0000BE8,0x7A00034E,0x5C04028E,0x620004C5,0x5200010D,0x460003B4,0x4E000BE8,0x4A00060E,0x3E0006ED,0x34000BE8,0x2741423,0x6E0008EE,0x58000601,0x5800089F,0x4C0003D6,0x400005B1,0x46000E44,0x4400082B,0x3C000898,0x34000D09,0xF01423, -0x38000D6E,0x32000CD1,0x2C00102C,0x26001425,0xFE200511,0xF8400DDB,0xFC480EE3,0xD000011D,0x8A000126,0x6A00010D,0x5A00005E,0x50000214,0xFE00052B,0xA80002B6,0x5400046A,0x3C000898,0xA81423,0x680BEB,0xD444028A,0x7A40028A,0x5C40028A,0xBA200373,0x821C0002,0x5E280041,0x62200373,0x561800A6,0x461C0375,0x9C0BE8,0x7A00034E,0x5C080288,0x620004C5,0x5200010D, -0x460003B4,0x13C0BE8,0x4A00060E,0x3E0006ED,0x34000BE8,0x9C0BE8,0x7A00034E,0x5C080288,0x620004C5,0x5200010D,0x460003B4,0x13C0BE8,0x4A00060E,0x3E0006ED,0x34000BE8,0x13C0BE8,0x4A00060E,0x3E0006ED,0x34000BE8,0x34000BE8,0xFA2C0402,0xF458087D,0xF860084A,0xD000011D,0x8A000126,0x6A00010D,0x5A00005E,0x50000214,0xFC040428,0xA800023D,0x54000451,0x3E0006ED, -0xE00BE8,0x2C055E,0x2C055E,0x2C055E,0x2C055E,0x8A100071,0x8A100071,0x8A100071,0x46100072,0x46100072,0x32100071,0x64000288,0x64000288,0x64000288,0x46000049,0x46000049,0x32000004,0x30000288,0x30000288,0x2A000104,0x20000288,0x40055E,0x40055E,0x40055E,0x3A0001F2,0x3A0001F2,0x2E0000F1,0x28000363,0x28000363,0x260001B2,0x1E0002F4,0x80055E, 
-0x80055E,0x20000359,0x1C000411,0x14000561,0xFC0C00AC,0xF8200291,0x2C055E,0xAC00006A,0x72000050,0x56000059,0x4C000032,0x40000082,0xC2000199,0x84000101,0x3E000298,0x260001B2,0x5C055E,0x40028A,0x40028A,0x40028A,0x40028A,0x7A200001,0x7A200001,0x7A200001,0x441C0001,0x441C0001,0x321C0001,0x600288,0x600288,0x600288,0x46000049,0x46000049, -0x32000004,0xC40288,0xC40288,0x2A000104,0x20000288,0x600288,0x600288,0x600288,0x46000049,0x46000049,0x32000004,0xC40288,0xC40288,0x2A000104,0x20000288,0xC40288,0xC40288,0x2A000104,0x20000288,0x20000288,0xFE100080,0xF23400F2,0x40028A,0xAC00006A,0x72000050,0x56000059,0x4C000032,0x40000082,0xC2000109,0x840000C1,0x8C0288,0x2A000104, -0x8C0288,0x900372,0xB0680001,0x70680001,0x5C640001,0xD40372,0x82180001,0x5C3C0000,0x1B00372,0x56000055,0x46000374,0xD40372,0x82180001,0x5C3C0000,0x1B00372,0x56000055,0x46000374,0x1B00372,0x56000055,0x46000374,0x46000374,0xD40372,0x82180001,0x5C3C0000,0x1B00372,0x56000055,0x46000374,0x1B00372,0x56000055,0x46000374,0x46000374,0x1B00372, -0x56000055,0x46000374,0x46000374,0x46000374,0xFE440188,0x4980372,0xFA8401C2,0xDA000041,0x9600006D,0x6C000055,0x5A00000D,0x540000B9,0xFE180164,0xBA00007D,0x62000004,0x46000374,0x1300372,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x100071,0x2C000000,0x2C000000,0x2C000000,0x2C000000,0x2C000000, -0x2C000000,0x16000000,0x16000000,0x16000000,0xE000000,0x140071,0x140071,0x140071,0x140071,0x140071,0x140071,0x12000028,0x12000028,0x12000028,0xC000014,0x240071,0x240071,0x240071,0xA000041,0x6000071,0xE8000000,0x100071,0x100071,0x68000000,0x48000000,0x36000000,0x36000000,0x24000000,0x52000022,0x42000011,0x1C000004,0x12000028, -0x1C0071,}; -static const uint32_t g_etc1_to_bc7_m6_table133[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x440000,0x440000,0x440000,0x440000,0x440000, -0x440000,0x880000,0x880000,0x880000,0x16000000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x880000,0x880000,0x880000,0x16000000,0x880000,0x880000,0x880000,0x16000000,0x16000000,0x300000,0x2C0001,0x2C0001,0x2340000,0x380000,0x23C0000,0x23C0000,0x4C0000,0x2340000,0x380000,0x600000,0x880000, -0x600000,0x740001,0x740001,0x740001,0x740001,0xB00000,0xB00000,0xB00000,0x1640000,0x1640000,0x3A000000,0xB00000,0xB00000,0xB00000,0x1640000,0x1640000,0x3A000000,0x1640000,0x1640000,0x3A000000,0x3A000000,0xB00000,0xB00000,0xB00000,0x1640000,0x1640000,0x3A000000,0x1640000,0x1640000,0x3A000000,0x3A000000,0x1640000, -0x1640000,0x3A000000,0x3A000000,0x3A000000,0x880000,0x67C0000,0x740001,0xA00000,0xC40000,0xF80000,0x1200000,0x1B40000,0x940000,0xB00000,0xF80000,0x3A000000,0xF80000,0xC80001,0x12C0000,0x1BF80000,0x64000000,0x12C0000,0x1BF80000,0x64000000,0x1BF80000,0x64000000,0x64000000,0x12C0000,0x1BF80000,0x64000000,0x1BF80000,0x64000000, -0x64000000,0x1BF80000,0x64000000,0x64000000,0x64000000,0x12C0000,0x1BF80000,0x64000000,0x1BF80000,0x64000000,0x64000000,0x1BF80000,0x64000000,0x64000000,0x64000000,0x1BF80000,0x64000000,0x64000000,0x64000000,0x64000000,0xFC0000,0xD80000,0xD80000,0x1540000,0x1EC0000,0x3FF00000,0x64000000,0x64000000,0x3100000,0x17C0000,0x5BE80000,0x64000000, 
-0x1AC0000,0x5C16CF,0xF83806EA,0x883806EA,0x643806EA,0xDA18047B,0x90180112,0x662001F9,0x7018047B,0x6014016E,0x4E18047D,0xB8000BE8,0x8C0002DE,0x660C02C2,0x6E00044D,0x5E00007D,0x4E00037D,0x5A000BE8,0x5000056A,0x44000675,0x3C000BE8,0x8816CF,0x7A000A0E,0x6200072E,0x68000911,0x580003FE,0x4C0005F9,0x52000F0C,0x4C00082B,0x44000886,0x3A000D5D,0x11416CF, -0x44000EFE,0x3E000E01,0x32001198,0x2C0016D1,0xFE280685,0xFE4C1017,0xFE4C119F,0xE6000086,0x9A0000A1,0x78000083,0x6200000D,0x5800015D,0xFE00061B,0xBC00020A,0x5E0003DE,0x44000886,0xC016CF,0x780BEB,0xDC54028A,0x8250028A,0x6450028A,0xC2300373,0x8A2C0002,0x66380041,0x6A300373,0x5E2800A6,0x4E2C0375,0xB40BE8,0x8C0002DE,0x64180288,0x6E00044D,0x5E00007D, -0x4E00037D,0x1700BE8,0x5000056A,0x44000675,0x3C000BE8,0xB40BE8,0x8C0002DE,0x64180288,0x6E00044D,0x5E00007D,0x4E00037D,0x1700BE8,0x5000056A,0x44000675,0x3C000BE8,0x1700BE8,0x5000056A,0x44000675,0x3C000BE8,0x3C000BE8,0xFE340432,0xFC68087D,0xFE6C085E,0xE6000086,0x9A0000A1,0x78000083,0x6200000D,0x5800015D,0xFE180455,0xBC00017A,0x5E0003BA,0x44000675, -0x1000BE8,0x3806EA,0x3806EA,0x3806EA,0x3806EA,0x9A180109,0x9A180109,0x9A180109,0x5018010A,0x5018010A,0x3A180109,0x7C000288,0x7C000288,0x7C000288,0x52000019,0x52000019,0x3A040008,0x3C000288,0x3C000288,0x320000B9,0x28000288,0x5006E9,0x5006E9,0x5006E9,0x4600028A,0x4600028A,0x34000169,0x340003DB,0x340003DB,0x2E0001C2,0x28000331,0xA006E9, -0xA006E9,0x2600043D,0x200004EC,0x1A0006E9,0xFE100140,0xFE2C03C9,0x3806EA,0xD000002D,0x86000020,0x6A00001D,0x62000009,0x4A000041,0xE40001A5,0x960000E2,0x4C0002A1,0x2E0001C2,0x7006E9,0x50028A,0x50028A,0x50028A,0x50028A,0x82300001,0x82300001,0x82300001,0x4C2C0001,0x4C2C0001,0x3A2C0001,0x780288,0x780288,0x780288,0x52000019,0x52000019, -0x3A0C0000,0xF40288,0xF40288,0x320000B9,0x28000288,0x780288,0x780288,0x780288,0x52000019,0x52000019,0x3A0C0000,0xF40288,0xF40288,0x320000B9,0x28000288,0xF40288,0xF40288,0x320000B9,0x28000288,0x28000288,0xFE200091,0xFA4400F2,0x50028A,0xD000002D,0x86000020,0x6A00001D,0x62000009,0x4A000041,0xF40000CA,0x9A000080,0xAC0288,0x320000B9, -0xAC0288,0xA00372,0xB8780001,0x78780001,0x64740001,0xEC0372,0x8A280001,0x644C0000,0x1E40372,0x60000034,0x4E000374,0xEC0372,0x8A280001,0x644C0000,0x1E40372,0x60000034,0x4E000374,0x1E40372,0x60000034,0x4E000374,0x4E000374,0xEC0372,0x8A280001,0x644C0000,0x1E40372,0x60000034,0x4E000374,0x1E40372,0x60000034,0x4E000374,0x4E000374,0x1E40372, -0x60000034,0x4E000374,0x4E000374,0x4E000374,0xFE50019A,0xCA80372,0xF29401E1,0xEE000020,0xA000003D,0x7A00002D,0x64000001,0x5C000091,0xFE2C0185,0xCC000059,0x6A080000,0x4E000374,0x1540372,0x180109,0x180109,0x180109,0x180109,0x180109,0x180109,0x180109,0x180109,0x180109,0x180109,0x44000000,0x44000000,0x44000000,0x44000000,0x44000000, -0x44000000,0x20000001,0x20000001,0x20000001,0x16000000,0x200109,0x200109,0x200109,0x200109,0x200109,0x200109,0x18000064,0x18000064,0x18000064,0x12000034,0x3C0109,0x3C0109,0x3C0109,0x1000009D,0xA000109,0xFC080019,0x180109,0x180109,0xA0000000,0x6E000000,0x54000000,0x54000000,0x38000000,0x84000050,0x64000028,0x26000008,0x18000064, -0x2C0109,}; -static const uint32_t g_etc1_to_bc7_m6_table134[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000, 
-0x5C0000,0xB80000,0xB80000,0xB80000,0x1E000000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0xB80000,0xB80000,0xB80000,0x1E000000,0xB80000,0xB80000,0xB80000,0x1E000000,0x1E000000,0x8400000,0x3C0001,0x3C0001,0x480000,0x24C0000,0x540000,0x540000,0x680000,0x480000,0x24C0000,0x800000,0xB80000, -0x800000,0x840001,0x840001,0x840001,0x840001,0x2C40000,0x2C40000,0x2C40000,0x1940000,0x1940000,0x42000000,0x2C40000,0x2C40000,0x2C40000,0x1940000,0x1940000,0x42000000,0x1940000,0x1940000,0x42000000,0x42000000,0x2C40000,0x2C40000,0x2C40000,0x1940000,0x1940000,0x42000000,0x1940000,0x1940000,0x42000000,0x42000000,0x1940000, -0x1940000,0x42000000,0x42000000,0x42000000,0x9C0000,0xE8C0000,0x840001,0x2B40000,0xE00000,0x11C0000,0x1480000,0x1F00000,0xA80000,0x2C40000,0x11C0000,0x42000000,0x11C0000,0xD80001,0x1440000,0x27F80000,0x6C000000,0x1440000,0x27F80000,0x6C000000,0x27F80000,0x6C000000,0x6C000000,0x1440000,0x27F80000,0x6C000000,0x27F80000,0x6C000000, -0x6C000000,0x27F80000,0x6C000000,0x6C000000,0x6C000000,0x1440000,0x27F80000,0x6C000000,0x27F80000,0x6C000000,0x6C000000,0x27F80000,0x6C000000,0x6C000000,0x6C000000,0x27F80000,0x6C000000,0x6C000000,0x6C000000,0x6C000000,0x1100000,0xE80000,0xE80000,0x36C0000,0x9FC0000,0x49F00000,0x6C000000,0x6C000000,0x1280000,0x1980000,0x63F80000,0x6C000000, -0x1D00000,0x6819DB,0xFE4408D2,0x924008CA,0x6C4008CA,0xEA200553,0x9A2001FA,0x70280319,0x7A200553,0x681C0235,0x56200555,0xD0000BE8,0x9E00029E,0x6E10032E,0x800003ED,0x68000029,0x56000378,0x66000BE8,0x5C0004DA,0x500005E5,0x44000BE8,0x29819DB,0x86000B8E,0x6A0008DD,0x740009E9,0x62000465,0x5200068D,0x5E000FF4,0x58000853,0x4C00088E,0x40000DD1,0x13819DB, -0x4A0010D2,0x44000F69,0x3E001338,0x320019DD,0xFE340856,0xF65C1329,0xF860148A,0xFE000033,0xAC00003D,0x84000029,0x6C040006,0x640000D5,0xFE1007BB,0xD4000193,0x66000384,0x4C00088E,0xDC19DB,0x880BEB,0xE464028A,0x8A60028A,0x6C60028A,0xCA400373,0x923C0002,0x6E480041,0x72400373,0x663800A6,0x563C0375,0xCC0BE8,0x9E00029E,0x6C280288,0x800003ED,0x68000029, -0x56080374,0x1A00BE8,0x5C0004DA,0x500005E5,0x44000BE8,0xCC0BE8,0x9E00029E,0x6C280288,0x800003ED,0x68000029,0x56080374,0x1A00BE8,0x5C0004DA,0x500005E5,0x44000BE8,0x1A00BE8,0x5C0004DA,0x500005E5,0x44000BE8,0x44000BE8,0xFE50045E,0xF47808BB,0xF880088B,0xFE000033,0xAC00003D,0x84000029,0x6C040002,0x640000D5,0xFE2404A1,0xDA0000E6,0x6A000359,0x500005E5, -0x1240BE8,0x4008CA,0x4008CA,0x4008CA,0x4008CA,0xAA2001E1,0xAA2001E1,0xAA2001E1,0x5A2001E2,0x5A2001E2,0x422001E1,0x94000288,0x94000288,0x94000288,0x62000001,0x62000001,0x42080034,0x48000288,0x48000288,0x3C000088,0x30000288,0x6008C9,0x6008C9,0x6008C9,0x52000362,0x52000362,0x40000221,0x40000473,0x40000473,0x360001F8,0x2E000371,0xC408C9, -0xC408C9,0x2C000569,0x260005F4,0x200008C9,0xFE200221,0xF438058A,0x4008CA,0xF200000D,0x9E000005,0x7C000005,0x70000001,0x56000019,0xFE0001FA,0xB60000CE,0x5C0002AC,0x360001F8,0x8C08C9,0x60028A,0x60028A,0x60028A,0x60028A,0x8A400001,0x8A400001,0x8A400001,0x543C0001,0x543C0001,0x423C0001,0x900288,0x900288,0x900288,0x62000001,0x62000001, -0x421C0000,0x1240288,0x1240288,0x3C000088,0x30000288,0x900288,0x900288,0x900288,0x62000001,0x62000001,0x421C0000,0x1240288,0x1240288,0x3C000088,0x30000288,0x1240288,0x1240288,0x3C000088,0x30000288,0x30000288,0xFA3400A2,0xF2540109,0x60028A,0xF200000D,0x9E000005,0x7C000005,0x6E040000,0x56000019,0xFE0C00C8,0xBC00004A,0xD00288,0x3C000088, 
-0xD00288,0xB00372,0xC0880001,0x80880001,0x6C840001,0x1040372,0x92380001,0x6C5C0000,0x7FC0372,0x6A00001D,0x56000374,0x1040372,0x92380001,0x6C5C0000,0x7FC0372,0x6A00001D,0x56000374,0x7FC0372,0x6A00001D,0x56000374,0x56000374,0x1040372,0x92380001,0x6C5C0000,0x7FC0372,0x6A00001D,0x56000374,0x7FC0372,0x6A00001D,0x56000374,0x56000374,0x7FC0372, -0x6A00001D,0x56000374,0x56000374,0x56000374,0xF86C01A5,0xBC0372,0xFAA401E1,0xFE040014,0xAC040029,0x84000019,0x6C0C0000,0x64000071,0xFA4C0188,0xDE000034,0x72180000,0x56000374,0x1740372,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x2001E1,0x5C000000,0x5C000000,0x5C000000,0x5C000000,0x5C000000, -0x5C000000,0x2C000001,0x2C000001,0x2C000001,0x1E000000,0x2C01E1,0x2C01E1,0x2C01E1,0x2C01E1,0x2C01E1,0x2C01E1,0x220000AA,0x220000AA,0x220000AA,0x1A000061,0x5801E1,0x5801E1,0x5801E1,0x16000121,0xE0001E1,0xFE0C0075,0x2001E1,0x2001E1,0xD8000000,0x96000000,0x72000000,0x72000000,0x4C000000,0xB6000092,0x82000050,0x3600000D,0x220000AA, -0x3C01E1,}; -static const uint32_t g_etc1_to_bc7_m6_table135[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x740000,0x740000,0x740000,0x740000,0x740000, -0x740000,0xE80000,0xE80000,0xE80000,0x26000000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0xE80000,0xE80000,0xE80000,0x26000000,0xE80000,0xE80000,0xE80000,0x26000000,0x26000000,0x540000,0x4C0001,0x4C0001,0x4580000,0x2600000,0x2680000,0x2680000,0x2800000,0x4580000,0x2600000,0xA40000,0xE80000, -0xA40000,0x940001,0x940001,0x940001,0x940001,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000,0x1C40000,0x4A000000,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000,0x1C40000,0x4A000000,0x1C40000,0x1C40000,0x4A000000,0x4A000000,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000,0x1C40000,0x4A000000,0x1C40000,0x1C40000,0x4A000000,0x4A000000,0x1C40000, -0x1C40000,0x4A000000,0x4A000000,0x4A000000,0x4AC0000,0xA00000,0x940001,0xCC0000,0xFC0000,0x13C0000,0x16C0000,0xBF80000,0xBC0000,0x2DC0000,0x13C0000,0x4A000000,0x13C0000,0xE80001,0x15C0000,0x33F80000,0x74000000,0x15C0000,0x33F80000,0x74000000,0x33F80000,0x74000000,0x74000000,0x15C0000,0x33F80000,0x74000000,0x33F80000,0x74000000, -0x74000000,0x33F80000,0x74000000,0x74000000,0x74000000,0x15C0000,0x33F80000,0x74000000,0x33F80000,0x74000000,0x74000000,0x33F80000,0x74000000,0x74000000,0x74000000,0x33F80000,0x74000000,0x74000000,0x74000000,0x74000000,0x1240000,0x4F80000,0x4F80000,0x1880000,0x17FC0000,0x53F00000,0x74000000,0x74000000,0x33C0000,0x1B80000,0x6DCC0000,0x74000000, -0x1F00000,0x741D47,0xFE500B2E,0x9A4C0AFE,0x744C0AFE,0xFA28066B,0xA62C0322,0x7A340492,0x8428066B,0x6E24033A,0x5E28066D,0xEA000BE8,0xAC000289,0x781803CA,0x8C0003A5,0x72000009,0x60080394,0x72000BE8,0x6600046C,0x5600057D,0x4C000BE8,0xAC1D47,0x92000D6E,0x74000AFE,0x80000B01,0x6800053D,0x5E000775,0x680010D8,0x5E00088F,0x540008B8,0x48000E58,0x15C1D47, -0x5000130E,0x4A001121,0x440014EC,0x38001D49,0xFE3C0AD7,0xFC68162D,0xFE6C17C2,0xFE0C0095,0xC000000D,0x90000005,0x7608002D,0x6C000072,0xFE180A0E,0xE6000156,0x7600031A,0x540008B8,0xF41D47,0x980BEB,0xEC74028A,0x9270028A,0x7470028A,0xD2500373,0x9A4C0002,0x76580041,0x7A500373,0x6E4800A6,0x5E4C0375,0xE40BE8,0xAA040289,0x74380288,0x8C0003A5,0x72000009, 
-0x5E180374,0x1D00BE8,0x6600046C,0x5600057D,0x4C000BE8,0xE40BE8,0xAA040289,0x74380288,0x8C0003A5,0x72000009,0x5E180374,0x1D00BE8,0x6600046C,0x5600057D,0x4C000BE8,0x1D00BE8,0x6600046C,0x5600057D,0x4C000BE8,0x4C000BE8,0xFC6004C1,0xFC8808BB,0xFE8C08A3,0xFE14004A,0xC000000D,0x90000005,0x74140002,0x6C000072,0xFC4004E8,0xEC00007E,0x760002E9,0x5600057D, -0x1480BE8,0x4C0AFE,0x4C0AFE,0x4C0AFE,0x4C0AFE,0xBA2802F9,0xBA2802F9,0xBA2802F9,0x642802FA,0x642802FA,0x4A2802F9,0xAC000288,0xAC000288,0xAC000288,0x6E040005,0x6E040005,0x4C100084,0x54000288,0x54000288,0x44000055,0x38000288,0x700AFE,0x700AFE,0x700AFE,0x62000471,0x62000471,0x46000321,0x4C00052B,0x4C00052B,0x40000236,0x340003C9,0xE40AFE, -0xE40AFE,0x320006DD,0x2C00072C,0x24000B01,0xFE2C0378,0xFA440776,0x4C0AFE,0xFE080020,0xB8000001,0x8E000002,0x80040011,0x62000005,0xFE00031A,0xD40000D2,0x660002B4,0x40000236,0xA00AFE,0x70028A,0x70028A,0x70028A,0x70028A,0x92500001,0x92500001,0x92500001,0x5C4C0001,0x5C4C0001,0x4A4C0001,0xA80288,0xA80288,0xA80288,0x6A0C0001,0x6A0C0001, -0x4A2C0000,0x1580288,0x1580288,0x44000055,0x38000288,0xA80288,0xA80288,0xA80288,0x6A0C0001,0x6A0C0001,0x4A2C0000,0x1580288,0x1580288,0x44000055,0x38000288,0x1580288,0x1580288,0x44000055,0x38000288,0x38000288,0xF64800B5,0xFA640109,0x70028A,0xFA10000D,0xAE0C0001,0x88080000,0x76140000,0x62000005,0xF62400DD,0xDA000022,0xF00288,0x44000055, -0xF00288,0xC00372,0xC8980001,0x88980001,0x74940001,0x11C0372,0x9A480001,0x746C0000,0x13FC0372,0x72000008,0x5E000374,0x11C0372,0x9A480001,0x746C0000,0x13FC0372,0x72000008,0x5E000374,0x13FC0372,0x72000008,0x5E000374,0x5E000374,0x11C0372,0x9A480001,0x746C0000,0x13FC0372,0x72000008,0x5E000374,0x13FC0372,0x72000008,0x5E000374,0x5E000374,0x13FC0372, -0x72000008,0x5E000374,0x5E000374,0x5E000374,0xFE7801B1,0xCC0372,0xF2B40202,0xFE200020,0xC000000D,0x90000005,0x741C0000,0x70000050,0xFE5C018A,0xF400001D,0x7A280000,0x5E000374,0x1980372,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x2802F9,0x76000000,0x76000000,0x76000000,0x76000000,0x76000000, -0x76000000,0x38000001,0x38000001,0x38000001,0x26000000,0x3802F9,0x3802F9,0x3802F9,0x3802F9,0x3802F9,0x3802F9,0x2E000112,0x2E000112,0x2E000112,0x22000089,0x7002F9,0x7002F9,0x7002F9,0x1C0001CD,0x120002F9,0xF4180120,0x2802F9,0x2802F9,0xFE040005,0xBE000000,0x90000000,0x90000000,0x5E000000,0xF60000F1,0xA400007D,0x46000011,0x2E000112, -0x5002F9,}; -static const uint32_t g_etc1_to_bc7_m6_table136[] = { -0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x200000, -0x200000,0x200000,0x200000,0x4000001,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0x100000,0x180000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000, -0x28C0000,0x1200000,0x1200000,0x1200000,0x2E000001,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x1200000,0x1200000,0x1200000,0x2E000001,0x1200000,0x1200000,0x1200000,0x2E000001,0x2E000001,0xA640000,0x600000,0x600000,0x700000,0x780000,0x2800000,0x2800000,0xA00000,0x700000,0x780000,0xCC0000,0x1200000, 
-0xCC0000,0xA80000,0xA80000,0xA80000,0xA80000,0xF80000,0xF80000,0xF80000,0x1FC0000,0x1FC0000,0x52000001,0xF80000,0xF80000,0xF80000,0x1FC0000,0x1FC0000,0x52000001,0x1FC0000,0x1FC0000,0x52000001,0x52000001,0xF80000,0xF80000,0xF80000,0x1FC0000,0x1FC0000,0x52000001,0x1FC0000,0x1FC0000,0x52000001,0x52000001,0x1FC0000, -0x1FC0000,0x52000001,0x52000001,0x52000001,0xC40000,0xB40000,0xA80000,0xE40000,0x1180000,0x1640000,0x19C0000,0x17F80000,0x4D00000,0xF80000,0x1640000,0x52000001,0x1640000,0xFC0000,0x3740000,0x3FFC0000,0x7C000001,0x3740000,0x3FFC0000,0x7C000001,0x3FFC0000,0x7C000001,0x7C000001,0x3740000,0x3FFC0000,0x7C000001,0x3FFC0000,0x7C000001, -0x7C000001,0x3FFC0000,0x7C000001,0x7C000001,0x7C000001,0x3740000,0x3FFC0000,0x7C000001,0x3FFC0000,0x7C000001,0x7C000001,0x3FFC0000,0x7C000001,0x7C000001,0x7C000001,0x3FFC0000,0x7C000001,0x7C000001,0x7C000001,0x7C000001,0x13C0000,0x10C0000,0x10C0000,0x1A80000,0x27F80000,0x5DFC0000,0x7C000001,0x7C000001,0x3540000,0x1D80000,0x75FC0000,0x7C000001, -0xDFC0000,0x841EA8,0xFE600C58,0xA45C0BE8,0x7C5C0BE9,0xFC3806F8,0xB23803A1,0x82440531,0x8E3406E6,0x7A3403AE,0x683406E6,0xF80C0BE8,0xB80C028A,0x84240411,0x9608038E,0x7C0C000E,0x6A1403AE,0x7C0C0BE8,0x72000417,0x62040533,0x540C0BEB,0xC41EA8,0xA8000D6B,0x7C100BE9,0x8C000A7E,0x740004CE,0x68000746,0x7400107B,0x6A000782,0x5E0007AB,0x52000DBC,0x18C1EA8, -0x5C001329,0x560010EE,0x4A0014E3,0x40001EAC,0xFE500C45,0xFE6C17DE,0xF67C1978,0xFE1C0128,0xCA0C0011,0x9A0C0006,0x80140046,0x78040046,0xFE340B52,0xFE000098,0x820002B5,0x5E0007AB,0x1181EA8,0xAC0BE8,0xF8840288,0x9A840288,0x7C840289,0xDC600372,0xA0600005,0x7E680042,0x84600372,0x765800A5,0x68600372,0x1000BE8,0xB4140289,0x7E480289,0x9A00037D,0x7C14000A, -0x68280372,0x5F80BE8,0x720003F3,0x60000513,0x54000BEB,0x1000BE8,0xB4140289,0x7E480289,0x9A00037D,0x7C14000A,0x68280372,0x5F80BE8,0x720003F3,0x60000513,0x54000BEB,0x5F80BE8,0x720003F3,0x60000513,0x54000BEB,0x54000BEB,0xFE7804ED,0xF69C08F6,0xFAA408C9,0xFE300081,0xCA10000D,0x98100006,0x7E280001,0x7A000031,0xFC580519,0xFE040033,0x820002B1,0x60000513, -0x16C0BE8,0x5C0BE8,0x5C0BE8,0x5C0BE8,0x5C0BE8,0xC6380374,0xC6380374,0xC6380374,0x6E380374,0x6E380374,0x52340375,0xBC0C0288,0xBC0C0288,0xBC0C0288,0x7A100009,0x7A100009,0x561800A6,0x5E0C0288,0x5E0C0288,0x4E080041,0x400C028A,0x880BE8,0x880BE8,0x880BE8,0x6E000465,0x6E000465,0x52000373,0x580004E1,0x580004E1,0x4C0001AA,0x4000036B,0x1140BE8, -0x1140BE8,0x3E0006ED,0x3800072A,0x2C000BEB,0xFE3C0434,0xFE4C0864,0x5C0BE8,0xFE180059,0xC80C0005,0x9A0C0005,0x88140021,0x6E0C0002,0xFE1403B9,0xF600005E,0x7C00028C,0x4C0001AA,0xC00BE8,0x840288,0x840288,0x840288,0x840288,0x9C600000,0x9C600000,0x9C600000,0x64600000,0x64600000,0x52600001,0xC40288,0xC40288,0xC40288,0x72200001,0x72200001, -0x543C0001,0x18C0288,0x18C0288,0x5000002D,0x4000028A,0xC40288,0xC40288,0xC40288,0x72200001,0x72200001,0x543C0001,0x18C0288,0x18C0288,0x5000002D,0x4000028A,0x18C0288,0x18C0288,0x5000002D,0x4000028A,0x4000028A,0xFE5800B5,0xF4780120,0x840288,0xFE240012,0xBC180000,0x92180000,0x7E280000,0x70080000,0xFE3400DD,0xFC00000A,0x1180288,0x5000002D, -0x1180288,0xD00374,0xD2A80000,0x92A80000,0x7CA80001,0x1380372,0xA25C0001,0x7C800001,0x21F80372,0x7C000001,0x68000372,0x1380372,0xA25C0001,0x7C800001,0x21F80372,0x7C000001,0x68000372,0x21F80372,0x7C000001,0x68000372,0x68000372,0x1380372,0xA25C0001,0x7C800001,0x21F80372,0x7C000001,0x68000372,0x21F80372,0x7C000001,0x68000372,0x68000372,0x21F80372, 
-0x7C000001,0x68000372,0x68000372,0x68000372,0xFE9401C2,0xE00372,0xFCC80200,0xFC3C0032,0xCE040005,0x9C040000,0x7C300001,0x7A00002D,0xFE7001B1,0xFE080014,0x84380000,0x68000372,0x1BC0372,0x340374,0x340374,0x340374,0x340374,0x340374,0x340374,0x340374,0x340374,0x340374,0x340374,0x840C0000,0x840C0000,0x840C0000,0x840C0000,0x840C0000, -0x840C0000,0x440C0000,0x440C0000,0x440C0000,0x2E0C0001,0x500372,0x500372,0x500372,0x500372,0x500372,0x500372,0x3A0000E9,0x3A0000E9,0x3A0000E9,0x2E000052,0xA00372,0xA00372,0xA00372,0x200001E1,0x1A000372,0xFC28016D,0x340374,0x340374,0xFE10001D,0xD20C0000,0xA20C0000,0xA20C0000,0x6C0C0000,0xF80400F2,0xD6000041,0x5A000000,0x3A0000E9, -0x700372,}; -static const uint32_t g_etc1_to_bc7_m6_table137[] = { -0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x280000,0x500000, -0x500000,0x500000,0x500000,0xC000001,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x61C0000,0x61C0000,0x61C0000,0x280000,0x380000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000, -0x2A40000,0x1500000,0x1500000,0x1500000,0x36000001,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x1500000,0x1500000,0x1500000,0x36000001,0x1500000,0x1500000,0x1500000,0x36000001,0x36000001,0x780000,0x700000,0x700000,0x4800000,0x8C0000,0x980000,0x980000,0xBC0000,0x4800000,0x8C0000,0xEC0000,0x1500000, -0xEC0000,0xB80000,0xB80000,0xB80000,0xB80000,0x1100000,0x1100000,0x1100000,0xDFC0000,0xDFC0000,0x5A000001,0x1100000,0x1100000,0x1100000,0xDFC0000,0xDFC0000,0x5A000001,0xDFC0000,0xDFC0000,0x5A000001,0x5A000001,0x1100000,0x1100000,0x1100000,0xDFC0000,0xDFC0000,0x5A000001,0xDFC0000,0xDFC0000,0x5A000001,0x5A000001,0xDFC0000, -0xDFC0000,0x5A000001,0x5A000001,0x5A000001,0x2D40000,0xC40000,0xB80000,0x2F80000,0x1340000,0x1880000,0x1C00000,0x21FC0000,0x4E40000,0x1100000,0x1880000,0x5A000001,0x1880000,0x10C0000,0x38C0000,0x4BFC0000,0x84000001,0x38C0000,0x4BFC0000,0x84000001,0x4BFC0000,0x84000001,0x84000001,0x38C0000,0x4BFC0000,0x84000001,0x4BFC0000,0x84000001, -0x84000001,0x4BFC0000,0x84000001,0x84000001,0x84000001,0x38C0000,0x4BFC0000,0x84000001,0x4BFC0000,0x84000001,0x84000001,0x4BFC0000,0x84000001,0x84000001,0x84000001,0x4BFC0000,0x84000001,0x84000001,0x84000001,0x84000001,0x1500000,0x71C0000,0x71C0000,0x3C00000,0x35F80000,0x67FC0000,0x84000001,0x84000001,0x16C0000,0x1F80000,0x7FD00000,0x84000001, -0x1DF80000,0x941EA8,0xFE740C91,0xAC6C0BE8,0x846C0BE9,0xFC4C0716,0xBA4803A1,0x8A540531,0x964406E6,0x824403AE,0x704406E6,0xFE1C0BEB,0xC01C028A,0x8C340411,0x9E18038E,0x841C000E,0x722403AE,0x841C0BE8,0x780C0413,0x6A140533,0x5C1C0BEB,0xDC1EA8,0xBA000CBB,0x84200BE9,0x9800096E,0x80000406,0x700006EC,0x86000F87,0x7400061B,0x68000663,0x5A000D03,0x1BC1EA8, -0x6600124C,0x5C000FB6,0x5600140B,0x48001EAC,0xFE640CCE,0xFC8817E8,0xFE8C1978,0xFE34018E,0xD21C0011,0xA21C0006,0x88240046,0x80140046,0xFE440C0F,0xFE1000C2,0x8A08029D,0x68000663,0x1381EA8,0xBC0BE8,0xFC940289,0xA2940288,0x84940289,0xE4700372,0xA8700005,0x86780042,0x8C700372,0x7E6800A5,0x70700372,0x1180BE8,0xBC240289,0x86580289,0xA6040372,0x8424000A, 
-0x70380372,0x11F80BE8,0x78000393,0x6C0004B3,0x5C000BEB,0x1180BE8,0xBC240289,0x86580289,0xA6040372,0x8424000A,0x70380372,0x11F80BE8,0x78000393,0x6C0004B3,0x5C000BEB,0x11F80BE8,0x78000393,0x6C0004B3,0x5C000BEB,0x5C000BEB,0xFE800552,0xFEAC08F6,0xFEAC0901,0xFE4000AB,0xD220000D,0xA0200006,0x86380001,0x82000022,0xFE5C0579,0xFE180059,0x8C00028E,0x6C0004B3, -0x1900BE8,0x6C0BE8,0x6C0BE8,0x6C0BE8,0x6C0BE8,0xCE480374,0xCE480374,0xCE480374,0x76480374,0x76480374,0x5A440375,0xC41C0288,0xC41C0288,0xC41C0288,0x82200009,0x82200009,0x5E2800A6,0x661C0288,0x661C0288,0x56180041,0x481C028A,0xA00BE8,0xA00BE8,0xA00BE8,0x800003ED,0x800003ED,0x5A100373,0x6800042A,0x6800042A,0x540000F6,0x460002EB,0x1440BE8, -0x1440BE8,0x4A000655,0x3E00068A,0x34000BEB,0xFE4C0461,0xFA64087D,0x6C0BE8,0xFE2C0081,0xD01C0005,0xA21C0005,0x90240021,0x761C0002,0xFA2C03F9,0xFC080049,0x840C028A,0x540000F6,0xE40BE8,0x940288,0x940288,0x940288,0x940288,0xA4700000,0xA4700000,0xA4700000,0x6C700000,0x6C700000,0x5A700001,0xDC0288,0xDC0288,0xDC0288,0x7A300001,0x7A300001, -0x5C4C0001,0x1BC0288,0x1BC0288,0x58000019,0x4800028A,0xDC0288,0xDC0288,0xDC0288,0x7A300001,0x7A300001,0x5C4C0001,0x1BC0288,0x1BC0288,0x58000019,0x4800028A,0x1BC0288,0x1BC0288,0x58000019,0x4800028A,0x4800028A,0xFA6C00C8,0xFC880120,0x940288,0xFE34001D,0xC4280000,0x9A280000,0x86380000,0x78180000,0xFA4800F4,0xFC14000D,0x1380288,0x58000019, -0x1380288,0xE00374,0xDAB80000,0x9AB80000,0x84B80001,0x1500372,0xAA6C0001,0x84900001,0x2DF80372,0x84100001,0x70000372,0x1500372,0xAA6C0001,0x84900001,0x2DF80372,0x84100001,0x70000372,0x2DF80372,0x84100001,0x70000372,0x70000372,0x1500372,0xAA6C0001,0x84900001,0x2DF80372,0x84100001,0x70000372,0x2DF80372,0x84100001,0x70000372,0x70000372,0x2DF80372, -0x84100001,0x70000372,0x70000372,0x70000372,0xF8A801E1,0xF00372,0xF4D80221,0xFC540048,0xDC080001,0xA4140000,0x84400001,0x82000019,0xFC8C01C2,0xFE200028,0x8C480000,0x70000372,0x1E00372,0x440374,0x440374,0x440374,0x440374,0x440374,0x440374,0x440374,0x440374,0x440374,0x440374,0x8C1C0000,0x8C1C0000,0x8C1C0000,0x8C1C0000,0x8C1C0000, -0x8C1C0000,0x4C1C0000,0x4C1C0000,0x4C1C0000,0x361C0001,0x680372,0x680372,0x680372,0x680372,0x680372,0x680372,0x4C000089,0x4C000089,0x4C000089,0x36000011,0xD00372,0xD00372,0xD00372,0x2C000179,0x22000372,0xF4380188,0x440374,0x440374,0xFE200028,0xDA1C0000,0xAA1C0000,0xAA1C0000,0x741C0000,0xFE1000FA,0xFE000014,0x62100000,0x4C000089, -0x940372,}; -static const uint32_t g_etc1_to_bc7_m6_table138[] = { -0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x400000,0x800000, -0x800000,0x800000,0x800000,0x14000001,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0xE2C0000,0xE2C0000,0xE2C0000,0x400000,0x5C0000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000, -0x2BC0000,0x1800000,0x1800000,0x1800000,0x3E000001,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x1800000,0x1800000,0x1800000,0x3E000001,0x1800000,0x1800000,0x1800000,0x3E000001,0x3E000001,0x880000,0x800000,0x800000,0x940000,0xA00000,0x2AC0000,0x2AC0000,0x2D40000,0x940000,0xA00000,0x1100000,0x1800000, 
-0x1100000,0xC80000,0xC80000,0xC80000,0xC80000,0x1280000,0x1280000,0x1280000,0x19FC0000,0x19FC0000,0x62000001,0x1280000,0x1280000,0x1280000,0x19FC0000,0x19FC0000,0x62000001,0x19FC0000,0x19FC0000,0x62000001,0x62000001,0x1280000,0x1280000,0x1280000,0x19FC0000,0x19FC0000,0x62000001,0x19FC0000,0x19FC0000,0x62000001,0x62000001,0x19FC0000, -0x19FC0000,0x62000001,0x62000001,0x62000001,0xE80000,0x2D40000,0xC80000,0x1100000,0x1500000,0x1A80000,0x1E80000,0x2DF80000,0x4F80000,0x1280000,0x1A80000,0x62000001,0x1A80000,0x11C0000,0x3A40000,0x57FC0000,0x8C000001,0x3A40000,0x57FC0000,0x8C000001,0x57FC0000,0x8C000001,0x8C000001,0x3A40000,0x57FC0000,0x8C000001,0x57FC0000,0x8C000001, -0x8C000001,0x57FC0000,0x8C000001,0x8C000001,0x8C000001,0x3A40000,0x57FC0000,0x8C000001,0x57FC0000,0x8C000001,0x8C000001,0x57FC0000,0x8C000001,0x8C000001,0x8C000001,0x57FC0000,0x8C000001,0x8C000001,0x8C000001,0x8C000001,0x1640000,0xF2C0000,0xF2C0000,0x1DC0000,0x41FC0000,0x71FC0000,0x8C000001,0x8C000001,0x3800000,0xFFC0000,0x87E00000,0x8C000001, -0x2BFC0000,0xA41EA8,0xFE800CD5,0xB47C0BE8,0x8C7C0BE9,0xFE5C073E,0xC25803A1,0x92640531,0x9E5406E6,0x8A5403AE,0x785406E6,0xFE300BF8,0xC82C028A,0x94440411,0xA628038E,0x8C2C000E,0x7A3403AE,0x8C2C0BE8,0x801C0413,0x72240533,0x642C0BEB,0xF41EA8,0xC6000C43,0x8C300BE9,0xA8000866,0x8C00039E,0x780806E6,0x92000EA7,0x800004F3,0x70000563,0x64000C64,0x1F01EA8, -0x72001164,0x66000EB4,0x5C00131B,0x50001EAC,0xFE780D71,0xF498186E,0xF8A019DD,0xFE440206,0xDA2C0011,0xAA2C0006,0x90340046,0x88240046,0xFE540C9B,0xFE240136,0x9218029D,0x70000563,0x15C1EA8,0xCC0BE8,0xFEA40291,0xAAA40288,0x8CA40289,0xEC800372,0xB0800005,0x8E880042,0x94800372,0x867800A5,0x78800372,0x1300BE8,0xC4340289,0x8E680289,0xAE140372,0x8C34000A, -0x78480372,0x1DF40BE8,0x8400033B,0x72000463,0x64000BEB,0x1300BE8,0xC4340289,0x8E680289,0xAE140372,0x8C34000A,0x78480372,0x1DF40BE8,0x8400033B,0x72000463,0x64000BEB,0x1DF40BE8,0x8400033B,0x72000463,0x64000BEB,0x64000BEB,0xFE94057B,0xF8C00934,0xFAC4090C,0xFE5400EA,0xDA30000D,0xA8300006,0x8E480001,0x8A100022,0xFE780595,0xFE300095,0x96080288,0x72000463, -0x1B00BE8,0x7C0BE8,0x7C0BE8,0x7C0BE8,0x7C0BE8,0xD6580374,0xD6580374,0xD6580374,0x7E580374,0x7E580374,0x62540375,0xCC2C0288,0xCC2C0288,0xCC2C0288,0x8A300009,0x8A300009,0x663800A6,0x6E2C0288,0x6E2C0288,0x5E280041,0x502C028A,0xB80BE8,0xB80BE8,0xB80BE8,0x8C00039D,0x8C00039D,0x62200373,0x7400039A,0x7400039A,0x5E00006A,0x500002A3,0x1740BE8, -0x1740BE8,0x500005CD,0x4A0005FA,0x3C000BEB,0xFE5804B9,0xFE6C08A5,0x7C0BE8,0xFE3C00A8,0xD82C0005,0xAA2C0005,0x98340021,0x7E2C0002,0xFE3C0428,0xFE180062,0x8C1C028A,0x5E00006A,0x1080BE8,0xA40288,0xA40288,0xA40288,0xA40288,0xAC800000,0xAC800000,0xAC800000,0x74800000,0x74800000,0x62800001,0xF40288,0xF40288,0xF40288,0x82400001,0x82400001, -0x645C0001,0x1F00288,0x1F00288,0x6000000A,0x5000028A,0xF40288,0xF40288,0xF40288,0x82400001,0x82400001,0x645C0001,0x1F00288,0x1F00288,0x6000000A,0x5000028A,0x1F00288,0x1F00288,0x6000000A,0x5000028A,0x5000028A,0xF68000DD,0xF4980139,0xA40288,0xF8500029,0xCC380000,0xA2380000,0x8E480000,0x80280000,0xF2600109,0xFE240014,0x15C0288,0x6000000A, -0x15C0288,0xF00374,0xE2C80000,0xA2C80000,0x8CC80001,0x1680372,0xB27C0001,0x8CA00001,0x39F80372,0x8C200001,0x78000372,0x1680372,0xB27C0001,0x8CA00001,0x39F80372,0x8C200001,0x78000372,0x39F80372,0x8C200001,0x78000372,0x78000372,0x1680372,0xB27C0001,0x8CA00001,0x39F80372,0x8C200001,0x78000372,0x39F80372,0x8C200001,0x78000372,0x78000372,0x39F80372, 
-0x8C200001,0x78000372,0x78000372,0x78000372,0xFEB401ED,0x9000372,0xFCE80221,0xFE6C0055,0xE4180001,0xAC240000,0x8C500001,0x8A00000D,0xFE9801D4,0xFE400032,0x94580000,0x78000372,0x3FC0372,0x540374,0x540374,0x540374,0x540374,0x540374,0x540374,0x540374,0x540374,0x540374,0x540374,0x942C0000,0x942C0000,0x942C0000,0x942C0000,0x942C0000, -0x942C0000,0x542C0000,0x542C0000,0x542C0000,0x3E2C0001,0x800372,0x800372,0x800372,0x800372,0x800372,0x800372,0x58000041,0x58000041,0x58000041,0x3E040001,0x1000372,0x1000372,0x1000372,0x38000131,0x2A000372,0xFC480188,0x540374,0x540374,0xFA340034,0xE22C0000,0xB22C0000,0xB22C0000,0x7C2C0000,0xFE200115,0xFE140019,0x6A200000,0x58000041, -0xB40372,}; -static const uint32_t g_etc1_to_bc7_m6_table139[] = { -0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0x580000,0xB00000, -0xB00000,0xB00000,0xB00000,0x1C000001,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x3C0000,0x400000,0x400000,0x400000,0x580000,0x7C0000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000, -0xD40000,0x1B00000,0x1B00000,0x1B00000,0x46000001,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x1B00000,0x1B00000,0x1B00000,0x46000001,0x1B00000,0x1B00000,0x1B00000,0x46000001,0x46000001,0x4980000,0x900000,0x900000,0xA80000,0xB40000,0xC40000,0xC40000,0xF00000,0xA80000,0xB40000,0x1300000,0x1B00000, -0x1300000,0xD80000,0xD80000,0xD80000,0xD80000,0x1400000,0x1400000,0x1400000,0x25F80000,0x25F80000,0x6A000001,0x1400000,0x1400000,0x1400000,0x25F80000,0x25F80000,0x6A000001,0x25F80000,0x25F80000,0x6A000001,0x6A000001,0x1400000,0x1400000,0x1400000,0x25F80000,0x25F80000,0x6A000001,0x25F80000,0x25F80000,0x6A000001,0x6A000001,0x25F80000, -0x25F80000,0x6A000001,0x6A000001,0x6A000001,0xFC0000,0xAE40000,0xD80000,0x3240000,0x16C0000,0x1CC0000,0x9F80000,0x37FC0000,0x50C0000,0x1400000,0x1CC0000,0x6A000001,0x1CC0000,0x12C0000,0x3BC0000,0x63FC0000,0x94000001,0x3BC0000,0x63FC0000,0x94000001,0x63FC0000,0x94000001,0x94000001,0x3BC0000,0x63FC0000,0x94000001,0x63FC0000,0x94000001, -0x94000001,0x63FC0000,0x94000001,0x94000001,0x94000001,0x3BC0000,0x63FC0000,0x94000001,0x63FC0000,0x94000001,0x94000001,0x63FC0000,0x94000001,0x94000001,0x94000001,0x63FC0000,0x94000001,0x94000001,0x94000001,0x94000001,0x1780000,0x1400000,0x1400000,0x1F80000,0x4FFC0000,0x7BFC0000,0x94000001,0x94000001,0x1980000,0x21FC0000,0x8FF00000,0x94000001, -0x3BFC0000,0xB41EA8,0xFE980D35,0xBC8C0BE8,0x948C0BE9,0xFE74078E,0xCA6803A1,0x9A740531,0xA66406E6,0x926403AE,0x806406E6,0xFE440C13,0xD03C028A,0x9C540411,0xAE38038E,0x943C000E,0x824403AE,0x943C0BE8,0x882C0413,0x7A340533,0x6C3C0BEB,0x10C1EA8,0xD8000BFB,0x94400BE9,0xBA0007BE,0x9608038E,0x801806E6,0x9E000DE7,0x8C00040B,0x7A000497,0x6C000C0F,0xBF81EA8, -0x7800109C,0x72000DB4,0x6600126C,0x58001EAC,0xFE8C0E18,0xFCA8186E,0xFEAC19E1,0xFE5C0296,0xE23C0011,0xB23C0006,0x98440046,0x90340046,0xFE700D55,0xFE3401B7,0x9A28029D,0x7A000497,0x17C1EA8,0xDC0BE8,0xFCB802A9,0xB2B40288,0x94B40289,0xF4900372,0xB8900005,0x96980042,0x9C900372,0x8E8800A5,0x80900372,0x3440BE8,0xCC440289,0x96780289,0xB6240372,0x9444000A, 
-0x80580372,0x27FC0BE8,0x8E000301,0x7C000422,0x6C000BEB,0x3440BE8,0xCC440289,0x96780289,0xB6240372,0x9444000A,0x80580372,0x27FC0BE8,0x8E000301,0x7C000422,0x6C000BEB,0x27FC0BE8,0x8E000301,0x7C000422,0x6C000BEB,0x6C000BEB,0xFCB005BD,0xFECC0938,0xFECC094C,0xFE6C0129,0xE240000D,0xB0400006,0x96580001,0x92200022,0xFE8805E8,0xFE4800B9,0x9E180288,0x7C000422, -0x1D40BE8,0x8C0BE8,0x8C0BE8,0x8C0BE8,0x8C0BE8,0xDE680374,0xDE680374,0xDE680374,0x86680374,0x86680374,0x6A640375,0xD43C0288,0xD43C0288,0xD43C0288,0x92400009,0x92400009,0x6E4800A6,0x763C0288,0x763C0288,0x66380041,0x583C028A,0xD00BE8,0xD00BE8,0xD00BE8,0x9E000375,0x9E000375,0x6A300373,0x8000032A,0x8000032A,0x6800001A,0x5A00028A,0x1A40BE8, -0x1A40BE8,0x5C000545,0x5000056A,0x44000BEB,0xFE6804EE,0xFA8408B8,0x8C0BE8,0xFE4C00D9,0xE03C0005,0xB23C0005,0xA0440021,0x863C0002,0xFE500455,0xFE300081,0x942C028A,0x6800001A,0x1280BE8,0xB40288,0xB40288,0xB40288,0xB40288,0xB4900000,0xB4900000,0xB4900000,0x7C900000,0x7C900000,0x6A900001,0x10C0288,0x10C0288,0x10C0288,0x8A500001,0x8A500001, -0x6C6C0001,0xBF80288,0xBF80288,0x6A000001,0x5800028A,0x10C0288,0x10C0288,0x10C0288,0x8A500001,0x8A500001,0x6C6C0001,0xBF80288,0xBF80288,0x6A000001,0x5800028A,0xBF80288,0xBF80288,0x6A000001,0x5800028A,0x5800028A,0xFE9000DD,0xFCA80139,0xB40288,0xFE5C002D,0xD4480000,0xAA480000,0x96580000,0x88380000,0xFA700109,0xFA400020,0x17C0288,0x6A000001, -0x17C0288,0x1000374,0xEAD80000,0xAAD80000,0x94D80001,0x1800372,0xBA8C0001,0x94B00001,0x45F80372,0x94300001,0x80000372,0x1800372,0xBA8C0001,0x94B00001,0x45F80372,0x94300001,0x80000372,0x45F80372,0x94300001,0x80000372,0x80000372,0x1800372,0xBA8C0001,0x94B00001,0x45F80372,0x94300001,0x80000372,0x45F80372,0x94300001,0x80000372,0x80000372,0x45F80372, -0x94300001,0x80000372,0x80000372,0x80000372,0xFCCC0202,0x1140372,0xF4F80244,0xFE800071,0xEC280001,0xB4340000,0x94600001,0x94000002,0xFEAC01F9,0xFE58004A,0x9C680000,0x80000372,0x13FC0372,0x640374,0x640374,0x640374,0x640374,0x640374,0x640374,0x640374,0x640374,0x640374,0x640374,0x9C3C0000,0x9C3C0000,0x9C3C0000,0x9C3C0000,0x9C3C0000, -0x9C3C0000,0x5C3C0000,0x5C3C0000,0x5C3C0000,0x463C0001,0x980372,0x980372,0x980372,0x980372,0x980372,0x980372,0x68000011,0x68000011,0x68000011,0x46140001,0x1300372,0x1300372,0x1300372,0x3E0000E1,0x32000372,0xF45801A5,0x640374,0x640374,0xFC48003D,0xEA3C0000,0xBA3C0000,0xBA3C0000,0x843C0000,0xFC380120,0xF8280029,0x72300000,0x68000011, -0xD80372,}; -static const uint32_t g_etc1_to_bc7_m6_table140[] = { -0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0xE80000, -0xE80000,0xE80000,0xE80000,0x26000000,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x540000,0x540000,0x540000,0x740000,0xA40000,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xA00001,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000, -0xF00000,0x1E80000,0x1E80000,0x1E80000,0x50000000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0xF00000,0x1E80000,0x1E80000,0x1E80000,0x50000000,0x1E80000,0x1E80000,0x1E80000,0x50000000,0x50000000,0xAC0000,0xA00001,0xA00001,0xBC0000,0xCC0000,0xDC0000,0xDC0000,0x1100000,0xBC0000,0xCC0000,0x1580000,0x1E80000, 
-0x1580000,0xE80001,0xE80001,0xE80001,0xE80001,0x15C0000,0x15C0000,0x15C0000,0x33F80000,0x33F80000,0x74000000,0x15C0000,0x15C0000,0x15C0000,0x33F80000,0x33F80000,0x74000000,0x33F80000,0x33F80000,0x74000000,0x74000000,0x15C0000,0x15C0000,0x15C0000,0x33F80000,0x33F80000,0x74000000,0x33F80000,0x33F80000,0x74000000,0x74000000,0x33F80000, -0x33F80000,0x74000000,0x74000000,0x74000000,0x1100000,0x4F80000,0xE80001,0x33C0000,0x1880000,0x1F00000,0x17FC0000,0x43FC0000,0x1240000,0x15C0000,0x1F00000,0x74000000,0x1F00000,0x13C0001,0x1D80000,0x71F80000,0x9E000000,0x1D80000,0x71F80000,0x9E000000,0x71F80000,0x9E000000,0x9E000000,0x1D80000,0x71F80000,0x9E000000,0x71F80000,0x9E000000, -0x9E000000,0x71F80000,0x9E000000,0x9E000000,0x9E000000,0x1D80000,0x71F80000,0x9E000000,0x71F80000,0x9E000000,0x9E000000,0x71F80000,0x9E000000,0x9E000000,0x9E000000,0x71F80000,0x9E000000,0x9E000000,0x9E000000,0x9E000000,0x58C0000,0x1540000,0x1540000,0x13FC0000,0x5FF80000,0x87F80000,0x9E000000,0x9E000000,0x1B00000,0x33FC0000,0x99E40000,0x9E000000, -0x4BFC0000,0xC41EAC,0xFEA40D93,0xC4A00BEB,0x9E9C0BEB,0xFE8407F4,0xD47C03A3,0xA2840533,0xAE7806E6,0x9A7403AE,0x887806E6,0xFE5C0C49,0xDA50028A,0xA6680413,0xB84C038E,0x9E4C000E,0x8A5403AE,0x9C500BE9,0x92400411,0x82440531,0x764C0BE9,0x3241EA8,0xE8080BE9,0x9E500BE8,0xC6000736,0xA01C038E,0x882C06E6,0xAE000D2F,0x98000345,0x840003FD,0x76000BE8,0x17FC1EA8, -0x84000FB8,0x78000C98,0x6C001198,0x62001EA8,0xFEA00EBE,0xF6BC18F4,0xF8C01A44,0xFE70034E,0xF04C000E,0xBC500006,0xA2580046,0x9A480046,0xFE800E11,0xFE500235,0xA438029D,0x840003FD,0x1A41EA8,0xEC0BEB,0xFEC802CA,0xBCC4028A,0x9EC4028A,0xFCA40373,0xC4A00002,0xA0AC0041,0xA4A40373,0x989C00A6,0x88A00375,0x1600BE8,0xD4580289,0x9E8C0288,0xBE380372,0x9C540009, -0x886C0374,0x35FC0BE8,0x9A0002CA,0x840003E4,0x76000BE8,0x1600BE8,0xD4580289,0x9E8C0288,0xBE380372,0x9C540009,0x886C0374,0x35FC0BE8,0x9A0002CA,0x840003E4,0x76000BE8,0x35FC0BE8,0x9A0002CA,0x840003E4,0x76000BE8,0x76000BE8,0xFEBC0611,0xF8E00975,0xFCE8094E,0xFE800189,0xEA54000D,0xBA540005,0x9E680002,0x9A380021,0xFEA40632,0xFE5C0116,0xA62C0289,0x840003E4, -0x1F80BE8,0x9C0BEB,0x9C0BEB,0x9C0BEB,0x9C0BEB,0xE8780372,0xE8780372,0xE8780372,0x90780372,0x90780372,0x74780372,0xDA500289,0xDA500289,0xDA500289,0x9A50000A,0x9A50000A,0x785C00A5,0x804C0289,0x804C0289,0x704C0042,0x624C0289,0x2E80BE8,0x2E80BE8,0x2E80BE8,0xAA0C0372,0xAA0C0372,0x74400372,0x8C0002D9,0x8C0002D9,0x74000005,0x62140288,0x1DC0BE8, -0x1DC0BE8,0x660004D8,0x5C0004E1,0x4E000BE8,0xFE840522,0xF49808F6,0x9C0BEB,0xFE680122,0xE6500006,0xB8500005,0xA8540022,0x904C0001,0xFE6404B5,0xFE4400B9,0x9E3C0289,0x74000005,0x1500BE8,0xC4028A,0xC4028A,0xC4028A,0xC4028A,0xBCA40001,0xBCA40001,0xBCA40001,0x86A00001,0x86A00001,0x74A00001,0x3240288,0x3240288,0x3240288,0x94600001,0x94600001, -0x74800000,0x17FC0288,0x17FC0288,0x740C0000,0x62000288,0x3240288,0x3240288,0x3240288,0x94600001,0x94600001,0x74800000,0x17FC0288,0x17FC0288,0x740C0000,0x62000288,0x17FC0288,0x17FC0288,0x740C0000,0x62000288,0x62000288,0xFEA000F4,0xF6BC0152,0xC4028A,0xFE78003D,0xD8600001,0xB25C0000,0xA0680000,0x904C0000,0xF8880120,0xFC580029,0x1A40288,0x740C0000, -0x1A40288,0x1140372,0xF2EC0001,0xB2EC0001,0x9EE80001,0x3980372,0xC49C0001,0x9EC00000,0x51FC0372,0x9E3C0000,0x88000374,0x3980372,0xC49C0001,0x9EC00000,0x51FC0372,0x9E3C0000,0x88000374,0x51FC0372,0x9E3C0000,0x88000374,0x88000374,0x3980372,0xC49C0001,0x9EC00000,0x51FC0372,0x9E3C0000,0x88000374,0x51FC0372,0x9E3C0000,0x88000374,0x88000374,0x51FC0372, 
-0x9E3C0000,0x88000374,0x88000374,0x88000374,0xF6E80221,0xB240372,0xFF0C0242,0xFE9C0091,0xF43C0001,0xBE440000,0x9E700000,0x9E000000,0xF8D00200,0xFA7C0071,0xA47C0000,0x88000374,0x23FC0372,0x780372,0x780372,0x780372,0x780372,0x780372,0x780372,0x780372,0x780372,0x780372,0x780372,0xA4500001,0xA4500001,0xA4500001,0xA4500001,0xA4500001, -0xA4500001,0x64500001,0x64500001,0x64500001,0x504C0001,0x2B00372,0x2B00372,0x2B00372,0x2B00372,0x2B00372,0x2B00372,0x76000001,0x76000001,0x76000001,0x50240000,0x1680372,0x1680372,0x1680372,0x4A00009D,0x3A000374,0xFE6C01A5,0x780372,0x780372,0xFE58004A,0xF0500001,0xC0500001,0xC0500001,0x8C500001,0xF84C0139,0xFE3C0032,0x78440001,0x76000001, -0xFC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table141[] = { -0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x8C0000,0x1180000, -0x1180000,0x1180000,0x1180000,0x2E000000,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x640000,0x640000,0x640000,0x8C0000,0xC80000,0xB00001,0xB00001,0xB00001,0xB00001,0xB00001,0xB00001,0xB00001,0xB00001,0xB00001,0xB00001,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000, -0x1080000,0x9F80000,0x9F80000,0x9F80000,0x58000000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x9F80000,0x9F80000,0x9F80000,0x58000000,0x9F80000,0x9F80000,0x9F80000,0x58000000,0x58000000,0x6BC0000,0xB00001,0xB00001,0x6CC0000,0xE00000,0x2F00000,0x2F00000,0x12C0000,0x6CC0000,0xE00000,0x1780000,0x9F80000, -0x1780000,0xF80001,0xF80001,0xF80001,0xF80001,0x1740000,0x1740000,0x1740000,0x3FF80000,0x3FF80000,0x7C000000,0x1740000,0x1740000,0x1740000,0x3FF80000,0x3FF80000,0x7C000000,0x3FF80000,0x3FF80000,0x7C000000,0x7C000000,0x1740000,0x1740000,0x1740000,0x3FF80000,0x3FF80000,0x7C000000,0x3FF80000,0x3FF80000,0x7C000000,0x7C000000,0x3FF80000, -0x3FF80000,0x7C000000,0x7C000000,0x7C000000,0x7200000,0xD080000,0xF80001,0x1540000,0x1A40000,0xBFC0000,0x25FC0000,0x4FF80000,0x1380000,0x1740000,0xBFC0000,0x7C000000,0xBFC0000,0x14C0001,0x1F00000,0x7DF80000,0xA6000000,0x1F00000,0x7DF80000,0xA6000000,0x7DF80000,0xA6000000,0xA6000000,0x1F00000,0x7DF80000,0xA6000000,0x7DF80000,0xA6000000, -0xA6000000,0x7DF80000,0xA6000000,0xA6000000,0xA6000000,0x1F00000,0x7DF80000,0xA6000000,0x7DF80000,0xA6000000,0xA6000000,0x7DF80000,0xA6000000,0xA6000000,0xA6000000,0x7DF80000,0xA6000000,0xA6000000,0xA6000000,0xA6000000,0x5A00000,0x1640000,0x1640000,0x27FC0000,0x6BFC0000,0x91F80000,0xA6000000,0xA6000000,0x3C40000,0x45FC0000,0xA1F40000,0xA6000000, -0x5BFC0000,0xD41EAC,0xFEBC0E0B,0xCCB00BEB,0xA6AC0BEB,0xFE980856,0xDC8C03A3,0xAA940533,0xB68806E6,0xA28403AE,0x908806E6,0xFE740C91,0xE260028A,0xAE780413,0xC05C038E,0xA65C000E,0x926403AE,0xA4600BE9,0x9A500411,0x8A540531,0x7E5C0BE9,0x33C1EA8,0xF0180BE9,0xA6600BE8,0xD80006F6,0xA82C038E,0x903C06E6,0xBA000CAF,0xA20002CE,0x8E0003B5,0x7E100BE8,0x23FC1EA8, -0x90000EF8,0x84000BA8,0x780010D8,0x6A001EA8,0xFEB40F7B,0xFECC18F4,0xFECC1A5C,0xFE8003FA,0xF85C000E,0xC4600006,0xAA680046,0xA2580046,0xFE900EA5,0xFE6002FA,0xAC48029D,0x8E0003B5,0x1C81EA8,0xFC0BEB,0xFEDC02EB,0xC4D4028A,0xA6D4028A,0xFEB4037B,0xCCB00002,0xA8BC0041,0xACB40373,0xA0AC00A6,0x90B00375,0x1780BE8,0xDC680289,0xA69C0288,0xC6480372,0xA4640009, 
-0x907C0374,0x41FC0BE8,0xA20002AA,0x900003B4,0x7E000BE8,0x1780BE8,0xDC680289,0xA69C0288,0xC6480372,0xA4640009,0x907C0374,0x41FC0BE8,0xA20002AA,0x900003B4,0x7E000BE8,0x41FC0BE8,0xA20002AA,0x900003B4,0x7E000BE8,0x7E000BE8,0xFED00640,0xFEEC098D,0xF4F80993,0xFE9801D5,0xF264000D,0xC2640005,0xA6780002,0xA2480021,0xFEB40682,0xFC80016E,0xAE3C0289,0x900003B4, -0xFFC0BE8,0xAC0BEB,0xAC0BEB,0xAC0BEB,0xAC0BEB,0xF0880372,0xF0880372,0xF0880372,0x98880372,0x98880372,0x7C880372,0xE2600289,0xE2600289,0xE2600289,0xA260000A,0xA260000A,0x806C00A5,0x885C0289,0x885C0289,0x785C0042,0x6A5C0289,0x3000BE8,0x3000BE8,0x3000BE8,0xB21C0372,0xB21C0372,0x7C500372,0x9E0002A1,0x9E0002A1,0x7C100005,0x6A240288,0x5FC0BE8, -0x5FC0BE8,0x72000478,0x64000489,0x56000BE8,0xFE90056E,0xFCA808F6,0xAC0BEB,0xFE780156,0xEE600006,0xC0600005,0xB0640022,0x985C0001,0xFE7804E6,0xFE5C00F5,0xA64C0289,0x7C100005,0x1700BE8,0xD4028A,0xD4028A,0xD4028A,0xD4028A,0xC4B40001,0xC4B40001,0xC4B40001,0x8EB00001,0x8EB00001,0x7CB00001,0x33C0288,0x33C0288,0x33C0288,0x9C700001,0x9C700001, -0x7C900000,0x23FC0288,0x23FC0288,0x7C1C0000,0x6A000288,0x33C0288,0x33C0288,0x33C0288,0x9C700001,0x9C700001,0x7C900000,0x23FC0288,0x23FC0288,0x7C1C0000,0x6A000288,0x23FC0288,0x23FC0288,0x7C1C0000,0x6A000288,0x6A000288,0xFAB40109,0xFECC0152,0xD4028A,0xFE88004A,0xE0700001,0xBA6C0000,0xA8780000,0x985C0000,0xFE940128,0xFE680034,0x1C80288,0x7C1C0000, -0x1C80288,0x1240372,0xFAFC0001,0xBAFC0001,0xA6F80001,0x3B00372,0xCCAC0001,0xA6D00000,0x5DFC0372,0xA64C0000,0x90000374,0x3B00372,0xCCAC0001,0xA6D00000,0x5DFC0372,0xA64C0000,0x90000374,0x5DFC0372,0xA64C0000,0x90000374,0x90000374,0x3B00372,0xCCAC0001,0xA6D00000,0x5DFC0372,0xA64C0000,0x90000374,0x5DFC0372,0xA64C0000,0x90000374,0x90000374,0x5DFC0372, -0xA64C0000,0x90000374,0x90000374,0x90000374,0xFEF80221,0x1380372,0xF71C0265,0xFEB000AA,0xFC4C0001,0xC6540000,0xA6800000,0xA6100000,0xFEDC0208,0xFE940080,0xAC8C0000,0x90000374,0x33FC0372,0x880372,0x880372,0x880372,0x880372,0x880372,0x880372,0x880372,0x880372,0x880372,0x880372,0xAC600001,0xAC600001,0xAC600001,0xAC600001,0xAC600001, -0xAC600001,0x6C600001,0x6C600001,0x6C600001,0x585C0001,0xC80372,0xC80372,0xC80372,0xC80372,0xC80372,0xC80372,0x7E100001,0x7E100001,0x7E100001,0x58340000,0x1980372,0x1980372,0x1980372,0x50000071,0x42000374,0xF67C01C2,0x880372,0x880372,0xFE680059,0xF8600001,0xC8600001,0xC8600001,0x94600001,0xFE580145,0xFE50003D,0x80540001,0x7E100001, -0x1200372,}; -static const uint32_t g_etc1_to_bc7_m6_table142[] = { -0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0x14C0000, -0x14C0000,0x14C0000,0x14C0000,0x36000000,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0x2740000,0x2740000,0x2740000,0xA40000,0xE80000,0xC00001,0xC00001,0xC00001,0xC00001,0xC00001,0xC00001,0xC00001,0xC00001,0xC00001,0xC00001,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000, -0x1200000,0x15F80000,0x15F80000,0x15F80000,0x60000000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x1200000,0x15F80000,0x15F80000,0x15F80000,0x60000000,0x15F80000,0x15F80000,0x15F80000,0x60000000,0x60000000,0xECC0000,0xC00001,0xC00001,0x2E00000,0xF40000,0x1080000,0x1080000,0x1440000,0x2E00000,0xF40000,0x19C0000,0x15F80000, 
-0x19C0000,0x1080001,0x1080001,0x1080001,0x1080001,0x18C0000,0x18C0000,0x18C0000,0x4BF80000,0x4BF80000,0x84000000,0x18C0000,0x18C0000,0x18C0000,0x4BF80000,0x4BF80000,0x84000000,0x4BF80000,0x4BF80000,0x84000000,0x84000000,0x18C0000,0x18C0000,0x18C0000,0x4BF80000,0x4BF80000,0x84000000,0x4BF80000,0x4BF80000,0x84000000,0x84000000,0x4BF80000, -0x4BF80000,0x84000000,0x84000000,0x84000000,0x3340000,0x11C0000,0x1080001,0x3680000,0x1C00000,0x1BFC0000,0x33F80000,0x59FC0000,0x34C0000,0x18C0000,0x1BFC0000,0x84000000,0x1BFC0000,0x15C0001,0xDFC0000,0x89F80000,0xAE000000,0xDFC0000,0x89F80000,0xAE000000,0x89F80000,0xAE000000,0xAE000000,0xDFC0000,0x89F80000,0xAE000000,0x89F80000,0xAE000000, -0xAE000000,0x89F80000,0xAE000000,0xAE000000,0xAE000000,0xDFC0000,0x89F80000,0xAE000000,0x89F80000,0xAE000000,0xAE000000,0x89F80000,0xAE000000,0xAE000000,0xAE000000,0x89F80000,0xAE000000,0xAE000000,0xAE000000,0xAE000000,0x5B40000,0x3740000,0x3740000,0x3BFC0000,0x79FC0000,0x9BF80000,0xAE000000,0xAE000000,0x1DC0000,0x55FC0000,0xABC80000,0xAE000000, -0x69FC0000,0xE41EAC,0xFEC80E73,0xD4C00BEB,0xAEBC0BEB,0xFEA808EC,0xE49C03A3,0xB2A40533,0xBE9806E6,0xAA9403AE,0x989806E6,0xFE840CF4,0xEA70028A,0xB6880413,0xC86C038E,0xAE6C000E,0x9A7403AE,0xAC700BE9,0xA2600411,0x92640531,0x866C0BE9,0x1541EA8,0xF8280BE9,0xAE700BE8,0xE20406E6,0xB03C038E,0x984C06E6,0xC6000C4F,0xAC000292,0x980403A1,0x86200BE8,0x2FFC1EA8, -0x9C000E58,0x8A000AE8,0x7E00102C,0x72001EA8,0xFEBC1022,0xF6DC197E,0xF8E01AAF,0xFE9804BE,0xFA70000F,0xCC700006,0xB2780046,0xAA680046,0xFEA40F96,0xFE7403B7,0xB458029D,0x980403A1,0x1E81EA8,0x10C0BEB,0xFEEC032A,0xCCE4028A,0xAEE4028A,0xFEC8038D,0xD4C00002,0xB0CC0041,0xB4C40373,0xA8BC00A6,0x98C00375,0x1900BE8,0xE4780289,0xAEAC0288,0xCE580372,0xAC740009, -0x988C0374,0x4DFC0BE8,0xAC00028E,0x96000394,0x86000BE8,0x1900BE8,0xE4780289,0xAEAC0288,0xCE580372,0xAC740009,0x988C0374,0x4DFC0BE8,0xAC00028E,0x96000394,0x86000BE8,0x4DFC0BE8,0xAC00028E,0x96000394,0x86000BE8,0x86000BE8,0xFEE406A5,0xFB0409B3,0xFD080993,0xFEB0022E,0xFA74000D,0xCA740005,0xAE880002,0xAA580021,0xFAD406D1,0xFE9001B2,0xB64C0289,0x96000394, -0x1FF80BE8,0xBC0BEB,0xBC0BEB,0xBC0BEB,0xBC0BEB,0xF8980372,0xF8980372,0xF8980372,0xA0980372,0xA0980372,0x84980372,0xEA700289,0xEA700289,0xEA700289,0xAA70000A,0xAA70000A,0x887C00A5,0x906C0289,0x906C0289,0x806C0042,0x726C0289,0x3180BE8,0x3180BE8,0x3180BE8,0xBA2C0372,0xBA2C0372,0x84600372,0xAA000289,0xAA000289,0x84200005,0x72340288,0x11FC0BE8, -0x11FC0BE8,0x7C000432,0x6C000414,0x5E000BE8,0xFEA005A5,0xF4B80933,0xBC0BEB,0xFE880193,0xF6700006,0xC8700005,0xB8740022,0xA06C0001,0xFC8C0549,0xFE680138,0xAE5C0289,0x84200005,0x1940BE8,0xE4028A,0xE4028A,0xE4028A,0xE4028A,0xCCC40001,0xCCC40001,0xCCC40001,0x96C00001,0x96C00001,0x84C00001,0x1540288,0x1540288,0x1540288,0xA4800001,0xA4800001, -0x84A00000,0x2FFC0288,0x2FFC0288,0x842C0000,0x72000288,0x1540288,0x1540288,0x1540288,0xA4800001,0xA4800001,0x84A00000,0x2FFC0288,0x2FFC0288,0x842C0000,0x72000288,0x2FFC0288,0x2FFC0288,0x842C0000,0x72000288,0x72000288,0xF6C80120,0xF6DC016D,0xE4028A,0xFE980061,0xE8800001,0xC27C0000,0xB0880000,0xA06C0000,0xFAAC0139,0xFE800048,0x1E80288,0x842C0000, -0x1E80288,0x1340372,0xFF0C0002,0xC30C0001,0xAF080001,0x1C80372,0xD4BC0001,0xAEE00000,0x69FC0372,0xAE5C0000,0x98000374,0x1C80372,0xD4BC0001,0xAEE00000,0x69FC0372,0xAE5C0000,0x98000374,0x69FC0372,0xAE5C0000,0x98000374,0x98000374,0x1C80372,0xD4BC0001,0xAEE00000,0x69FC0372,0xAE5C0000,0x98000374,0x69FC0372,0xAE5C0000,0x98000374,0x98000374,0x69FC0372, 
-0xAE5C0000,0x98000374,0x98000374,0x98000374,0xFB0C0242,0x1480372,0xFF2C0265,0xFEC800D0,0xFE680005,0xCE640000,0xAE900000,0xAE200000,0xFEF0022D,0xFEAC00A4,0xB49C0000,0x98000374,0x41FC0372,0x980372,0x980372,0x980372,0x980372,0x980372,0x980372,0x980372,0x980372,0x980372,0x980372,0xB4700001,0xB4700001,0xB4700001,0xB4700001,0xB4700001, -0xB4700001,0x74700001,0x74700001,0x74700001,0x606C0001,0xE00372,0xE00372,0xE00372,0xE00372,0xE00372,0xE00372,0x86200001,0x86200001,0x86200001,0x60440000,0x1CC0372,0x1CC0372,0x1CC0372,0x5C000041,0x4A000374,0xFE8C01C2,0x980372,0x980372,0xFA7C0071,0xFA700002,0xD0700001,0xD0700001,0x9C700001,0xFC700152,0xFC600055,0x88640001,0x86200001, -0x1400372,}; -static const uint32_t g_etc1_to_bc7_m6_table143[] = { -0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0xBC0000,0x17C0000, -0x17C0000,0x17C0000,0x17C0000,0x3E000000,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0xA840000,0xA840000,0xA840000,0xBC0000,0x10C0000,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000, -0x1380000,0x21F80000,0x21F80000,0x21F80000,0x68000000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x21F80000,0x21F80000,0x21F80000,0x68000000,0x21F80000,0x21F80000,0x21F80000,0x68000000,0x68000000,0xE00000,0xD00001,0xD00001,0xF40000,0x1080000,0x11C0000,0x11C0000,0x1600000,0xF40000,0x1080000,0x1BC0000,0x21F80000, -0x1BC0000,0x1180001,0x1180001,0x1180001,0x1180001,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x8C000000,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x8C000000,0x57F80000,0x57F80000,0x8C000000,0x8C000000,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x8C000000,0x57F80000,0x57F80000,0x8C000000,0x8C000000,0x57F80000, -0x57F80000,0x8C000000,0x8C000000,0x8C000000,0x1480000,0x12C0000,0x1180001,0x1800000,0x1D80000,0x29FC0000,0x3FFC0000,0x65F40000,0x3600000,0x1A40000,0x29FC0000,0x8C000000,0x29FC0000,0x16C0001,0x25FC0000,0x95F80000,0xB6000000,0x25FC0000,0x95F80000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0x25FC0000,0x95F80000,0xB6000000,0x95F80000,0xB6000000, -0xB6000000,0x95F80000,0xB6000000,0xB6000000,0xB6000000,0x25FC0000,0x95F80000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0xB6000000,0xB6000000,0x1CC0000,0xB840000,0xB840000,0x4FFC0000,0x87F80000,0xA5F80000,0xB6000000,0xB6000000,0x3F00000,0x67FC0000,0xB3D80000,0xB6000000, -0x79FC0000,0xF41EAC,0xFEDC0EEC,0xDCD00BEB,0xB6CC0BEB,0xFEBC096E,0xECAC03A3,0xBAB40533,0xC6A806E6,0xB2A403AE,0xA0A806E6,0xFE980D51,0xF280028A,0xBE980413,0xD07C038E,0xB67C000E,0xA28403AE,0xB4800BE9,0xAA700411,0x9A740531,0x8E7C0BE9,0x16C1EA8,0xFE380BEF,0xB6800BE8,0xEA1406E6,0xB84C038E,0xA05C06E6,0xD2000C0F,0xB608028A,0xA01403A1,0x8E300BE8,0x3BFC1EA8, -0xA6000DDB,0x96000A18,0x8A000F9C,0x7A001EA8,0xFED010D1,0xFEEC197E,0xFEEC1ACB,0xFEAC059E,0xFE840027,0xD4800006,0xBA880046,0xB2780046,0xFEB4101B,0xFE900462,0xBC68029D,0xA01403A1,0x7FC1EA8,0x11C0BEB,0xFF000363,0xD4F4028A,0xB6F4028A,0xFEDC03AB,0xDCD00002,0xB8DC0041,0xBCD40373,0xB0CC00A6,0xA0D00375,0x1A80BE8,0xEC880289,0xB6BC0288,0xD6680372,0xB4840009, 
-0xA09C0374,0x59FC0BE8,0xB6000288,0xA000037D,0x8E000BE8,0x1A80BE8,0xEC880289,0xB6BC0288,0xD6680372,0xB4840009,0xA09C0374,0x59FC0BE8,0xB6000288,0xA000037D,0x8E000BE8,0x59FC0BE8,0xB6000288,0xA000037D,0x8E000BE8,0x8E000BE8,0xFEF806DA,0xFF0C09DB,0xF51809DA,0xFEC00296,0xFE88001A,0xD2840005,0xB6980002,0xB2680021,0xFEDC06F9,0xFEA40224,0xBE5C0289,0xA000037D, -0x2DFC0BE8,0xCC0BEB,0xCC0BEB,0xCC0BEB,0xCC0BEB,0xFEA80373,0xFEA80373,0xFEA80373,0xA8A80372,0xA8A80372,0x8CA80372,0xF2800289,0xF2800289,0xF2800289,0xB280000A,0xB280000A,0x908C00A5,0x987C0289,0x987C0289,0x887C0042,0x7A7C0289,0x3300BE8,0x3300BE8,0x3300BE8,0xC23C0372,0xC23C0372,0x8C700372,0xB2100289,0xB2100289,0x8C300005,0x7A440288,0x1DFC0BE8, -0x1DFC0BE8,0x840003E4,0x760003C9,0x66000BE8,0xFCB805F6,0xFCC80933,0xCC0BEB,0xFE9801DA,0xFE800006,0xD0800005,0xC0840022,0xA87C0001,0xFC9C0581,0xFE800171,0xB66C0289,0x8C300005,0x1B40BE8,0xF4028A,0xF4028A,0xF4028A,0xF4028A,0xD4D40001,0xD4D40001,0xD4D40001,0x9ED00001,0x9ED00001,0x8CD00001,0x16C0288,0x16C0288,0x16C0288,0xAC900001,0xAC900001, -0x8CB00000,0x3BFC0288,0x3BFC0288,0x8C3C0000,0x7A000288,0x16C0288,0x16C0288,0x16C0288,0xAC900001,0xAC900001,0x8CB00000,0x3BFC0288,0x3BFC0288,0x8C3C0000,0x7A000288,0x3BFC0288,0x3BFC0288,0x8C3C0000,0x7A000288,0x7A000288,0xFED80120,0xFEEC016D,0xF4028A,0xFEB40071,0xF0900001,0xCA8C0000,0xB8980000,0xA87C0000,0xFCC00152,0xFE980055,0x7FC0288,0x8C3C0000, -0x7FC0288,0x1440372,0xFF1C0011,0xCB1C0001,0xB7180001,0x1E00372,0xDCCC0001,0xB6F00000,0x75FC0372,0xB66C0000,0xA0000374,0x1E00372,0xDCCC0001,0xB6F00000,0x75FC0372,0xB66C0000,0xA0000374,0x75FC0372,0xB66C0000,0xA0000374,0xA0000374,0x1E00372,0xDCCC0001,0xB6F00000,0x75FC0372,0xB66C0000,0xA0000374,0x75FC0372,0xB66C0000,0xA0000374,0xA0000374,0x75FC0372, -0xB66C0000,0xA0000374,0xA0000374,0xA0000374,0xFF140262,0x5580372,0xF73C028A,0xFCE800F2,0xFC8C0019,0xD6740000,0xB6A00000,0xB6300000,0xFD0C0242,0xFEC000C1,0xBCAC0000,0xA0000374,0x51FC0372,0xA80372,0xA80372,0xA80372,0xA80372,0xA80372,0xA80372,0xA80372,0xA80372,0xA80372,0xA80372,0xBC800001,0xBC800001,0xBC800001,0xBC800001,0xBC800001, -0xBC800001,0x7C800001,0x7C800001,0x7C800001,0x687C0001,0xF80372,0xF80372,0xF80372,0xF80372,0xF80372,0xF80372,0x8E300001,0x8E300001,0x8E300001,0x68540000,0x1FC0372,0x1FC0372,0x1FC0372,0x66000028,0x52000374,0xF69C01E1,0xA80372,0xA80372,0xFC8C0082,0xFE800005,0xD8800001,0xD8800001,0xA4800001,0xF884016D,0xFC740062,0x90740001,0x8E300001, -0x1640372,}; -static const uint32_t g_etc1_to_bc7_m6_table144[] = { -0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x1B00000, -0x1B00000,0x1B00000,0x1B00000,0x46000001,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x4980000,0x4980000,0x4980000,0xD40000,0x1300000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0xE40000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000, -0x3500000,0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x3500000,0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0x2DFC0000,0x2DFC0000,0x2DFC0000,0x70000001,0x70000001,0xF40000,0xE40000,0xE40000,0x1080000,0x31C0000,0x3340000,0x3340000,0x1800000,0x1080000,0x31C0000,0x1E40000,0x2DFC0000, 
-0x1E40000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x3BC0000,0x3BC0000,0x3BC0000,0x63FC0000,0x63FC0000,0x94000001,0x3BC0000,0x3BC0000,0x3BC0000,0x63FC0000,0x63FC0000,0x94000001,0x63FC0000,0x63FC0000,0x94000001,0x94000001,0x3BC0000,0x3BC0000,0x3BC0000,0x63FC0000,0x63FC0000,0x94000001,0x63FC0000,0x63FC0000,0x94000001,0x94000001,0x63FC0000, -0x63FC0000,0x94000001,0x94000001,0x94000001,0x15C0000,0x1400000,0x12C0000,0x1980000,0x1F80000,0x3BFC0000,0x4FFC0000,0x71F80000,0x1780000,0x3BC0000,0x3BFC0000,0x94000001,0x3BFC0000,0x1800000,0x41FC0000,0xA1FC0000,0xBE000001,0x41FC0000,0xA1FC0000,0xBE000001,0xA1FC0000,0xBE000001,0xBE000001,0x41FC0000,0xA1FC0000,0xBE000001,0xA1FC0000,0xBE000001, -0xBE000001,0xA1FC0000,0xBE000001,0xBE000001,0xBE000001,0x41FC0000,0xA1FC0000,0xBE000001,0xA1FC0000,0xBE000001,0xBE000001,0xA1FC0000,0xBE000001,0xBE000001,0xBE000001,0xA1FC0000,0xBE000001,0xBE000001,0xBE000001,0xBE000001,0x3E00000,0x5980000,0x5980000,0x65FC0000,0x95FC0000,0xB1F40000,0xBE000001,0xBE000001,0x13FC0000,0x79FC0000,0xBDCC0000,0xBE000001, -0x89FC0000,0x1081EA8,0xFEE80F9C,0xE6E00BE8,0xBEE00BE9,0xFED00A18,0xF4BC03A1,0xC4C80531,0xD0B806E6,0xBCB803AE,0xAAB806E6,0xFEB00DDB,0xFA90028A,0xC6A80411,0xD88C038E,0xBE90000E,0xAC9803AE,0xBE900BE8,0xB2800413,0xA4880533,0x96900BEB,0x1881EA8,0xFE580C0F,0xBE940BE9,0xF22806E6,0xC05C038E,0xAA6C06E6,0xE0000BEF,0xBE18028A,0xA82403A3,0x96440BEB,0x49F81EA8, -0xB2000D51,0xA000096E,0x90000EEC,0x82001EAC,0xFEE41196,0xF9001A08,0xFB041B18,0xFEC006A1,0xFE9C0075,0xDC900006,0xC2980046,0xBA880046,0xFED410EA,0xFEA40584,0xC47C029D,0xA82403A3,0x19FC1EA8,0x1300BE8,0xFF1003C9,0xDD080288,0xBF080289,0xFEF403E4,0xE2E40005,0xC0EC0042,0xC6E40372,0xB8DC00A5,0xAAE40372,0x1C40BE8,0xF6980289,0xC0CC0289,0xE0780372,0xBE98000A, -0xAAAC0372,0x67F80BE8,0xBE140289,0xAA000373,0x96000BEB,0x1C40BE8,0xF6980289,0xC0CC0289,0xE0780372,0xBE98000A,0xAAAC0372,0x67F80BE8,0xBE140289,0xAA000373,0x96000BEB,0x67F80BE8,0xBE140289,0xAA000373,0x96000BEB,0x96000BEB,0xFD10074D,0xFD2809F6,0xFF2C09D9,0xFEDC0312,0xFEA4003E,0xDA940006,0xC0AC0001,0xBC740022,0xFEF80752,0xFEC00281,0xC86C0288,0xAA000373, -0x3FF80BE8,0xE00BE8,0xE00BE8,0xE00BE8,0xE00BE8,0xFEBC037D,0xFEBC037D,0xFEBC037D,0xB0BC0374,0xB0BC0374,0x94B80375,0xFE900288,0xFE900288,0xFE900288,0xBC940009,0xBC940009,0x989C00A6,0xA0900288,0xA0900288,0x908C0041,0x8290028A,0x14C0BE8,0x14C0BE8,0x14C0BE8,0xCA500372,0xCA500372,0x94840373,0xBA240289,0xBA240289,0x96440002,0x8454028A,0x2BF80BE8, -0x2BF80BE8,0x900003AB,0x7E000363,0x6E000BEB,0xFECC062C,0xF4D80975,0xE00BE8,0xFEAC023D,0xFE940011,0xDC900005,0xCA980021,0xB0900002,0xFEB405B5,0xFE9801C6,0xBE80028A,0x96440002,0x1DC0BE8,0x1080288,0x1080288,0x1080288,0x1080288,0xDEE40000,0xDEE40000,0xDEE40000,0xA6E40000,0xA6E40000,0x94E40001,0x1880288,0x1880288,0x1880288,0xB4A40001,0xB4A40001, -0x96C00001,0x49F80288,0x49F80288,0x94540001,0x8200028A,0x1880288,0x1880288,0x1880288,0xB4A40001,0xB4A40001,0x96C00001,0x49F80288,0x49F80288,0x94540001,0x8200028A,0x49F80288,0x49F80288,0x94540001,0x8200028A,0x8200028A,0xFAEC0139,0xF9000188,0x1080288,0xFEC40088,0xFE9C0000,0xD49C0000,0xC0AC0000,0xB28C0000,0xFED0015A,0xFEB00071,0x19FC0288,0x94540001, -0x19FC0288,0x1540374,0xFF300028,0xD52C0000,0xBF2C0001,0x1FC0372,0xE4E00001,0xBF040001,0x83F80372,0xBE840001,0xAA000372,0x1FC0372,0xE4E00001,0xBF040001,0x83F80372,0xBE840001,0xAA000372,0x83F80372,0xBE840001,0xAA000372,0xAA000372,0x1FC0372,0xE4E00001,0xBF040001,0x83F80372,0xBE840001,0xAA000372,0x83F80372,0xBE840001,0xAA000372,0xAA000372,0x83F80372, 
-0xBE840001,0xAA000372,0xAA000372,0xAA000372,0xFF340265,0x16C0372,0xFF4C0290,0xFF000120,0xFEA8002D,0xDE880000,0xBEB40001,0xBE480001,0xFF180262,0xFEE400F4,0xC6BC0000,0xAA000372,0x61FC0372,0xB80374,0xB80374,0xB80374,0xB80374,0xB80374,0xB80374,0xB80374,0xB80374,0xB80374,0xB80374,0xC6900000,0xC6900000,0xC6900000,0xC6900000,0xC6900000, -0xC6900000,0x86900000,0x86900000,0x86900000,0x70900001,0x1140372,0x1140372,0x1140372,0x1140372,0x1140372,0x1140372,0x96440001,0x96440001,0x96440001,0x70680001,0xFF80372,0xFF80372,0xFF80372,0x70000011,0x5C000372,0xFEAC01E5,0xB80374,0xB80374,0xFEA00091,0xFE94000D,0xE4900000,0xE4900000,0xAE900000,0xFE900179,0xFC880080,0x9C840000,0x96440001, -0x18C0372,}; -static const uint32_t g_etc1_to_bc7_m6_table145[] = { -0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0x1E40000, -0x1E40000,0x1E40000,0x1E40000,0x4E000001,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xA00000,0xCA80000,0xCA80000,0xCA80000,0xEC0000,0x1540000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0x3680000,0x3680000,0x3680000,0x3680000,0x3680000, -0x3680000,0x39FC0000,0x39FC0000,0x39FC0000,0x78000001,0x3680000,0x3680000,0x3680000,0x3680000,0x3680000,0x3680000,0x39FC0000,0x39FC0000,0x39FC0000,0x78000001,0x39FC0000,0x39FC0000,0x39FC0000,0x78000001,0x78000001,0x1040000,0xF40000,0xF40000,0x11C0000,0x3300000,0x14C0000,0x14C0000,0x1980000,0x11C0000,0x3300000,0x5FC0000,0x39FC0000, -0x5FC0000,0x13C0000,0x13C0000,0x13C0000,0x13C0000,0x1D40000,0x1D40000,0x1D40000,0x6FFC0000,0x6FFC0000,0x9C000001,0x1D40000,0x1D40000,0x1D40000,0x6FFC0000,0x6FFC0000,0x9C000001,0x6FFC0000,0x6FFC0000,0x9C000001,0x9C000001,0x1D40000,0x1D40000,0x1D40000,0x6FFC0000,0x6FFC0000,0x9C000001,0x6FFC0000,0x6FFC0000,0x9C000001,0x9C000001,0x6FFC0000, -0x6FFC0000,0x9C000001,0x9C000001,0x9C000001,0x1700000,0x1500000,0x13C0000,0x3AC0000,0x11FC0000,0x49FC0000,0x5DF80000,0x7BFC0000,0x18C0000,0x1D40000,0x49FC0000,0x9C000001,0x49FC0000,0x1900000,0x59FC0000,0xADFC0000,0xC6000001,0x59FC0000,0xADFC0000,0xC6000001,0xADFC0000,0xC6000001,0xC6000001,0x59FC0000,0xADFC0000,0xC6000001,0xADFC0000,0xC6000001, -0xC6000001,0xADFC0000,0xC6000001,0xC6000001,0xC6000001,0x59FC0000,0xADFC0000,0xC6000001,0xADFC0000,0xC6000001,0xC6000001,0xADFC0000,0xC6000001,0xC6000001,0xC6000001,0xADFC0000,0xC6000001,0xC6000001,0xC6000001,0xC6000001,0x3F40000,0xDA80000,0xDA80000,0x79FC0000,0xA3FC0000,0xBBF40000,0xC6000001,0xC6000001,0x31FC0000,0x8BFC0000,0xC5DC0000,0xC6000001, -0x99FC0000,0x1181EA8,0xFF00102C,0xEEF00BE8,0xC6F00BE9,0xFEE80AE8,0xFCCC03A1,0xCCD80531,0xD8C806E6,0xC4C803AE,0xB2C806E6,0xFEC40E58,0xFEA40292,0xCEB80411,0xE09C038E,0xC6A0000E,0xB4A803AE,0xC6A00BE8,0xBA900413,0xAC980533,0x9EA00BEB,0x1A01EA8,0xFE700C4F,0xC6A40BE9,0xFA3806E6,0xC86C038E,0xB27C06E6,0xEA0C0BE9,0xC628028A,0xB03403A3,0x9E540BEB,0x55F81EA8, -0xBC000CF4,0xAA0008EC,0x9A000E73,0x8A001EAC,0xFEF81255,0xFF0C1A18,0xFF0C1B58,0xFED4078E,0xFEB000ED,0xE4A00006,0xCAA80046,0xC2980046,0xFEDC11C3,0xFEC00675,0xCC8C029D,0xB03403A3,0x27FC1EA8,0x1400BE8,0xFF240414,0xE5180288,0xC7180289,0xFF040432,0xEAF40005,0xC8FC0042,0xCEF40372,0xC0EC00A5,0xB2F40372,0x1DC0BE8,0xFEA80289,0xC8DC0289,0xE8880372,0xC6A8000A, 
-0xB2BC0372,0x73F80BE8,0xC6240289,0xB2080372,0x9E000BEB,0x1DC0BE8,0xFEA80289,0xC8DC0289,0xE8880372,0xC6A8000A,0xB2BC0372,0x73F80BE8,0xC6240289,0xB2080372,0x9E000BEB,0x73F80BE8,0xC6240289,0xB2080372,0x9E000BEB,0x9E000BEB,0xFF20078E,0xF5380A38,0xF73C0A20,0xFEEC0395,0xFEBC007E,0xE2A40006,0xC8BC0001,0xC4840022,0xFF0807A3,0xFEDC0305,0xD07C0288,0xB2080372, -0x4DFC0BE8,0xF00BE8,0xF00BE8,0xF00BE8,0xF00BE8,0xFED00394,0xFED00394,0xFED00394,0xB8CC0374,0xB8CC0374,0x9CC80375,0xFEA4028E,0xFEA4028E,0xFEA4028E,0xC4A40009,0xC4A40009,0xA0AC00A6,0xA8A00288,0xA8A00288,0x989C0041,0x8AA0028A,0x1640BE8,0x1640BE8,0x1640BE8,0xD2600372,0xD2600372,0x9C940373,0xC2340289,0xC2340289,0x9E540002,0x8C64028A,0x37F80BE8, -0x37F80BE8,0x9A00038D,0x8800032A,0x76000BEB,0xFED80672,0xFCE80975,0xF00BE8,0xFEC4028C,0xFEAC0031,0xE4A00005,0xD2A80021,0xB8A00002,0xFEBC061A,0xFCAC0225,0xC690028A,0x9E540002,0x1FC0BE8,0x1180288,0x1180288,0x1180288,0x1180288,0xE6F40000,0xE6F40000,0xE6F40000,0xAEF40000,0xAEF40000,0x9CF40001,0x1A00288,0x1A00288,0x1A00288,0xBCB40001,0xBCB40001, -0x9ED00001,0x55F80288,0x55F80288,0x9C640001,0x8A00028A,0x1A00288,0x1A00288,0x1A00288,0xBCB40001,0xBCB40001,0x9ED00001,0x55F80288,0x55F80288,0x9C640001,0x8A00028A,0x55F80288,0x55F80288,0x9C640001,0x8A00028A,0x8A00028A,0xF7000152,0xFF0C0190,0x1180288,0xFEDC0091,0xFEB00004,0xDCAC0000,0xC8BC0000,0xBA9C0000,0xF8EC016D,0xFEC40080,0x27FC0288,0x9C640001, -0x27FC0288,0x1640374,0xFF440041,0xDD3C0000,0xC73C0001,0x19FC0372,0xECF00001,0xC7140001,0x8FF80372,0xC6940001,0xB2000372,0x19FC0372,0xECF00001,0xC7140001,0x8FF80372,0xC6940001,0xB2000372,0x8FF80372,0xC6940001,0xB2000372,0xB2000372,0x19FC0372,0xECF00001,0xC7140001,0x8FF80372,0xC6940001,0xB2000372,0x8FF80372,0xC6940001,0xB2000372,0xB2000372,0x8FF80372, -0xC6940001,0xB2000372,0xB2000372,0xB2000372,0xFB480288,0x77C0372,0xF96002AD,0xFF14013D,0xFECC0055,0xE6980000,0xC6C40001,0xC6580001,0xF1400288,0xFD040120,0xCECC0000,0xB2000372,0x71FC0372,0xC80374,0xC80374,0xC80374,0xC80374,0xC80374,0xC80374,0xC80374,0xC80374,0xC80374,0xC80374,0xCEA00000,0xCEA00000,0xCEA00000,0xCEA00000,0xCEA00000, -0xCEA00000,0x8EA00000,0x8EA00000,0x8EA00000,0x78A00001,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x9E540001,0x9E540001,0x9E540001,0x78780001,0x1BF80372,0x1BF80372,0x1BF80372,0x78000002,0x64000372,0xF8C00200,0xC80374,0xC80374,0xFEAC00B4,0xFEA80019,0xECA00000,0xECA00000,0xB6A00000,0xFEA0019A,0xFC9C0091,0xA4940000,0x9E540001, -0x1AC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table146[] = { -0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x7FC0000, -0x7FC0000,0x7FC0000,0x7FC0000,0x56000001,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xBC0000,0xBC0000,0xBC0000,0x1040000,0x1740000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x1040000,0x3800000,0x3800000,0x3800000,0x3800000,0x3800000, -0x3800000,0x45FC0000,0x45FC0000,0x45FC0000,0x80000001,0x3800000,0x3800000,0x3800000,0x3800000,0x3800000,0x3800000,0x45FC0000,0x45FC0000,0x45FC0000,0x80000001,0x45FC0000,0x45FC0000,0x45FC0000,0x80000001,0x80000001,0x3140000,0x1040000,0x1040000,0x52C0000,0x3440000,0x1600000,0x1600000,0x1B40000,0x52C0000,0x3440000,0x15FC0000,0x45FC0000, 
-0x15FC0000,0x14C0000,0x14C0000,0x14C0000,0x14C0000,0x1EC0000,0x1EC0000,0x1EC0000,0x7BFC0000,0x7BFC0000,0xA4000001,0x1EC0000,0x1EC0000,0x1EC0000,0x7BFC0000,0x7BFC0000,0xA4000001,0x7BFC0000,0x7BFC0000,0xA4000001,0xA4000001,0x1EC0000,0x1EC0000,0x1EC0000,0x7BFC0000,0x7BFC0000,0xA4000001,0x7BFC0000,0x7BFC0000,0xA4000001,0xA4000001,0x7BFC0000, -0x7BFC0000,0xA4000001,0xA4000001,0xA4000001,0x5800000,0x9600000,0x14C0000,0x1C40000,0x25FC0000,0x59FC0000,0x69FC0000,0x87F40000,0x1A00000,0x1EC0000,0x59FC0000,0xA4000001,0x59FC0000,0x1A00000,0x71FC0000,0xB9FC0000,0xCE000001,0x71FC0000,0xB9FC0000,0xCE000001,0xB9FC0000,0xCE000001,0xCE000001,0x71FC0000,0xB9FC0000,0xCE000001,0xB9FC0000,0xCE000001, -0xCE000001,0xB9FC0000,0xCE000001,0xCE000001,0xCE000001,0x71FC0000,0xB9FC0000,0xCE000001,0xB9FC0000,0xCE000001,0xCE000001,0xB9FC0000,0xCE000001,0xCE000001,0xCE000001,0xB9FC0000,0xCE000001,0xCE000001,0xCE000001,0xCE000001,0x1BFC0000,0x1BC0000,0x1BC0000,0x8DFC0000,0xB1FC0000,0xC5F40000,0xCE000001,0xCE000001,0x4FFC0000,0x9BFC0000,0xCDEC0000,0xCE000001, -0xA7FC0000,0x1281EA8,0xFF0C10D8,0xF7000BE8,0xCF000BE9,0xFEF40BA8,0xFEDC03B5,0xD4E80531,0xE0D806E6,0xCCD803AE,0xBAD806E6,0xFEDC0EF8,0xFEB802CE,0xD6C80411,0xE8AC038E,0xCEB0000E,0xBCB803AE,0xCEB00BE8,0xC2A00413,0xB4A80533,0xA6B00BEB,0x1B81EA8,0xFE880CAF,0xCEB40BE9,0xFE4C06F6,0xD07C038E,0xBA8C06E6,0xF21C0BE9,0xCE38028A,0xB84403A3,0xA6640BEB,0x61F81EA8, -0xC4000C91,0xB2000856,0xA0000E0B,0x92001EAC,0xFF0C1316,0xF9201A96,0xFB241B85,0xFEEC08C9,0xFEC40195,0xECB00006,0xD2B80046,0xCAA80046,0xFEF81262,0xFED0075B,0xD49C029D,0xB84403A3,0x37FC1EA8,0x1500BE8,0xFF340489,0xED280288,0xCF280289,0xFF180478,0xF3040005,0xD10C0042,0xD7040372,0xC8FC00A5,0xBB040372,0x1F40BE8,0xFEC002A1,0xD0EC0289,0xF0980372,0xCEB8000A, -0xBACC0372,0x7FF80BE8,0xCE340289,0xBA180372,0xA6000BEB,0x1F40BE8,0xFEC002A1,0xD0EC0289,0xF0980372,0xCEB8000A,0xBACC0372,0x7FF80BE8,0xCE340289,0xBA180372,0xA6000BEB,0x7FF80BE8,0xCE340289,0xBA180372,0xA6000BEB,0xA6000BEB,0xFF3407C9,0xFD480A38,0xFF4C0A20,0xFF08040E,0xFED000DE,0xEAB40006,0xD0CC0001,0xCC940022,0xFF180806,0xFEF00396,0xD88C0288,0xBA180372, -0x5DF80BE8,0x1000BE8,0x1000BE8,0x1000BE8,0x1000BE8,0xFEDC03B4,0xFEDC03B4,0xFEDC03B4,0xC0DC0374,0xC0DC0374,0xA4D80375,0xFCB802AA,0xFCB802AA,0xFCB802AA,0xCCB40009,0xCCB40009,0xA8BC00A6,0xB0B00288,0xB0B00288,0xA0AC0041,0x92B0028A,0x17C0BE8,0x17C0BE8,0x17C0BE8,0xDA700372,0xDA700372,0xA4A40373,0xCA440289,0xCA440289,0xA6640002,0x9474028A,0x43F80BE8, -0x43F80BE8,0xA400037B,0x900002EB,0x7E000BEB,0xFEE806AD,0xF6FC09B4,0x1000BE8,0xFED402DD,0xFEC0005D,0xECB00005,0xDAB80021,0xC0B00002,0xFED0064B,0xFEBC0272,0xCEA0028A,0xA6640002,0x11FC0BE8,0x1280288,0x1280288,0x1280288,0x1280288,0xEF040000,0xEF040000,0xEF040000,0xB7040000,0xB7040000,0xA5040001,0x1B80288,0x1B80288,0x1B80288,0xC4C40001,0xC4C40001, -0xA6E00001,0x61F80288,0x61F80288,0xA4740001,0x9200028A,0x1B80288,0x1B80288,0x1B80288,0xC4C40001,0xC4C40001,0xA6E00001,0x61F80288,0x61F80288,0xA4740001,0x9200028A,0x61F80288,0x61F80288,0xA4740001,0x9200028A,0x9200028A,0xFF100152,0xF92001A5,0x1280288,0xFCF400B5,0xFEC8000A,0xE4BC0000,0xD0CC0000,0xC2AC0000,0xFEF80171,0xFEDC0091,0x37FC0288,0xA4740001, -0x37FC0288,0x1740374,0xFF5C0071,0xE54C0000,0xCF4C0001,0x31FC0372,0xF5000001,0xCF240001,0x9BF80372,0xCEA40001,0xBA000372,0x31FC0372,0xF5000001,0xCF240001,0x9BF80372,0xCEA40001,0xBA000372,0x9BF80372,0xCEA40001,0xBA000372,0xBA000372,0x31FC0372,0xF5000001,0xCF240001,0x9BF80372,0xCEA40001,0xBA000372,0x9BF80372,0xCEA40001,0xBA000372,0xBA000372,0x9BF80372, 
-0xCEA40001,0xBA000372,0xBA000372,0xBA000372,0xFF5002A8,0xF8C0372,0xFF6C02B9,0xFF30016D,0xFEE80075,0xEEA80000,0xCED40001,0xCE680001,0xF9500288,0xFF140145,0xD6DC0000,0xBA000372,0x7FFC0372,0xD80374,0xD80374,0xD80374,0xD80374,0xD80374,0xD80374,0xD80374,0xD80374,0xD80374,0xD80374,0xD6B00000,0xD6B00000,0xD6B00000,0xD6B00000,0xD6B00000, -0xD6B00000,0x96B00000,0x96B00000,0x96B00000,0x80B00001,0x1440372,0x1440372,0x1440372,0x1440372,0x1440372,0x1440372,0xA6640001,0xA6640001,0xA6640001,0x80880001,0x27F80372,0x27F80372,0x27F80372,0x80080001,0x6C000372,0xFECC0208,0xD80374,0xD80374,0xFAC400C8,0xFEB40028,0xF4B00000,0xF4B00000,0xBEB00000,0xFCB801A5,0xFAAC00A4,0xACA40000,0xA6640001, -0x1D00372,}; -static const uint32_t g_etc1_to_bc7_m6_table147[] = { -0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x13FC0000, -0x13FC0000,0x13FC0000,0x13FC0000,0x5E000001,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xC00000,0xCC0000,0xCC0000,0xCC0000,0x11C0000,0x1980000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000, -0x3980000,0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x88000001,0xB240000,0x1140000,0x1140000,0x1400000,0x3580000,0x1780000,0x1780000,0x1D00000,0x1400000,0x3580000,0x23FC0000,0x51FC0000, -0x23FC0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x87FC0000,0x87FC0000,0xAC000001,0xAC000001,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x87FC0000,0x87FC0000,0xAC000001,0xAC000001,0x87FC0000, -0x87FC0000,0xAC000001,0xAC000001,0xAC000001,0x1940000,0x1740000,0x15C0000,0x1D80000,0x39FC0000,0x67FC0000,0x77FC0000,0x91FC0000,0x1B40000,0x9FC0000,0x67FC0000,0xAC000001,0x67FC0000,0x1B00000,0x89FC0000,0xC5FC0000,0xD6000001,0x89FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0x89FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xD6000001, -0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0x89FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0xD6000001,0x41FC0000,0x1CC0000,0x1CC0000,0xA1FC0000,0xBFF80000,0xCFF40000,0xD6000001,0xD6000001,0x6FFC0000,0xADFC0000,0xD5FC0000,0xD6000001, -0xB7FC0000,0x1381EA8,0xFF241198,0xFF100BE8,0xD7100BE9,0xFF0C0C98,0xFEF403FD,0xDCF80531,0xE8E806E6,0xD4E803AE,0xC2E806E6,0xFEF40FB8,0xFECC0345,0xDED80411,0xF0BC038E,0xD6C0000E,0xC4C803AE,0xD6C00BE8,0xCAB00413,0xBCB80533,0xAEC00BEB,0x1D01EA8,0xFEA00D2F,0xD6C40BE9,0xFE700736,0xD88C038E,0xC29C06E6,0xFA2C0BE9,0xD648028A,0xC05403A3,0xAE740BEB,0x6DF81EA8, -0xD0000C49,0xBC0007F4,0xAC000D93,0x9A001EAC,0xFF2013CE,0xFF2C1AAE,0xFF2C1BCD,0xFF0009E0,0xFED8026D,0xF4C00006,0xDAC80046,0xD2B80046,0xFF101346,0xFEE008B4,0xDCAC029D,0xC05403A3,0x45FC1EA8,0x1600BE8,0xFF4404E1,0xF5380288,0xD7380289,0xFF3004D8,0xFB140005,0xD91C0042,0xDF140372,0xD10C00A5,0xC3140372,0xFFC0BE8,0xFED802D9,0xD8FC0289,0xF8A80372,0xD6C8000A, 
-0xC2DC0372,0x8BF80BE8,0xD6440289,0xC2280372,0xAE000BEB,0xFFC0BE8,0xFED802D9,0xD8FC0289,0xF8A80372,0xD6C8000A,0xC2DC0372,0x8BF80BE8,0xD6440289,0xC2280372,0xAE000BEB,0x8BF80BE8,0xD6440289,0xC2280372,0xAE000BEB,0xAE000BEB,0xFB480849,0xF5580A7E,0xF75C0A69,0xFF180496,0xFEE80149,0xF2C40006,0xD8DC0001,0xD4A40022,0xFF34084E,0xFF040403,0xE09C0288,0xC2280372, -0x6BFC0BE8,0x1100BE8,0x1100BE8,0x1100BE8,0x1100BE8,0xFEF403E4,0xFEF403E4,0xFEF403E4,0xC8EC0374,0xC8EC0374,0xACE80375,0xFEC802CA,0xFEC802CA,0xFEC802CA,0xD4C40009,0xD4C40009,0xB0CC00A6,0xB8C00288,0xB8C00288,0xA8BC0041,0x9AC0028A,0x1940BE8,0x1940BE8,0x1940BE8,0xE2800372,0xE2800372,0xACB40373,0xD2540289,0xD2540289,0xAE740002,0x9C84028A,0x4FF80BE8, -0x4FF80BE8,0xAC040373,0x9A0002CA,0x86000BEB,0xFD000714,0xFF0C09B4,0x1100BE8,0xFEE8034D,0xFED40099,0xF4C00005,0xE2C80021,0xC8C00002,0xFEE406AA,0xFED002BA,0xD6B0028A,0xAE740002,0x1FFC0BE8,0x1380288,0x1380288,0x1380288,0x1380288,0xF7140000,0xF7140000,0xF7140000,0xBF140000,0xBF140000,0xAD140001,0x1D00288,0x1D00288,0x1D00288,0xCCD40001,0xCCD40001, -0xAEF00001,0x6DF80288,0x6DF80288,0xAC840001,0x9A00028A,0x1D00288,0x1D00288,0x1D00288,0xCCD40001,0xCCD40001,0xAEF00001,0x6DF80288,0x6DF80288,0xAC840001,0x9A00028A,0x6DF80288,0x6DF80288,0xAC840001,0x9A00028A,0x9A00028A,0xFF20016D,0xFF2C01B1,0x1380288,0xFD0400CA,0xFEE00019,0xECCC0000,0xD8DC0000,0xCABC0000,0xFD100188,0xFEE800B4,0x45FC0288,0xAC840001, -0x45FC0288,0x1840374,0xFF68009D,0xED5C0000,0xD75C0001,0x49FC0372,0xFD100001,0xD7340001,0xA7F80372,0xD6B40001,0xC2000372,0x49FC0372,0xFD100001,0xD7340001,0xA7F80372,0xD6B40001,0xC2000372,0xA7F80372,0xD6B40001,0xC2000372,0xC2000372,0x49FC0372,0xFD100001,0xD7340001,0xA7F80372,0xD6B40001,0xC2000372,0xA7F80372,0xD6B40001,0xC2000372,0xC2000372,0xA7F80372, -0xD6B40001,0xC2000372,0xC2000372,0xC2000372,0xFB7002AD,0x1A00372,0xF98002D4,0xFF40019A,0xFF0C00B5,0xF6B80000,0xD6E40001,0xD6780001,0xFF5C0290,0xFF2C0185,0xDEEC0000,0xC2000372,0x8FFC0372,0xE80374,0xE80374,0xE80374,0xE80374,0xE80374,0xE80374,0xE80374,0xE80374,0xE80374,0xE80374,0xDEC00000,0xDEC00000,0xDEC00000,0xDEC00000,0xDEC00000, -0xDEC00000,0x9EC00000,0x9EC00000,0x9EC00000,0x88C00001,0x15C0372,0x15C0372,0x15C0372,0x15C0372,0x15C0372,0x15C0372,0xAE740001,0xAE740001,0xAE740001,0x88980001,0x33F80372,0x33F80372,0x33F80372,0x88180001,0x74000372,0xF8E00221,0xE80374,0xE80374,0xFCD400DD,0xFCCC003D,0xFCC00000,0xFCC00000,0xC6C00000,0xF8CC01C2,0xFEBC00B9,0xB4B40000,0xAE740001, -0x1F00372,}; -static const uint32_t g_etc1_to_bc7_m6_table148[] = { -0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x21F80000, -0x21F80000,0x21F80000,0x21F80000,0x68000000,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xD00001,0xE00000,0xE00000,0xE00000,0x1380000,0x1BC0000,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1240001,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000, -0x1B40000,0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x1B40000,0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x5FF80000,0x5FF80000,0x5FF80000,0x92000000,0x92000000,0x5380000,0x1240001,0x1240001,0x3540000,0x1700000,0x1900000,0x1900000,0x3EC0000,0x3540000,0x1700000,0x35FC0000,0x5FF80000, 
-0x35FC0000,0x16C0001,0x16C0001,0x16C0001,0x16C0001,0x25FC0000,0x25FC0000,0x25FC0000,0x95F80000,0x95F80000,0xB6000000,0x25FC0000,0x25FC0000,0x25FC0000,0x95F80000,0x95F80000,0xB6000000,0x95F80000,0x95F80000,0xB6000000,0xB6000000,0x25FC0000,0x25FC0000,0x25FC0000,0x95F80000,0x95F80000,0xB6000000,0x95F80000,0x95F80000,0xB6000000,0xB6000000,0x95F80000, -0x95F80000,0xB6000000,0xB6000000,0xB6000000,0x3A80000,0xB840000,0x16C0001,0x3F00000,0x4FFC0000,0x79FC0000,0x87F80000,0x9DFC0000,0x1CC0000,0x25FC0000,0x79FC0000,0xB6000000,0x79FC0000,0x1C00001,0xA5FC0000,0xD3FC0000,0xE0000000,0xA5FC0000,0xD3FC0000,0xE0000000,0xD3FC0000,0xE0000000,0xE0000000,0xA5FC0000,0xD3FC0000,0xE0000000,0xD3FC0000,0xE0000000, -0xE0000000,0xD3FC0000,0xE0000000,0xE0000000,0xE0000000,0xA5FC0000,0xD3FC0000,0xE0000000,0xD3FC0000,0xE0000000,0xE0000000,0xD3FC0000,0xE0000000,0xE0000000,0xE0000000,0xD3FC0000,0xE0000000,0xE0000000,0xE0000000,0xE0000000,0x6DFC0000,0x1E00000,0x1E00000,0xB7FC0000,0xCDFC0000,0xD9FC0000,0xE0000000,0xE0000000,0x8FFC0000,0xC1FC0000,0xDFF00000,0xE0000000, -0xC7FC0000,0x1481EAC,0xFF30126C,0xFF200C0F,0xE1200BEB,0xFF180DB4,0xFF080497,0xE5080533,0xF0FC06E6,0xDCF803AE,0xCAFC06E6,0xFF00109C,0xFEE4040B,0xE8EC0413,0xFAD0038E,0xE0D0000E,0xCCD803AE,0xDED40BE9,0xD4C40411,0xC4C80531,0xB8D00BE9,0x3E81EA8,0xFEC00DE7,0xE0D40BE8,0xFE8807BE,0xE2A0038E,0xCAB006E6,0xFE4C0BFB,0xE05C028A,0xCA6803A1,0xB8840BE8,0x79FC1EA8, -0xDC000C13,0xC400078E,0xB2000D35,0xA4001EA8,0xFF3414AF,0xFB441B24,0xFD481BF4,0xFF140B27,0xFEF00392,0xFED40006,0xE4DC0046,0xDCCC0046,0xFF181429,0xFF0009F4,0xE6BC029D,0xCA6803A1,0x57FC1EA8,0x1700BEB,0xFF5C056A,0xFF48028A,0xE148028A,0xFF440545,0xFF28001A,0xE3300041,0xE7280373,0xDB2000A6,0xCB240375,0x2BFC0BE8,0xFEFC032A,0xE1100288,0xFEC00375,0xDED80009, -0xCAF00374,0x97FC0BE8,0xE0540288,0xCA400374,0xB8000BE8,0x2BFC0BE8,0xFEFC032A,0xE1100288,0xFEC00375,0xDED80009,0xCAF00374,0x97FC0BE8,0xE0540288,0xCA400374,0xB8000BE8,0x97FC0BE8,0xE0540288,0xCA400374,0xB8000BE8,0xB8000BE8,0xFF580895,0xFF6C0A7D,0xFF6C0A6E,0xFF30053A,0xFF1001EE,0xFCD80005,0xE0EC0002,0xDCBC0021,0xFF4408B8,0xFF2404CA,0xE8B00289,0xCA400374, -0x7DF80BE8,0x1200BEB,0x1200BEB,0x1200BEB,0x1200BEB,0xFF040422,0xFF040422,0xFF040422,0xD2FC0372,0xD2FC0372,0xB6FC0372,0xFEE00301,0xFEE00301,0xFEE00301,0xDCD4000A,0xDCD4000A,0xBAE000A5,0xC2D00289,0xC2D00289,0xB2D00042,0xA4D00289,0x1B00BE8,0x1B00BE8,0x1B00BE8,0xEC900372,0xEC900372,0xB6C40372,0xDC640289,0xDC640289,0xB6840005,0xA4980288,0x5DF40BE8, -0x5DF40BE8,0xB6100372,0xA20002A9,0x90000BE8,0xFF100755,0xF71C09F6,0x1200BEB,0xFEF803C2,0xFEE400F1,0xFAD40005,0xEAD80022,0xD2D00001,0xFEF806F2,0xFEE80352,0xE0C00289,0xB6840005,0x31FC0BE8,0x148028A,0x148028A,0x148028A,0x148028A,0xFF280001,0xFF280001,0xFF280001,0xC9240001,0xC9240001,0xB7240001,0x3E80288,0x3E80288,0x3E80288,0xD6E40001,0xD6E40001, -0xB7040000,0x79FC0288,0x79FC0288,0xB6900000,0xA4000288,0x3E80288,0x3E80288,0x3E80288,0xD6E40001,0xD6E40001,0xB7040000,0x79FC0288,0x79FC0288,0xB6900000,0xA4000288,0x79FC0288,0x79FC0288,0xB6900000,0xA4000288,0xA4000288,0xFB34018A,0xFB4401C2,0x148028A,0xFF1800E1,0xFCFC0032,0xF4E00000,0xE2EC0000,0xD2D00000,0xF92801A5,0xFF0400D0,0x57FC0288,0xB6900000, -0x57FC0288,0x1980372,0xFF8000E1,0xF5700001,0xE16C0001,0x65FC0372,0xFF2C0011,0xE1440000,0xB3FC0372,0xE0C00000,0xCA000374,0x65FC0372,0xFF2C0011,0xE1440000,0xB3FC0372,0xE0C00000,0xCA000374,0xB3FC0372,0xE0C00000,0xCA000374,0xCA000374,0x65FC0372,0xFF2C0011,0xE1440000,0xB3FC0372,0xE0C00000,0xCA000374,0xB3FC0372,0xE0C00000,0xCA000374,0xCA000374,0xB3FC0372, 
-0xE0C00000,0xCA000374,0xCA000374,0xCA000374,0xFB8402D2,0x1B40372,0xFF8C02F2,0xFD6801E1,0xFF2800E9,0xFECC0001,0xE0F40000,0xE0840000,0xFF7002C5,0xFF4C01B1,0xE7000000,0xCA000374,0x9FFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xFC0372,0xE6D40001,0xE6D40001,0xE6D40001,0xE6D40001,0xE6D40001, -0xE6D40001,0xA6D40001,0xA6D40001,0xA6D40001,0x92D00001,0x3740372,0x3740372,0x3740372,0x3740372,0x3740372,0x3740372,0xB8840001,0xB8840001,0xB8840001,0x92A80000,0x3FFC0372,0x3FFC0372,0x3FFC0372,0x92240000,0x7C000374,0xFEEC0239,0xFC0372,0xFC0372,0xFEE800F2,0xFCE00055,0xFED40002,0xFED40002,0xCED40001,0xFED801D4,0xFED000D0,0xBAC80001,0xB8840001, -0xDFC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table149[] = { -0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x1500000,0x2DF80000, -0x2DF80000,0x2DF80000,0x2DF80000,0x70000000,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xE00001,0xF00000,0xF00000,0xF00000,0x1500000,0x1E00000,0x1340001,0x1340001,0x1340001,0x1340001,0x1340001,0x1340001,0x1340001,0x1340001,0x1340001,0x1340001,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000, -0x1CC0000,0x6BF80000,0x6BF80000,0x6BF80000,0x9A000000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x6BF80000,0x6BF80000,0x6BF80000,0x9A000000,0x6BF80000,0x6BF80000,0x6BF80000,0x9A000000,0x9A000000,0xD480000,0x1340001,0x1340001,0x1680000,0x1840000,0x3A40000,0x3A40000,0xBFC0000,0x1680000,0x1840000,0x43FC0000,0x6BF80000, -0x43FC0000,0x17C0001,0x17C0001,0x17C0001,0x17C0001,0x3DFC0000,0x3DFC0000,0x3DFC0000,0xA1F80000,0xA1F80000,0xBE000000,0x3DFC0000,0x3DFC0000,0x3DFC0000,0xA1F80000,0xA1F80000,0xBE000000,0xA1F80000,0xA1F80000,0xBE000000,0xBE000000,0x3DFC0000,0x3DFC0000,0x3DFC0000,0xA1F80000,0xA1F80000,0xBE000000,0xA1F80000,0xA1F80000,0xBE000000,0xBE000000,0xA1F80000, -0xA1F80000,0xBE000000,0xBE000000,0xBE000000,0x1BC0000,0x1980000,0x17C0001,0xFFC0000,0x63FC0000,0x87FC0000,0x95F80000,0xA9F40000,0x1E00000,0x3DFC0000,0x87FC0000,0xBE000000,0x87FC0000,0x1D00001,0xBDFC0000,0xDFF80000,0xE8000000,0xBDFC0000,0xDFF80000,0xE8000000,0xDFF80000,0xE8000000,0xE8000000,0xBDFC0000,0xDFF80000,0xE8000000,0xDFF80000,0xE8000000, -0xE8000000,0xDFF80000,0xE8000000,0xE8000000,0xE8000000,0xBDFC0000,0xDFF80000,0xE8000000,0xDFF80000,0xE8000000,0xE8000000,0xDFF80000,0xE8000000,0xE8000000,0xE8000000,0xDFF80000,0xE8000000,0xE8000000,0xE8000000,0xE8000000,0x95FC0000,0x1F00000,0x1F00000,0xCBFC0000,0xDBFC0000,0xE5F00000,0xE8000000,0xE8000000,0xAFFC0000,0xD1FC0000,0xE9C40000,0xE8000000, -0xD7FC0000,0x1581EAC,0xFF44131B,0xFF340C64,0xE9300BEB,0xFF300EB4,0xFF180563,0xED180533,0xF90C06E6,0xE50803AE,0xD30C06E6,0xFF181164,0xFEFC04F3,0xF0FC0413,0xFEE4039E,0xE8E0000E,0xD4E803AE,0xE6E40BE9,0xDCD40411,0xCCD80531,0xC0E00BE9,0x7FC1EA8,0xFED80EA7,0xE8E40BE8,0xFEAC0866,0xEAB0038E,0xD2C006E6,0xFE640C43,0xE86C028A,0xD27803A1,0xC0940BE8,0x85FC1EA8, -0xE6000BF8,0xD000073E,0xBE000CD5,0xAC001EA8,0xFF3C1574,0xFF4C1B64,0xF5581C63,0xFF240C9E,0xFF0404CE,0xFEE80032,0xECEC0046,0xE4DC0046,0xFF3414DA,0xFF100B2E,0xEECC029D,0xD27803A1,0x65FC1EA8,0x1800BEB,0xFF6805FA,0xFF5802A3,0xE958028A,0xFF5C05CD,0xFF40006A,0xEB400041,0xEF380373,0xE33000A6,0xD3340375,0x43FC0BE8,0xFF14039A,0xE9200288,0xFEE4039D,0xE6E80009, 
-0xD3000374,0xA3FC0BE8,0xE8640288,0xD2500374,0xC0000BE8,0x43FC0BE8,0xFF14039A,0xE9200288,0xFEE4039D,0xE6E80009,0xD3000374,0xA3FC0BE8,0xE8640288,0xD2500374,0xC0000BE8,0xA3FC0BE8,0xE8640288,0xD2500374,0xC0000BE8,0xC0000BE8,0xFF640905,0xF77C0AC3,0xF9800AB3,0xFF4405E1,0xFF24028E,0xFEF0001B,0xE8FC0002,0xE4CC0021,0xFF5C08DE,0xFF400550,0xF0C00289,0xD2500374, -0x8BFC0BE8,0x1300BEB,0x1300BEB,0x1300BEB,0x1300BEB,0xFF180463,0xFF180463,0xFF180463,0xDB0C0372,0xDB0C0372,0xBF0C0372,0xFEF4033B,0xFEF4033B,0xFEF4033B,0xE4E4000A,0xE4E4000A,0xC2F000A5,0xCAE00289,0xCAE00289,0xBAE00042,0xACE00289,0x3C40BE8,0x3C40BE8,0x3C40BE8,0xF4A00372,0xF4A00372,0xBED40372,0xE4740289,0xE4740289,0xBE940005,0xACA80288,0x67FC0BE8, -0x67FC0BE8,0xBE200372,0xAC000291,0x98000BE8,0xFF200792,0xFF2C09F6,0x1300BEB,0xFF100426,0xFEF8014D,0xFEE8000E,0xF2E80022,0xDAE00001,0xFF100749,0xFEF403B6,0xE8D00289,0xBE940005,0x3FFC0BE8,0x158028A,0x158028A,0x158028A,0x158028A,0xFD38000A,0xFD38000A,0xFD38000A,0xD1340001,0xD1340001,0xBF340001,0x7FC0288,0x7FC0288,0x7FC0288,0xDEF40001,0xDEF40001, -0xBF140000,0x85FC0288,0x85FC0288,0xBEA00000,0xAC000288,0x7FC0288,0x7FC0288,0x7FC0288,0xDEF40001,0xDEF40001,0xBF140000,0x85FC0288,0x85FC0288,0xBEA00000,0xAC000288,0x85FC0288,0x85FC0288,0xBEA00000,0xAC000288,0xAC000288,0xF74801A5,0xF35401E1,0x158028A,0xFD300109,0xFD140048,0xFCF00000,0xEAFC0000,0xDAE00000,0xFF3401A9,0xFF1800E9,0x65FC0288,0xBEA00000, -0x65FC0288,0x1A80372,0xFF8C0131,0xFD800001,0xE97C0001,0x7DFC0372,0xFF4C0041,0xE9540000,0xBFFC0372,0xE8D00000,0xD2000374,0x7DFC0372,0xFF4C0041,0xE9540000,0xBFFC0372,0xE8D00000,0xD2000374,0xBFFC0372,0xE8D00000,0xD2000374,0xD2000374,0x7DFC0372,0xFF4C0041,0xE9540000,0xBFFC0372,0xE8D00000,0xD2000374,0xBFFC0372,0xE8D00000,0xD2000374,0xD2000374,0xBFFC0372, -0xE8D00000,0xD2000374,0xD2000374,0xD2000374,0xFF9402D4,0x1C40372,0xFBA402F9,0xFF740212,0xFF500139,0xFEF0001A,0xE9040000,0xE8940000,0xFB9002D2,0xFF6401F9,0xEF100000,0xD2000374,0xAFFC0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0x10C0372,0xEEE40001,0xEEE40001,0xEEE40001,0xEEE40001,0xEEE40001, -0xEEE40001,0xAEE40001,0xAEE40001,0xAEE40001,0x9AE00001,0x38C0372,0x38C0372,0x38C0372,0x38C0372,0x38C0372,0x38C0372,0xC0940001,0xC0940001,0xC0940001,0x9AB80000,0x4BFC0372,0x4BFC0372,0x4BFC0372,0x9A340000,0x84000374,0xFB040242,0x10C0372,0x10C0372,0xFEF40115,0xFEEC006A,0xFEE8000D,0xFEE8000D,0xD6E40001,0xFCF001E1,0xFEE400F4,0xC2D80001,0xC0940001, -0x1DF80372,}; -static const uint32_t g_etc1_to_bc7_m6_table150[] = { -0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x39F80000, -0x39F80000,0x39F80000,0x39F80000,0x78000000,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0xF00001,0x9000000,0x9000000,0x9000000,0x1680000,0x3FC0000,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1440001,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000, -0x1E40000,0x77F80000,0x77F80000,0x77F80000,0xA2000000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x1E40000,0x77F80000,0x77F80000,0x77F80000,0xA2000000,0x77F80000,0x77F80000,0x77F80000,0xA2000000,0xA2000000,0x15C0000,0x1440001,0x1440001,0x17C0000,0x1980000,0x1BC0000,0x1BC0000,0x1DFC0000,0x17C0000,0x1980000,0x53FC0000,0x77F80000, 
-0x53FC0000,0x18C0001,0x18C0001,0x18C0001,0x18C0001,0x55FC0000,0x55FC0000,0x55FC0000,0xADF80000,0xADF80000,0xC6000000,0x55FC0000,0x55FC0000,0x55FC0000,0xADF80000,0xADF80000,0xC6000000,0xADF80000,0xADF80000,0xC6000000,0xC6000000,0x55FC0000,0x55FC0000,0x55FC0000,0xADF80000,0xADF80000,0xC6000000,0xADF80000,0xADF80000,0xC6000000,0xC6000000,0xADF80000, -0xADF80000,0xC6000000,0xC6000000,0xC6000000,0x7CC0000,0x1A80000,0x18C0001,0x2FFC0000,0x77FC0000,0x97FC0000,0xA1FC0000,0xB3FC0000,0x1F40000,0x55FC0000,0x97FC0000,0xC6000000,0x97FC0000,0x1E00001,0xD5FC0000,0xEBF80000,0xF0000000,0xD5FC0000,0xEBF80000,0xF0000000,0xEBF80000,0xF0000000,0xF0000000,0xD5FC0000,0xEBF80000,0xF0000000,0xEBF80000,0xF0000000, -0xF0000000,0xEBF80000,0xF0000000,0xF0000000,0xF0000000,0xD5FC0000,0xEBF80000,0xF0000000,0xEBF80000,0xF0000000,0xF0000000,0xEBF80000,0xF0000000,0xF0000000,0xF0000000,0xEBF80000,0xF0000000,0xF0000000,0xF0000000,0xF0000000,0xBDFC0000,0x27FC0000,0x27FC0000,0xDDFC0000,0xE9F80000,0xEFF00000,0xF0000000,0xF0000000,0xCDFC0000,0xE3FC0000,0xF1D40000,0xF0000000, -0xE5FC0000,0x1681EAC,0xFF50140B,0xFF480D03,0xF1400BEB,0xFF440FB6,0xFF2C0663,0xF5280533,0xFF1C06EC,0xED1803AE,0xDB1C06E6,0xFF30124C,0xFF14061B,0xF90C0413,0xFEFC0406,0xF0F0000E,0xDCF803AE,0xEEF40BE9,0xE4E40411,0xD4E80531,0xC8F00BE9,0x1FFC1EA8,0xFEF00F87,0xF0F40BE8,0xFEC0096E,0xF2C0038E,0xDAD006E6,0xFE880CBB,0xF07C028A,0xDA8803A1,0xC8A40BE8,0x91FC1EA8, -0xF0000BEB,0xD8000716,0xC4000C91,0xB4001EA8,0xFF50163F,0xFB641BB6,0xFD681C63,0xFF340DE2,0xFF1C0636,0xFEFC00C2,0xF4FC0046,0xECEC0046,0xFF4415D9,0xFF240CCA,0xF6DC029D,0xDA8803A1,0x75FC1EA8,0x1900BEB,0xFF80068A,0xFF7002EB,0xF168028A,0xFF680655,0xFF5400F6,0xF3500041,0xF7480373,0xEB4000A6,0xDB440375,0x5BFC0BE8,0xFF2C042A,0xF1300288,0xFEFC03ED,0xEEF80009, -0xDB100374,0xAFFC0BE8,0xF0740288,0xDA600374,0xC8000BE8,0x5BFC0BE8,0xFF2C042A,0xF1300288,0xFEFC03ED,0xEEF80009,0xDB100374,0xAFFC0BE8,0xF0740288,0xDA600374,0xC8000BE8,0xAFFC0BE8,0xF0740288,0xDA600374,0xC8000BE8,0xC8000BE8,0xFF78093E,0xFF8C0AC3,0xFF8C0ABB,0xFF5C0671,0xFF380356,0xFF10007D,0xF10C0002,0xECDC0021,0xFF780956,0xFF540602,0xF8D00289,0xDA600374, -0x9BFC0BE8,0x1400BEB,0x1400BEB,0x1400BEB,0x1400BEB,0xFF2404B3,0xFF2404B3,0xFF2404B3,0xE31C0372,0xE31C0372,0xC71C0372,0xFF0C0393,0xFF0C0393,0xFF0C0393,0xECF4000A,0xECF4000A,0xCB0000A5,0xD2F00289,0xD2F00289,0xC2F00042,0xB4F00289,0x3DC0BE8,0x3DC0BE8,0x3DC0BE8,0xFCB00372,0xFCB00372,0xC6E40372,0xEC840289,0xEC840289,0xC6A40005,0xB4B80288,0x73FC0BE8, -0x73FC0BE8,0xC6300372,0xB4000289,0xA0000BE8,0xFF2C07F1,0xF73C0A3B,0x1400BEB,0xFF1804AA,0xFF0C01B9,0xFEF80032,0xFAF80022,0xE2F00001,0xFF20078D,0xFF10042D,0xF0E00289,0xC6A40005,0x4FFC0BE8,0x168028A,0x168028A,0x168028A,0x168028A,0xFD4C0019,0xFD4C0019,0xFD4C0019,0xD9440001,0xD9440001,0xC7440001,0x1FFC0288,0x1FFC0288,0x1FFC0288,0xE7040001,0xE7040001, -0xC7240000,0x91FC0288,0x91FC0288,0xC6B00000,0xB4000288,0x1FFC0288,0x1FFC0288,0x1FFC0288,0xE7040001,0xE7040001,0xC7240000,0x91FC0288,0x91FC0288,0xC6B00000,0xB4000288,0x91FC0288,0x91FC0288,0xC6B00000,0xB4000288,0xB4000288,0xFF5801A5,0xFB6401E1,0x168028A,0xFF440120,0xFD280064,0xFF080008,0xF30C0000,0xE2F00000,0xFD4C01C2,0xFF30010D,0x75FC0288,0xC6B00000, -0x75FC0288,0x1B80372,0xFFA40179,0xFF900011,0xF18C0001,0x95FC0372,0xFF640089,0xF1640000,0xCBFC0372,0xF0E00000,0xDA000374,0x95FC0372,0xFF640089,0xF1640000,0xCBFC0372,0xF0E00000,0xDA000374,0xCBFC0372,0xF0E00000,0xDA000374,0xDA000374,0x95FC0372,0xFF640089,0xF1640000,0xCBFC0372,0xF0E00000,0xDA000374,0xCBFC0372,0xF0E00000,0xDA000374,0xDA000374,0xCBFC0372, 
-0xF0E00000,0xDA000374,0xDA000374,0xDA000374,0xFBAC02F9,0x3D40372,0xFFAC0321,0xFF940242,0xFF680179,0xFF180055,0xF1140000,0xF0A40000,0xFF9802F2,0xFF880221,0xF7200000,0xDA000374,0xBFF80372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0x11C0372,0xF6F40001,0xF6F40001,0xF6F40001,0xF6F40001,0xF6F40001, -0xF6F40001,0xB6F40001,0xB6F40001,0xB6F40001,0xA2F00001,0x3A40372,0x3A40372,0x3A40372,0x3A40372,0x3A40372,0x3A40372,0xC8A40001,0xC8A40001,0xC8A40001,0xA2C80000,0x57FC0372,0x57FC0372,0x57FC0372,0xA2440000,0x8C000374,0xFF0C0262,0x11C0372,0x11C0372,0xFF040132,0xFF000082,0xFEF80019,0xFEF80019,0xDEF40001,0xF9040200,0xFEF80109,0xCAE80001,0xC8A40001, -0x2BFC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table151[] = { -0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x45F80000, -0x45F80000,0x45F80000,0x45F80000,0x80000000,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1000001,0x1140000,0x1140000,0x1140000,0x1800000,0x13FC0000,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000, -0x1FC0000,0x83F80000,0x83F80000,0x83F80000,0xAA000000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x83F80000,0x83F80000,0x83F80000,0xAA000000,0x83F80000,0x83F80000,0x83F80000,0xAA000000,0xAA000000,0x16C0000,0x1540001,0x1540001,0x38C0000,0x1AC0000,0x1D00000,0x1D00000,0x31FC0000,0x38C0000,0x1AC0000,0x61FC0000,0x83F80000, -0x61FC0000,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xCE000000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xCE000000,0xB9F80000,0xB9F80000,0xCE000000,0xCE000000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xCE000000,0xB9F80000,0xB9F80000,0xCE000000,0xCE000000,0xB9F80000, -0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0x3E00000,0x5B80000,0x19C0001,0x4DFC0000,0x8BFC0000,0xA5FC0000,0xAFFC0000,0xBFF40000,0x15FC0000,0x6FFC0000,0xA5FC0000,0xCE000000,0xA5FC0000,0x1F00001,0xEFFC0000,0xF7F80000,0xF8000000,0xEFFC0000,0xF7F80000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xEFFC0000,0xF7F80000,0xF8000000,0xF7F80000,0xF8000000, -0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xEFFC0000,0xF7F80000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xF8000000,0xE3FC0000,0xA7FC0000,0xA7FC0000,0xF1FC0000,0xF5FC0000,0xF9F00000,0xF8000000,0xF8000000,0xEBFC0000,0xF3FC0000,0xF9E40000,0xF8000000, -0xF5FC0000,0x1781EAC,0xFF6814E3,0xFF580DBC,0xF9500BEB,0xFF5010EE,0xFF4007AB,0xFD380533,0xFF2C0746,0xF52803AE,0xE32C06E6,0xFF441329,0xFF280782,0xFF180417,0xFF0804CE,0xF900000E,0xE50803AE,0xF7040BE9,0xECF40411,0xDCF80531,0xD1000BE9,0x37FC1EA8,0xFF14107B,0xF9040BE8,0xFEE40A7E,0xFAD0038E,0xE2E006E6,0xFEAC0D6B,0xF88C028A,0xE29803A1,0xD0B40BE8,0x9DFC1EA8, -0xF8080BE8,0xE00006F8,0xCE000C58,0xBC001EA8,0xFF641706,0xFF6C1C06,0xF5781CD4,0xFF440F42,0xFF2807C3,0xFF1001BE,0xFD0C0046,0xF4FC0046,0xFF54168D,0xFF400E29,0xFEEC029D,0xE29803A1,0x83FC1EA8,0x1A00BEB,0xFF8C072A,0xFF7C036B,0xF978028A,0xFF8006ED,0xFF6401AA,0xFB600041,0xFF580373,0xF35000A6,0xE3540375,0x75FC0BE8,0xFF4C04E1,0xF9400288,0xFF200465,0xF7080009, 
-0xE3200374,0xBBFC0BE8,0xF8840288,0xE2700374,0xD0000BE8,0x75FC0BE8,0xFF4C04E1,0xF9400288,0xFF200465,0xF7080009,0xE3200374,0xBBFC0BE8,0xF8840288,0xE2700374,0xD0000BE8,0xBBFC0BE8,0xF8840288,0xE2700374,0xD0000BE8,0xD0000BE8,0xFF940998,0xF79C0B0D,0xF9A00AFE,0xFF780729,0xFF50043A,0xFF2C0113,0xF91C0002,0xF4EC0021,0xFF88099B,0xFF6806C8,0xFEE40291,0xE2700374, -0xA9FC0BE8,0x1500BEB,0x1500BEB,0x1500BEB,0x1500BEB,0xFF3C0513,0xFF3C0513,0xFF3C0513,0xEB2C0372,0xEB2C0372,0xCF2C0372,0xFF1803F3,0xFF1803F3,0xFF1803F3,0xF504000A,0xF504000A,0xD31000A5,0xDB000289,0xDB000289,0xCB000042,0xBD000289,0x3F40BE8,0x3F40BE8,0x3F40BE8,0xFEC8037D,0xFEC8037D,0xCEF40372,0xF4940289,0xF4940289,0xCEB40005,0xBCC80288,0x7FFC0BE8, -0x7FFC0BE8,0xCE400372,0xBC0C0288,0xA8000BE8,0xFF3C0843,0xFF4C0A3B,0x1500BEB,0xFF34051E,0xFF200235,0xFF10007A,0xFF080031,0xEB000001,0xFF3407CA,0xFF2404BA,0xF8F00289,0xCEB40005,0x5FF80BE8,0x178028A,0x178028A,0x178028A,0x178028A,0xFF5C002D,0xFF5C002D,0xFF5C002D,0xE1540001,0xE1540001,0xCF540001,0x37FC0288,0x37FC0288,0x37FC0288,0xEF140001,0xEF140001, -0xCF340000,0x9DFC0288,0x9DFC0288,0xCEC00000,0xBC000288,0x37FC0288,0x37FC0288,0x37FC0288,0xEF140001,0xEF140001,0xCF340000,0x9DFC0288,0x9DFC0288,0xCEC00000,0xBC000288,0x9DFC0288,0x9DFC0288,0xCEC00000,0xBC000288,0xBC000288,0xFB6C01C2,0xF3740202,0x178028A,0xFD580139,0xFF400080,0xFF200014,0xFB1C0000,0xEB000000,0xF56401E1,0xFF480122,0x83FC0288,0xCEC00000, -0x83FC0288,0x1C80372,0xFFB001E1,0xFFA00052,0xF99C0001,0xAFFC0372,0xFF8800E9,0xF9740000,0xD7FC0372,0xF8F00000,0xE2000374,0xAFFC0372,0xFF8800E9,0xF9740000,0xD7FC0372,0xF8F00000,0xE2000374,0xD7FC0372,0xF8F00000,0xE2000374,0xE2000374,0xAFFC0372,0xFF8800E9,0xF9740000,0xD7FC0372,0xF8F00000,0xE2000374,0xD7FC0372,0xF8F00000,0xE2000374,0xE2000374,0xD7FC0372, -0xF8F00000,0xE2000374,0xE2000374,0xE2000374,0xFFB40311,0xBE40372,0xFBC40322,0xFFAC0288,0xFF7C01E1,0xFF4800A9,0xF9240000,0xF8B40000,0xF1C00320,0xFFA00269,0xFF300000,0xE2000374,0xCDFC0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0x12C0372,0xFF040001,0xFF040001,0xFF040001,0xFF040001,0xFF040001, -0xFF040001,0xBF040001,0xBF040001,0xBF040001,0xAB000001,0x3BC0372,0x3BC0372,0x3BC0372,0x3BC0372,0x3BC0372,0x3BC0372,0xD0B40001,0xD0B40001,0xD0B40001,0xAAD80000,0x63FC0372,0x63FC0372,0x63FC0372,0xAA540000,0x94000374,0xFB240265,0x12C0372,0x12C0372,0xFD1C0152,0xFF1400A2,0xFF08002D,0xFF08002D,0xE7040001,0xFF100208,0xFB0C0139,0xD2F80001,0xD0B40001, -0x3BFC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table152[] = { -0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x3980000,0x51FC0000, -0x51FC0000,0x51FC0000,0x51FC0000,0x88000001,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0xB240000,0xB240000,0xB240000,0x3980000,0x23FC0000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1680000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000, -0x1BFC0000,0x91F80000,0x91F80000,0x91F80000,0xB2000001,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x91F80000,0x91F80000,0x91F80000,0xB2000001,0x91F80000,0x91F80000,0x91F80000,0xB2000001,0xB2000001,0x1800000,0x1680000,0x1680000,0x7A00000,0x1C40000,0x3E80000,0x3E80000,0x47FC0000,0x7A00000,0x1C40000,0x73FC0000,0x91F80000, 
-0x73FC0000,0x1B00000,0x1B00000,0x1B00000,0x1B00000,0x89FC0000,0x89FC0000,0x89FC0000,0xC5FC0000,0xC5FC0000,0xD6000001,0x89FC0000,0x89FC0000,0x89FC0000,0xC5FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xC5FC0000,0xD6000001,0xD6000001,0x89FC0000,0x89FC0000,0x89FC0000,0xC5FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xC5FC0000,0xD6000001,0xD6000001,0xC5FC0000, -0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0x5F40000,0x1CC0000,0x1B00000,0x6FFC0000,0xA1FC0000,0xB7FC0000,0xBFF80000,0xCBF80000,0x41FC0000,0x89FC0000,0xB7FC0000,0xD6000001,0xB7FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x18C1D49,0xFF7414EC,0xFF6C0E58,0xFF640BE8,0xFF681121,0xFF5408B8,0xFF50057D,0xFF400775,0xFB3C0394,0xEB3C066D,0xFF5C130E,0xFF40088F,0xFF30046C,0xFF20053D,0xFF180009,0xEB1C033A,0xFF140AFE,0xF30C03CA,0xE5080492,0xD9140AFE,0x4FFC1D47,0xFF2C10D8,0xFF180BE8,0xFEFC0B01,0xFEE403A5,0xEAF4066B,0xFED80D6E,0xFEA00289,0xE8B00322,0xD8C80AFE,0xA9FC1D47, -0xFE280BE8,0xEA04066B,0xD6000B2E,0xC4001D47,0xFF7816B6,0xFD881AF5,0xFD881B85,0xFF5C0FB9,0xFF4808E2,0xFF2C02B1,0xFF240072,0xFB10002D,0xFF70165C,0xFF500E9A,0xFF0802CE,0xE8B00322,0x93FC1D47,0x1B00B01,0xFFA4072C,0xFF9403C9,0xFF8C0288,0xFF9806DD,0xFF7C0236,0xFF740055,0xFF700321,0xF7640084,0xEB6802F9,0x8DFC0AFE,0xFF64052B,0xFF540288,0xFF380471,0xFD200005, -0xEB3002FA,0xC7FC0AFE,0xFEA00288,0xEA8402F9,0xD8000AFE,0x8DFC0AFE,0xFF64052B,0xFF540288,0xFF380471,0xFD200005,0xEB3002FA,0xC7FC0AFE,0xFEA00288,0xEA8402F9,0xD8000AFE,0xC7FC0AFE,0xFEA00288,0xEA8402F9,0xD8000AFE,0xD8000AFE,0xFFA0091B,0xFFAC0A29,0xFFAC0A2C,0xFF94070A,0xFF680489,0xFF480194,0xFF340005,0xFCFC0011,0xFF98091E,0xFF8406A8,0xFF1002BB,0xEA8402F9, -0xB9FC0AFE,0x1640BE8,0x1640BE8,0x1640BE8,0x1640BE8,0xFF50057D,0xFF50057D,0xFF50057D,0xF3400374,0xF3400374,0xD73C0375,0xFF30046C,0xFF30046C,0xFF30046C,0xFF180009,0xFF180009,0xDB2000A6,0xE3140288,0xE3140288,0xD3100041,0xC514028A,0x15FC0BE8,0x15FC0BE8,0x15FC0BE8,0xFEE403A5,0xFEE403A5,0xD7080373,0xFCA80289,0xFCA80289,0xD8C80002,0xC6D8028A,0x8DFC0BE8, -0x8DFC0BE8,0xD6580373,0xC420028A,0xB0000BEB,0xFF58088D,0xF9600A7D,0x1640BE8,0xFF4405A2,0xFF3402D2,0xFF2400E9,0xFF240072,0xF3140002,0xFD4C0845,0xFF300545,0xFF08028E,0xD8C80002,0x6FFC0BE8,0x18C0288,0x18C0288,0x18C0288,0x18C0288,0xFF740055,0xFF740055,0xFF740055,0xE9680000,0xE9680000,0xD7680001,0x53FC0288,0x53FC0288,0x53FC0288,0xF7280001,0xF7280001, -0xD9440001,0xABF80288,0xABF80288,0xD6D80001,0xC400028A,0x53FC0288,0x53FC0288,0x53FC0288,0xF7280001,0xF7280001,0xD9440001,0xABF80288,0xABF80288,0xD6D80001,0xC400028A,0xABF80288,0xABF80288,0xD6D80001,0xC400028A,0xC400028A,0xF78001E1,0xFD880200,0x18C0288,0xFB70016D,0xFF5400AA,0xFF3C0034,0xFF340005,0xF5100000,0xFD7401E1,0xFF5C0145,0x95FC0288,0xD6D80001, -0x95FC0288,0x1D802F9,0xFFC401CD,0xFFB80089,0xFFB00000,0xC3FC02F9,0xFFA00112,0xFF880001,0xE1FC02F9,0xFF100000,0xEA0002F9,0xC3FC02F9,0xFFA00112,0xFF880001,0xE1FC02F9,0xFF100000,0xEA0002F9,0xE1FC02F9,0xFF100000,0xEA0002F9,0xEA0002F9,0xC3FC02F9,0xFFA00112,0xFF880001,0xE1FC02F9,0xFF100000,0xEA0002F9,0xE1FC02F9,0xFF100000,0xEA0002F9,0xEA0002F9,0xE1FC02F9, 
-0xFF100000,0xEA0002F9,0xEA0002F9,0xEA0002F9,0xFFD002AD,0x1F802F9,0xF3D402D4,0xFFC00244,0xFFA801CA,0xFF7000E8,0xFF3C0000,0xFED80000,0xF9D002AD,0xFDBC0244,0xFF580019,0xEA0002F9,0xDBFC02F9,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0x13C0374,0xFF180008,0xFF180008,0xFF180008,0xFF180008,0xFF180008, -0xFF180008,0xC9140000,0xC9140000,0xC9140000,0xB3140001,0x1D80372,0x1D80372,0x1D80372,0x1D80372,0x1D80372,0x1D80372,0xD8C80001,0xD8C80001,0xD8C80001,0xB2EC0001,0x71F80372,0x71F80372,0x71F80372,0xB26C0001,0x9E000372,0xF5380288,0x13C0374,0x13C0374,0xFF2C0171,0xFF2800C8,0xFF1C0050,0xFF1C0050,0xF1140000,0xFF200239,0xFF200152,0xDF080000,0xD8C80001, -0x4BFC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table153[] = { -0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x3B00000,0x5DFC0000, -0x5DFC0000,0x5DFC0000,0x5DFC0000,0x90000001,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1240000,0x1380000,0x1380000,0x1380000,0x3B00000,0x33FC0000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x1780000,0x35FC0000,0x35FC0000,0x35FC0000,0x35FC0000,0x35FC0000, -0x35FC0000,0x9DF40000,0x9DF40000,0x9DF40000,0xBA000001,0x35FC0000,0x35FC0000,0x35FC0000,0x35FC0000,0x35FC0000,0x35FC0000,0x9DF40000,0x9DF40000,0x9DF40000,0xBA000001,0x9DF40000,0x9DF40000,0x9DF40000,0xBA000001,0xBA000001,0x1900000,0x1780000,0x1780000,0x3B40000,0x1D80000,0x5FC0000,0x5FC0000,0x5BFC0000,0x3B40000,0x1D80000,0x81FC0000,0x9DF40000, -0x81FC0000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xD1FC0000,0xD1FC0000,0xDE000001,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xD1FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xD1FC0000,0xDE000001,0xDE000001,0xA3FC0000,0xA3FC0000,0xA3FC0000,0xD1FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xD1FC0000,0xDE000001,0xDE000001,0xD1FC0000, -0xD1FC0000,0xDE000001,0xDE000001,0xDE000001,0x27FC0000,0x7DC0000,0x1C00000,0x8DFC0000,0xB3FC0000,0xC5FC0000,0xCBFC0000,0xD5FC0000,0x69FC0000,0xA3FC0000,0xC5FC0000,0xDE000001,0xC5FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x19819DD,0xFF801338,0xFF7C0DD1,0xFF740BE8,0xFF740F69,0xFF64088E,0xFF5C05E5,0xFF58068D,0xFD4C0378,0xEF4C0555,0xFF6810D2,0xFF4C0853,0xFF4404DA,0xFF380465,0xFF2C0029,0xF12C0235,0xFF2808DD,0xF51C032E,0xEB1C0319,0xDF2408CA,0x63FC19DB,0xFF380FF4,0xFF300BE8,0xFF1409E9,0xFEFC03ED,0xEF080553,0xFEF00B8E,0xFEC0029E,0xECC401FA,0xDED808CA,0xB3F819DB, -0xFE580BE8,0xEE280553,0xDC0008D2,0xCA0019DB,0xFF801459,0xFF8C17D1,0xFF8C1891,0xFF700E65,0xFF50086A,0xFF3C02F4,0xFF3400D5,0xFD240006,0xFF7813DA,0xFF5C0D8F,0xFF1C02E2,0xECC401FA,0x9FF819DB,0x1BC08C9,0xFFB005F4,0xFFA00371,0xFF9C0288,0xFFA40569,0xFF9001F8,0xFF840088,0xFF7C0221,0xFB780034,0xEF7801E1,0x9BFC08C9,0xFF7C0473,0xFF6C0288,0xFF580362,0xFF380001, 
-0xEF4401E2,0xCFF808C9,0xFED40288,0xEEA801E1,0xDE0008CA,0x9BFC08C9,0xFF7C0473,0xFF6C0288,0xFF580362,0xFF380001,0xEF4401E2,0xCFF808C9,0xFED40288,0xEEA801E1,0xDE0008CA,0xCFF808C9,0xFED40288,0xEEA801E1,0xDE0008CA,0xDE0008CA,0xFFB4074A,0xF5B80845,0xF7BC0849,0xFF9805B9,0xFF7C03C5,0xFF5C0171,0xFF500019,0xFF180001,0xFFA4076A,0xFF90057E,0xFF3002AE,0xEEA801E1, -0xC1FC08C9,0x1740BE8,0x1740BE8,0x1740BE8,0x1740BE8,0xFF5C05E5,0xFF5C05E5,0xFF5C05E5,0xFB500374,0xFB500374,0xDF4C0375,0xFF4404DA,0xFF4404DA,0xFF4404DA,0xFF2C0029,0xFF2C0029,0xE33000A6,0xEB240288,0xEB240288,0xDB200041,0xCD24028A,0x2FFC0BE8,0x2FFC0BE8,0x2FFC0BE8,0xFEFC03ED,0xFEFC03ED,0xDF180373,0xFEC0029E,0xFEC0029E,0xE0D80002,0xCEE8028A,0x99FC0BE8, -0x99FC0BE8,0xDE680373,0xCC30028A,0xB8000BEB,0xFF6808CE,0xFF6C0A8D,0x1740BE8,0xFF540625,0xFF440371,0xFF3C0164,0xFF3400D5,0xFB240002,0xFF58089A,0xFF5005B6,0xFF1C02B1,0xE0D80002,0x7FF80BE8,0x19C0288,0x19C0288,0x19C0288,0x19C0288,0xFF840088,0xFF840088,0xFF840088,0xF1780000,0xF1780000,0xDF780001,0x6BFC0288,0x6BFC0288,0x6BFC0288,0xFF380001,0xFF380001, -0xE1540001,0xB7F80288,0xB7F80288,0xDEE80001,0xCC00028A,0x6BFC0288,0x6BFC0288,0x6BFC0288,0xFF380001,0xFF380001,0xE1540001,0xB7F80288,0xB7F80288,0xDEE80001,0xCC00028A,0xB7F80288,0xB7F80288,0xDEE80001,0xCC00028A,0xCC00028A,0xFF9001E1,0xF5980221,0x19C0288,0xFD840188,0xFF7000DD,0xFF580064,0xFF500019,0xFD200000,0xFD880200,0xFF740171,0xA3FC0288,0xDEE80001, -0xA3FC0288,0x1E001E1,0xFFD00121,0xFFC80061,0xFFC00000,0xCFFC01E1,0xFFB800AA,0xFFA00001,0xE7FC01E1,0xFF400000,0xEE0001E1,0xCFFC01E1,0xFFB800AA,0xFFA00001,0xE7FC01E1,0xFF400000,0xEE0001E1,0xE7FC01E1,0xFF400000,0xEE0001E1,0xEE0001E1,0xCFFC01E1,0xFFB800AA,0xFFA00001,0xE7FC01E1,0xFF400000,0xEE0001E1,0xE7FC01E1,0xFF400000,0xEE0001E1,0xEE0001E1,0xE7FC01E1, -0xFF400000,0xEE0001E1,0xEE0001E1,0xEE0001E1,0xFFD001BD,0x7FC01E1,0xF7DC01C4,0xFFC80179,0xFFBC0122,0xFF900092,0xFF640000,0xFF140000,0xFDD801A5,0xFFC00164,0xFF780010,0xEE0001E1,0xE1FC01E1,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0x14C0374,0xFF28001D,0xFF28001D,0xFF28001D,0xFF28001D,0xFF28001D, -0xFF28001D,0xD1240000,0xD1240000,0xD1240000,0xBB240001,0x1F00372,0x1F00372,0x1F00372,0x1F00372,0x1F00372,0x1F00372,0xE0D80001,0xE0D80001,0xE0D80001,0xBAFC0001,0x7DF80372,0x7DF80372,0x7DF80372,0xBA7C0001,0xA6000372,0xFD480288,0x14C0374,0x14C0374,0xFF3C0190,0xFF3400E9,0xFF340071,0xFF340071,0xF9240000,0xFD380244,0xFF34016D,0xE7180000,0xE0D80001, -0x5BFC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table154[] = { -0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0x69FC0000, -0x69FC0000,0x69FC0000,0x69FC0000,0x98000001,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1340000,0x1480000,0x1480000,0x1480000,0x1C80000,0x41FC0000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000, -0x4DFC0000,0xA7FC0000,0xA7FC0000,0xA7FC0000,0xC2000001,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0x4DFC0000,0xA7FC0000,0xA7FC0000,0xA7FC0000,0xC2000001,0xA7FC0000,0xA7FC0000,0xA7FC0000,0xC2000001,0xC2000001,0x9A00000,0x1880000,0x1880000,0x1C80000,0x1EC0000,0x23FC0000,0x23FC0000,0x6FFC0000,0x1C80000,0x1EC0000,0x91FC0000,0xA7FC0000, 
-0x91FC0000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xDDFC0000,0xDDFC0000,0xE6000001,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xDDFC0000,0xDDFC0000,0xE6000001,0xDDFC0000,0xDDFC0000,0xE6000001,0xE6000001,0xBBFC0000,0xBBFC0000,0xBBFC0000,0xDDFC0000,0xDDFC0000,0xE6000001,0xDDFC0000,0xDDFC0000,0xE6000001,0xE6000001,0xDDFC0000, -0xDDFC0000,0xE6000001,0xE6000001,0xE6000001,0x5FFC0000,0xFEC0000,0x1D00000,0xABFC0000,0xC7FC0000,0xD5FC0000,0xD9FC0000,0xE1F40000,0x91FC0000,0xBBFC0000,0xD5FC0000,0xE6000001,0xD5FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1A416D1,0xFF981198,0xFF880D5D,0xFF840BE8,0xFF800E01,0xFF740886,0xFF740675,0xFF6405F9,0xFF60037D,0xF35C047D,0xFF740EFE,0xFF64082B,0xFF5C056A,0xFF4C03FE,0xFF40007D,0xF53C016E,0xFF38072E,0xF93002C2,0xEF3001F9,0xE33406EA,0x75FC16CF,0xFF580F0C,0xFF480BE8,0xFF2C0911,0xFF20044D,0xF31C047B,0xFF080A0E,0xFEE402DE,0xF2DC0112,0xE2EC06EA,0xBBFC16CF, -0xFE880BE8,0xF248047B,0xE20806EA,0xD00016CF,0xFF8C1241,0xF9A0153D,0xF9A015C4,0xFF800D27,0xFF68080D,0xFF54034B,0xFF4C015D,0xFF38000D,0xFF9011EA,0xFF740C8A,0xFF340318,0xF2DC0112,0xA9FC16CF,0x1C806E9,0xFFBC04EC,0xFFAC0331,0xFFAC0288,0xFFB0043D,0xFFA001C2,0xFF9800B9,0xFF940169,0xFD880008,0xF3880109,0xAFFC06E9,0xFF9403DB,0xFF840288,0xFF70028A,0xFF580019, -0xF358010A,0xD7FC06E9,0xFF040288,0xF2C80109,0xE20006EA,0xAFFC06E9,0xFF9403DB,0xFF840288,0xFF70028A,0xFF580019,0xF358010A,0xD7FC06E9,0xFF040288,0xF2C80109,0xE20006EA,0xD7FC06E9,0xFF040288,0xF2C80109,0xE20006EA,0xE20006EA,0xFFBC05CD,0xFBC4066D,0xFBC40681,0xFFB004A6,0xFF900329,0xFF7C0153,0xFF680041,0xFF380009,0xFFB405D6,0xFFA40482,0xFF5002A1,0xF2C80109, -0xCDFC06E9,0x1840BE8,0x1840BE8,0x1840BE8,0x1840BE8,0xFF740675,0xFF740675,0xFF740675,0xFF60037D,0xFF60037D,0xE75C0375,0xFF5C056A,0xFF5C056A,0xFF5C056A,0xFF40007D,0xFF40007D,0xEB4000A6,0xF3340288,0xF3340288,0xE3300041,0xD534028A,0x47FC0BE8,0x47FC0BE8,0x47FC0BE8,0xFF20044D,0xFF20044D,0xE7280373,0xFEE402DE,0xFEE402DE,0xE8E80002,0xD6F8028A,0xA5F80BE8, -0xA5F80BE8,0xE6780373,0xD440028A,0xC0000BEB,0xFF740934,0xF9800AC4,0x1840BE8,0xFF6806BD,0xFF580419,0xFF4C021D,0xFF4C015D,0xFF38000D,0xFD70090C,0xFF5C0656,0xFF3402F4,0xE8E80002,0x8DFC0BE8,0x1AC0288,0x1AC0288,0x1AC0288,0x1AC0288,0xFF9800B9,0xFF9800B9,0xFF9800B9,0xF9880000,0xF9880000,0xE7880001,0x83FC0288,0x83FC0288,0x83FC0288,0xFF580019,0xFF580019, -0xE9640001,0xC3F80288,0xC3F80288,0xE6F80001,0xD400028A,0x83FC0288,0x83FC0288,0x83FC0288,0xFF580019,0xFF580019,0xE9640001,0xC3F80288,0xC3F80288,0xE6F80001,0xD400028A,0xC3F80288,0xC3F80288,0xE6F80001,0xD400028A,0xD400028A,0xFFA00202,0xFDA80221,0x1AC0288,0xFF9801A5,0xFF84010D,0xFF740091,0xFF680041,0xFF380009,0xFF940212,0xFB9001A5,0xB3FC0288,0xE6F80001, -0xB3FC0288,0x1E80109,0xFFDC009D,0xFFD80034,0xFFD00000,0xDBFC0109,0xFFCC0064,0xFFB80001,0xEDFC0109,0xFF700000,0xF2000109,0xDBFC0109,0xFFCC0064,0xFFB80001,0xEDFC0109,0xFF700000,0xF2000109,0xEDFC0109,0xFF700000,0xF2000109,0xF2000109,0xDBFC0109,0xFFCC0064,0xFFB80001,0xEDFC0109,0xFF700000,0xF2000109,0xEDFC0109,0xFF700000,0xF2000109,0xF2000109,0xEDFC0109, 
-0xFF700000,0xF2000109,0xF2000109,0xF2000109,0xFBE400F2,0x47FC0109,0xFBE400F4,0xFFD800CA,0xFFD000A2,0xFFAC0050,0xFF8C0000,0xFF500000,0xFFDC00E1,0xFFD800C8,0xFF9C0009,0xF2000109,0xE9FC0109,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0x15C0374,0xFF3C0034,0xFF3C0034,0xFF3C0034,0xFF3C0034,0xFF3C0034, -0xFF3C0034,0xD9340000,0xD9340000,0xD9340000,0xC3340001,0xDFC0372,0xDFC0372,0xDFC0372,0xDFC0372,0xDFC0372,0xDFC0372,0xE8E80001,0xE8E80001,0xE8E80001,0xC30C0001,0x89F80372,0x89F80372,0x89F80372,0xC28C0001,0xAE000372,0xF55802AD,0x15C0374,0x15C0374,0xFF4C01B1,0xFD4C0120,0xFF440091,0xFF440091,0xFF340001,0xF94C0265,0xFB4801A5,0xEF280000,0xE8E80001, -0x69FC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table155[] = { -0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0x75FC0000, -0x75FC0000,0x75FC0000,0x75FC0000,0xA0000001,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x5580000,0x5580000,0x5580000,0x1E00000,0x51FC0000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x1980000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000, -0x65FC0000,0xB3FC0000,0xB3FC0000,0xB3FC0000,0xCA000001,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0x65FC0000,0xB3FC0000,0xB3FC0000,0xB3FC0000,0xCA000001,0xB3FC0000,0xB3FC0000,0xB3FC0000,0xCA000001,0xCA000001,0x1B40000,0x1980000,0x1980000,0x5D80000,0x7FC0000,0x41FC0000,0x41FC0000,0x83FC0000,0x5D80000,0x7FC0000,0x9FFC0000,0xB3FC0000, -0x9FFC0000,0x1E00000,0x1E00000,0x1E00000,0x1E00000,0xD3FC0000,0xD3FC0000,0xD3FC0000,0xE9FC0000,0xE9FC0000,0xEE000001,0xD3FC0000,0xD3FC0000,0xD3FC0000,0xE9FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xE9FC0000,0xEE000001,0xEE000001,0xD3FC0000,0xD3FC0000,0xD3FC0000,0xE9FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xE9FC0000,0xEE000001,0xEE000001,0xE9FC0000, -0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0x97FC0000,0x17FC0000,0x1E00000,0xC9FC0000,0xDBFC0000,0xE3FC0000,0xE7F80000,0xEBF80000,0xB7FC0000,0xD3FC0000,0xE3FC0000,0xEE000001,0xE3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1B01425,0xFFA4102C,0xFF940D09,0xFF940BE8,0xFF980CD1,0xFF840898,0xFF8006ED,0xFF7C05B1,0xFF7003B4,0xF76C03E5,0xFF8C0D6E,0xFF74082B,0xFF68060E,0xFF6403D6,0xFF54010D,0xF95000EA,0xFF4C0601,0xFD44028E,0xF53C011D,0xE944055E,0x87FC1423,0xFF700E44,0xFF600BE8,0xFF4C089F,0xFF3804C5,0xF73003E3,0xFF2008EE,0xFEFC034E,0xF6F00072,0xE900055E,0xC5F81423, -0xFEB80BE8,0xF66803E3,0xE820055E,0xD6001423,0xFFA0103F,0xFFAC12A9,0xFFAC133C,0xFF940C5A,0xFF7807F2,0xFF6803C5,0xFF5C0214,0xFF48005E,0xFF981017,0xFF800B9F,0xFF50039A,0xF6F00072,0xB5FC1423,0x1D00561,0xFFC40411,0xFFC002F4,0xFFBC0288,0xFFBC0359,0xFFB001B2,0xFFA80104,0xFFA000F1,0xFF980004,0xF7980071,0xBDFC055E,0xFFAC0363,0xFF9C0288,0xFF8801F2,0xFF700049, 
-0xF76C0072,0xDFF8055E,0xFF340288,0xF6E80071,0xE800055E,0xBDFC055E,0xFFAC0363,0xFF9C0288,0xFF8801F2,0xFF700049,0xF76C0072,0xDFF8055E,0xFF340288,0xF6E80071,0xE800055E,0xDFF8055E,0xFF340288,0xF6E80071,0xE800055E,0xE800055E,0xFDCC049A,0xFFCC04F5,0xFFCC0519,0xFFC003C4,0xFFA402B5,0xFF90016B,0xFF7C0082,0xFF640032,0xFFC4049D,0xFFB003C2,0xFF700298,0xF6E80071, -0xD7FC055E,0x1940BE8,0x1940BE8,0x1940BE8,0x1940BE8,0xFF8006ED,0xFF8006ED,0xFF8006ED,0xFF7003B4,0xFF7003B4,0xEF6C0375,0xFF68060E,0xFF68060E,0xFF68060E,0xFF54010D,0xFF54010D,0xF35000A6,0xFB440288,0xFB440288,0xEB400041,0xDD44028A,0x5FFC0BE8,0x5FFC0BE8,0x5FFC0BE8,0xFF3804C5,0xFF3804C5,0xEF380373,0xFEFC034E,0xFEFC034E,0xF0F80002,0xDF08028A,0xB1F80BE8, -0xB1F80BE8,0xEE880373,0xDC50028A,0xC8000BEB,0xFF84097D,0xFF8C0AD8,0x1940BE8,0xFF80074E,0xFF6C04D1,0xFF6002D9,0xFF5C0214,0xFF48005E,0xFF800956,0xFF7406E1,0xFF500381,0xF0F80002,0x9DF80BE8,0x1BC0288,0x1BC0288,0x1BC0288,0x1BC0288,0xFFA80104,0xFFA80104,0xFFA80104,0xFF980004,0xFF980004,0xEF980001,0x9BFC0288,0x9BFC0288,0x9BFC0288,0xFF700049,0xFF700049, -0xF1740001,0xCFF80288,0xCFF80288,0xEF080001,0xDC00028A,0x9BFC0288,0x9BFC0288,0x9BFC0288,0xFF700049,0xFF700049,0xF1740001,0xCFF80288,0xCFF80288,0xEF080001,0xDC00028A,0xCFF80288,0xCFF80288,0xEF080001,0xDC00028A,0xDC00028A,0xFBB40221,0xF5B80244,0x1BC0288,0xFFA401D4,0xFF980145,0xFF8C00E1,0xFF7C0082,0xFF640032,0xFDB00221,0xFFA401C2,0xC1FC0288,0xEF080001, -0xC1FC0288,0x1F00071,0xFFE80041,0xFFE40014,0xFFE00000,0xE9FC0071,0xFFD80028,0xFFD00000,0xF3FC0071,0xFFA00000,0xF6000071,0xE9FC0071,0xFFD80028,0xFFD00000,0xF3FC0071,0xFFA00000,0xF6000071,0xF3FC0071,0xFFA00000,0xF6000071,0xF6000071,0xE9FC0071,0xFFD80028,0xFFD00000,0xF3FC0071,0xFFA00000,0xF6000071,0xF3FC0071,0xFFA00000,0xF6000071,0xF6000071,0xF3FC0071, -0xFFA00000,0xF6000071,0xF6000071,0xF6000071,0xFFEC0062,0x87FC0071,0xFFEC0064,0xFDE80055,0xFDE40048,0xFFC8001D,0xFFB40000,0xFF8C0000,0xFDEC0062,0xFFE40055,0xFFBC0004,0xF6000071,0xF1FC0071,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0x16C0374,0xFF500055,0xFF500055,0xFF500055,0xFF500055,0xFF500055, -0xFF500055,0xE1440000,0xE1440000,0xE1440000,0xCB440001,0x25FC0372,0x25FC0372,0x25FC0372,0x25FC0372,0x25FC0372,0x25FC0372,0xF0F80001,0xF0F80001,0xF0F80001,0xCB1C0001,0x95F80372,0x95F80372,0x95F80372,0xCA9C0001,0xB6000372,0xFD6802AD,0x16C0374,0x16C0374,0xFD6401E1,0xFF580145,0xFF5400B9,0xFF5400B9,0xFF48000D,0xFF580271,0xFF5001BD,0xF7380000,0xF0F80001, -0x79FC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table156[] = { -0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x1FC0000,0x83F80000, -0x83F80000,0x83F80000,0x83F80000,0xAA000000,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x1540001,0x16C0000,0x16C0000,0x16C0000,0x1FC0000,0x61FC0000,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000, -0x81FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0x81FC0000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0xD4000000,0xBC40000,0x1A80001,0x1A80001,0x1F00000,0x33FC0000,0x63FC0000,0x63FC0000,0x99FC0000,0x1F00000,0x33FC0000,0xB1FC0000,0xC1FC0000, 
-0xB1FC0000,0x1F00001,0x1F00001,0x1F00001,0x1F00001,0xEFFC0000,0xEFFC0000,0xEFFC0000,0xF7F80000,0xF7F80000,0xF8000000,0xEFFC0000,0xEFFC0000,0xEFFC0000,0xF7F80000,0xF7F80000,0xF8000000,0xF7F80000,0xF7F80000,0xF8000000,0xF8000000,0xEFFC0000,0xEFFC0000,0xEFFC0000,0xF7F80000,0xF7F80000,0xF8000000,0xF7F80000,0xF7F80000,0xF8000000,0xF8000000,0xF7F80000, -0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xD7FC0000,0xA7FC0000,0x1F00001,0xEBFC0000,0xF1FC0000,0xF5FC0000,0xF5FC0000,0xF7FC0000,0xE3FC0000,0xEFFC0000,0xF5FC0000,0xF8000000,0xF5FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1BC1194,0xFFB00ED7,0xFFAC0CAC,0xFFA40BEB,0xFFA40BBE,0xFF9808C3,0xFF9807A2,0xFF8805A4,0xFF88043B,0xFB800386,0xFF980C15,0xFF8C0852,0xFF8006C9,0xFF7003FF,0xFF6C01EE,0xFD6400A9,0xFF64052C,0xFF58029B,0xF950007E,0xEF540409,0x9BFC1194,0xFF880D81,0xFF7C0BE8,0xFF64083E,0xFF58058D,0xFD440386,0xFF380829,0xFF2003F9,0xFB080015,0xEF140408,0xCFF81194, -0xFEF00BE8,0xFA8C0386,0xEE400408,0xDC001198,0xFFB40EBF,0xF5B810AC,0xF7BC1114,0xFF980B8A,0xFF8C0803,0xFF7C0476,0xFF78030B,0xFF640105,0xFDB00E9B,0xFF980AFA,0xFF640451,0xFB080015,0xC1FC1194,0x1DC040B,0xFFD4035E,0xFFCC02CA,0xFFCC028A,0xFFD0029B,0xFFC401B6,0xFFBC0151,0xFFB800C3,0xFFAC0032,0xFBA80015,0xCFFC0408,0xFFC002FE,0xFFB80288,0xFFA00196,0xFF94009D, -0xFB840014,0xE7FC0408,0xFF6C0288,0xFB0C0014,0xEE000408,0xCFFC0408,0xFFC002FE,0xFFB80288,0xFFA00196,0xFF94009D,0xFB840014,0xE7FC0408,0xFF6C0288,0xFB0C0014,0xEE000408,0xE7FC0408,0xFF6C0288,0xFB0C0014,0xEE000408,0xEE000408,0xFFD80385,0xF7DC03D5,0xF7DC03EA,0xFDD00321,0xFFC0025A,0xFFA80179,0xFFA000DA,0xFF7C0082,0xFFD00395,0xFFC802E4,0xFF980291,0xFB0C0014, -0xE1FC0408,0x1A40BEB,0x1A40BEB,0x1A40BEB,0x1A40BEB,0xFF9807A2,0xFF9807A2,0xFF9807A2,0xFF88043B,0xFF88043B,0xF9800372,0xFF8006C9,0xFF8006C9,0xFF8006C9,0xFF6C01EE,0xFF6C01EE,0xFD6400A5,0xFF58029B,0xFF58029B,0xF5540042,0xE7540289,0x7BFC0BE8,0x7BFC0BE8,0x7BFC0BE8,0xFF58058D,0xFF58058D,0xF9480372,0xFF2003F9,0xFF2003F9,0xF9080005,0xE71C0288,0xBFF80BE8, -0xBFF80BE8,0xF8940372,0xE6600288,0xD2000BE8,0xFFA009E1,0xFBA40B0A,0x1A40BEB,0xFF9007E6,0xFF8005BA,0xFF7803C3,0xFF78030B,0xFF640105,0xFF9409A4,0xFF8007C1,0xFF640438,0xF9080005,0xADFC0BE8,0x1CC028A,0x1CC028A,0x1CC028A,0x1CC028A,0xFFBC0151,0xFFBC0151,0xFFBC0151,0xFFAC0032,0xFFAC0032,0xF9A80001,0xB7FC0288,0xB7FC0288,0xB7FC0288,0xFF94009D,0xFF94009D, -0xF9880000,0xDDF40288,0xDDF40288,0xF9140000,0xE6000288,0xB7FC0288,0xB7FC0288,0xB7FC0288,0xFF94009D,0xFF94009D,0xF9880000,0xDDF40288,0xDDF40288,0xF9140000,0xE6000288,0xDDF40288,0xDDF40288,0xF9140000,0xE6000288,0xE6000288,0xFDC80242,0xFFCC0242,0x1CC028A,0xFDC00202,0xFFAC0195,0xFFA80128,0xFFA000DA,0xFF7C0082,0xFFC40242,0xFBC00200,0xD3FC0288,0xF9140000, -0xD3FC0288,0x1F80012,0xFFF4000A,0xFFF00005,0xFFF00001,0xF7FC0012,0xFFF00005,0xFFEC0000,0xFBFC0012,0xFFD80000,0xFA000014,0xF7FC0012,0xFFF00005,0xFFEC0000,0xFBFC0012,0xFFD80000,0xFA000014,0xFBFC0012,0xFFD80000,0xFA000014,0xFA000014,0xF7FC0012,0xFFF00005,0xFFEC0000,0xFBFC0012,0xFFD80000,0xFA000014,0xFBFC0012,0xFFD80000,0xFA000014,0xFA000014,0xFBFC0012, 
-0xFFD80000,0xFA000014,0xFA000014,0xFA000014,0xFFF8000D,0xD7FC0012,0xF5F80012,0xFFF4000D,0xFDF4000D,0xFFE80005,0xFFE00000,0xFFD00000,0xF5FC0012,0xFFF00011,0xFFE40001,0xFA000014,0xFBFC0012,0x1800372,0x1800372,0x1800372,0x1800372,0x1800372,0x1800372,0x1800372,0x1800372,0x1800372,0x1800372,0xFF680091,0xFF680091,0xFF680091,0xFF680091,0xFF680091, -0xFF680091,0xE9580001,0xE9580001,0xE9580001,0xD5540001,0x41FC0372,0x41FC0372,0x41FC0372,0x41FC0372,0x41FC0372,0x41FC0372,0xFB080001,0xFB080001,0xFB080001,0xD52C0000,0xA1FC0372,0xA1FC0372,0xA1FC0372,0xD4A80000,0xBE000374,0xF77C02D2,0x1800372,0x1800372,0xFF740202,0xFF6C0179,0xFF6800FA,0xFF6800FA,0xFF5C002D,0xFD70028A,0xFF6401F9,0xFD4C0001,0xFB080001, -0x89FC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table157[] = { -0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x19FC0000,0x8FF80000, -0x8FF80000,0x8FF80000,0x8FF80000,0xB2000000,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x1640001,0x77C0000,0x77C0000,0x77C0000,0x19FC0000,0x71FC0000,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x1B80001,0x99FC0000,0x99FC0000,0x99FC0000,0x99FC0000,0x99FC0000, -0x99FC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xDC000000,0x99FC0000,0x99FC0000,0x99FC0000,0x99FC0000,0x99FC0000,0x99FC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xDC000000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xDC000000,0xDC000000,0x1D80000,0x1B80001,0x1B80001,0x11FC0000,0x5BFC0000,0x81FC0000,0x81FC0000,0xADFC0000,0x11FC0000,0x5BFC0000,0xBFFC0000,0xCDFC0000, -0xBFFC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1C80F44,0xFFBC0D5F,0xFFB80BD4,0xFFB40B53,0xFFB00ABA,0xFFA4089F,0xFFA407BE,0xFFA005B4,0xFF94049B,0xFF900372,0xFFA40AD9,0xFF980816,0xFF9806F5,0xFF880417,0xFF7C0296,0xFF7400C1,0xFF7C0494,0xFF7002A3,0xFD64003A,0xF3640305,0xABFC0F44,0xFF940C75,0xFF900B53,0xFF7C07D6,0xFF7005E5,0xFF580374,0xFF580773,0xFF380455,0xFF200005,0xF3280304,0xD7F80F44, -0xFF1C0B53,0xFEAC0372,0xF2600304,0xE2000F44,0xFFBC0CF6,0xFBC40E6C,0xFBC40EDC,0xFFAC0A8E,0xFF9C07CB,0xFF9004FA,0xFF8803B2,0xFF7401A6,0xFFB40CB7,0xFFA40A4C,0xFF780483,0xFF200005,0xCBFC0F44,0x1E80303,0xFFE002AE,0xFFE00263,0xFFDC0242,0xFFDC0213,0xFFD0019A,0xFFD0015A,0xFFC000D1,0xFFC0006D,0xFFB80001,0xDFFC0303,0xFFCC0282,0xFFCC0242,0xFFB80156,0xFFAC00CD, 
-0xFF980000,0xEFFC0303,0xFF940242,0xFF2C0000,0xF2000304,0xDFFC0303,0xFFCC0282,0xFFCC0242,0xFFB80156,0xFFAC00CD,0xFF980000,0xEFFC0303,0xFF940242,0xFF2C0000,0xF2000304,0xEFFC0303,0xFF940242,0xFF2C0000,0xF2000304,0xF2000304,0xFFE402C1,0xFBE402D9,0xFDE802EE,0xFFDC025D,0xFFD40202,0xFFB80171,0xFFB80104,0xFFA000B4,0xFFDC02BD,0xFFDC025A,0xFFB40246,0xFF2C0000, -0xEBFC0303,0x1B40B53,0x1B40B53,0x1B40B53,0x1B40B53,0xFFA407BE,0xFFA407BE,0xFFA407BE,0xFF94049B,0xFF94049B,0xFF900372,0xFF9806F5,0xFF9806F5,0xFF9806F5,0xFF7C0296,0xFF7C0296,0xFF7400C1,0xFF7002A3,0xFF7002A3,0xFB600032,0xED640245,0x8FFC0B53,0x8FFC0B53,0x8FFC0B53,0xFF7005E5,0xFF7005E5,0xFF580374,0xFF380455,0xFF380455,0xFF200005,0xED2C0244,0xC9F80B53, -0xC9F80B53,0xFEAC0372,0xEC780244,0xDA000B54,0xFFAC09A2,0xFFAC0A9E,0x1B40B53,0xFFA4081B,0xFF940612,0xFF900481,0xFF8803B2,0xFF7401A6,0xFFA0097D,0xFF9807C5,0xFF780473,0xFF200005,0xBBFC0B53,0x1DC0242,0x1DC0242,0x1DC0242,0x1DC0242,0xFFD0015A,0xFFD0015A,0xFFD0015A,0xFFC0006D,0xFFC0006D,0xFFB80001,0xCDFC0242,0xCDFC0242,0xCDFC0242,0xFFAC00CD,0xFFAC00CD, -0xFF980000,0xE7F80242,0xE7F80242,0xFF2C0000,0xEC000244,0xCDFC0242,0xCDFC0242,0xCDFC0242,0xFFAC00CD,0xFFAC00CD,0xFF980000,0xE7F80242,0xE7F80242,0xFF2C0000,0xEC000244,0xE7F80242,0xE7F80242,0xFF2C0000,0xEC000244,0xEC000244,0xFFD80200,0xF7DC0221,0x1DC0242,0xFBD401E1,0xFFC80190,0xFFB80140,0xFFB80104,0xFFA000B4,0xFFD00208,0xFFC801D4,0xDFFC0242,0xFF2C0000, -0xDFFC0242,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1900372,0x1900372,0x1900372,0x1900372,0x1900372,0x1900372,0x1900372,0x1900372,0x1900372,0x1900372,0xFF7400C1,0xFF7400C1,0xFF7400C1,0xFF7400C1,0xFF7400C1, -0xFF7400C1,0xF1680001,0xF1680001,0xF1680001,0xDD640001,0x59FC0372,0x59FC0372,0x59FC0372,0x59FC0372,0x59FC0372,0x59FC0372,0xFF200005,0xFF200005,0xFF200005,0xDD3C0000,0xADFC0372,0xADFC0372,0xADFC0372,0xDCB80000,0xC6000374,0xFF8C02D2,0x1900372,0x1900372,0xFF840225,0xFF8001A9,0xFF780132,0xFF780132,0xFF700059,0xF98402AD,0xFF780212,0xFF60000D,0xFF200005, -0x99FC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table158[] = { -0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x31FC0000,0x9BF80000, -0x9BF80000,0x9BF80000,0x9BF80000,0xBA000000,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0x1740001,0xF8C0000,0xF8C0000,0xF8C0000,0x31FC0000,0x7FFC0000,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0x1C80001,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000, -0xB1FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xE4000000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xB1FC0000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xE4000000,0xD9FC0000,0xD9FC0000,0xD9FC0000,0xE4000000,0xE4000000,0x1E80000,0x1C80001,0x1C80001,0x49FC0000,0x81FC0000,0x9FFC0000,0x9FFC0000,0xC1FC0000,0x49FC0000,0x81FC0000,0xCFFC0000,0xD9FC0000, 
-0xCFFC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1D00C14,0xFFC40AAC,0xFFC00997,0xFFC00933,0xFFBC08C2,0xFFB4073B,0xFFB00696,0xFFAC050C,0xFFA00453,0xFFA00372,0xFFB00891,0xFFA40686,0xFFA405A5,0xFF940387,0xFF900266,0xFF8C0109,0xFF880354,0xFF7C01DB,0xFF74000A,0xF77401C5,0xB7FC0C14,0xFFAC09FD,0xFFA00934,0xFF8806A6,0xFF7C053D,0xFF700374,0xFF7005BB,0xFF58036A,0xFF380025,0xF73C01C4,0xDDF40C14, -0xFF400933,0xFEE00372,0xF68001C4,0xE6000C14,0xFFC40A74,0xFFCC0B5C,0xFFCC0BBC,0xFFBC089D,0xFFAC066B,0xFF98043E,0xFF900351,0xFF840195,0xFFBC0A41,0xFFB00837,0xFF9003AD,0xFF380025,0xD3FC0C14,0x1EC01C3,0xFFE8018B,0xFFE40162,0xFFE40152,0xFFE00141,0xFFDC00EE,0xFFDC00CA,0xFFD80079,0xFFCC0041,0xFFC80001,0xE5FC01C3,0xFFD80176,0xFFD80152,0xFFCC00D1,0xFFB8007D, -0xFFB00000,0xF3F801C3,0xFFAC0152,0xFF600000,0xF60001C4,0xE5FC01C3,0xFFD80176,0xFFD80152,0xFFCC00D1,0xFFB8007D,0xFFB00000,0xF3F801C3,0xFFAC0152,0xFF600000,0xF60001C4,0xF3F801C3,0xFFAC0152,0xFF600000,0xF60001C4,0xF60001C4,0xFBEC01A1,0xFFEC01A1,0xFFEC01B2,0xFFDC016D,0xFFDC0125,0xFFD000E3,0xFFC8009D,0xFFB8006A,0xF9EC01A1,0xFFDC015A,0xFFC80156,0xFF600000, -0xEFFC01C3,0x1C00933,0x1C00933,0x1C00933,0x1C00933,0xFFB00696,0xFFB00696,0xFFB00696,0xFFA00453,0xFFA00453,0xFFA00372,0xFFA405A5,0xFFA405A5,0xFFA405A5,0xFF900266,0xFF900266,0xFF8C0109,0xFF7C01DB,0xFF7C01DB,0xFD740009,0xF1740155,0xA3FC0933,0xA3FC0933,0xA3FC0933,0xFF7C053D,0xFF7C053D,0xFF700374,0xFF58036A,0xFF58036A,0xFF380025,0xF1400154,0xD1FC0933, -0xD1FC0933,0xFEE00372,0xF0980154,0xDE000934,0xFFBC07F2,0xF7BC08BB,0x1C00933,0xFFAC06AE,0xFFA00542,0xFF9803DA,0xFF900351,0xFF840195,0xFFB407AE,0xFFB0067E,0xFF9003A4,0xFF380025,0xC5FC0933,0x1E40152,0x1E40152,0x1E40152,0x1E40152,0xFFDC00CA,0xFFDC00CA,0xFFDC00CA,0xFFCC0041,0xFFCC0041,0xFFC80001,0xD9FC0152,0xD9FC0152,0xD9FC0152,0xFFB8007D,0xFFB8007D, -0xFFB00000,0xEDF80152,0xEDF80152,0xFF600000,0xF0000154,0xD9FC0152,0xD9FC0152,0xD9FC0152,0xFFB8007D,0xFFB8007D,0xFFB00000,0xEDF80152,0xEDF80152,0xFF600000,0xF0000154,0xEDF80152,0xEDF80152,0xFF600000,0xF0000154,0xF0000154,0xF7E40139,0xFBE40139,0x1E40152,0xFFDC0109,0xFFD400E1,0xFFD000CA,0xFFC8009D,0xFFB8006A,0xF5E40139,0xFFDC0109,0xE7FC0152,0xFF600000, -0xE7FC0152,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0x1A00372,0xFF8C0109,0xFF8C0109,0xFF8C0109,0xFF8C0109,0xFF8C0109, -0xFF8C0109,0xF9780001,0xF9780001,0xF9780001,0xE5740001,0x71FC0372,0x71FC0372,0x71FC0372,0x71FC0372,0x71FC0372,0x71FC0372,0xFF380025,0xFF380025,0xFF380025,0xE54C0000,0xB9FC0372,0xB9FC0372,0xB9FC0372,0xE4C80000,0xCE000374,0xF79C02F9,0x1A00372,0x1A00372,0xFF900262,0xFF9401E1,0xFF90016D,0xFF90016D,0xFF800092,0xFF9002B9,0xFD900244,0xFF74002D,0xFF380025, -0xA7FC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table159[] = { -0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0x49FC0000,0xA7F80000, -0xA7F80000,0xA7F80000,0xA7F80000,0xC2000000,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1840001,0x1A00000,0x1A00000,0x1A00000,0x49FC0000,0x8FFC0000,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0x1D80001,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000, -0xC9FC0000,0xE5F80000,0xE5F80000,0xE5F80000,0xEC000000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xC9FC0000,0xE5F80000,0xE5F80000,0xE5F80000,0xEC000000,0xE5F80000,0xE5F80000,0xE5F80000,0xEC000000,0xEC000000,0x5F80000,0x1D80001,0x1D80001,0x83FC0000,0xA9FC0000,0xBDFC0000,0xBDFC0000,0xD3FC0000,0x83FC0000,0xA9FC0000,0xDFF80000,0xE5F80000, -0xDFF80000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1D80964,0xFFD00864,0xFFCC07AB,0xFFC8076B,0xFFC4070C,0xFFC40607,0xFFBC059E,0xFFB80484,0xFFB4040A,0xFFB00372,0xFFC40694,0xFFB40543,0xFFB0049D,0xFFAC031F,0xFFA0024E,0xFF980151,0xFF940274,0xFF940153,0xFF880002,0xF98400D9,0xC3FC0964,0xFFB807ED,0xFFB00768,0xFFA00596,0xFF9404A5,0xFF880374,0xFF880463,0xFF7002BA,0xFF580061,0xF95000D8,0xE1FC0964, -0xFF600768,0xFF100372,0xF8A800D8,0xEA000964,0xFFCC082B,0xF3D4091A,0xF5D8093F,0xFFC406CC,0xFFB4054F,0xFFAC03A2,0xFFA40306,0xFF9C019B,0xFFD0080D,0xFFC406A4,0xFF9C02F7,0xFF580061,0xDBFC0964,0x1F000DB,0xFDF000C3,0xFFEC00AB,0xFFEC00A2,0xFFE80093,0xFFE80072,0xFFE80062,0xFFE40039,0xFFE00022,0xFFD80001,0xEFFC00D8,0xFFE400B2,0xFFE400A2,0xFFD80061,0xFFD8003D, 
-0xFFC80000,0xF7F800D8,0xFFC800A2,0xFF900000,0xF80000D8,0xEFFC00D8,0xFFE400B2,0xFFE400A2,0xFFD80061,0xFFD8003D,0xFFC80000,0xF7F800D8,0xFFC800A2,0xFF900000,0xF80000D8,0xF7F800D8,0xFFC800A2,0xFF900000,0xF80000D8,0xF80000D8,0xFDF000C1,0xFFEC00D1,0xF1F000DB,0xFFEC00AE,0xFFE80092,0xFFE0006B,0xFFD80050,0xFFD00034,0xFBF000C1,0xFFE800A6,0xFFD800A3,0xFF900000, -0xF5FC00D8,0x1C8076B,0x1C8076B,0x1C8076B,0x1C8076B,0xFFBC059E,0xFFBC059E,0xFFBC059E,0xFFB4040A,0xFFB4040A,0xFFB00372,0xFFB0049D,0xFFB0049D,0xFFB0049D,0xFFA0024E,0xFFA0024E,0xFF980151,0xFF940153,0xFF940153,0xFF880002,0xF58400A5,0xB1FC0768,0xB1FC0768,0xB1FC0768,0xFF9404A5,0xFF9404A5,0xFF880374,0xFF7002BA,0xFF7002BA,0xFF580061,0xF55400A4,0xD9FC0768, -0xD9FC0768,0xFF100372,0xF4B800A4,0xE4000768,0xFBC40683,0xFDC80703,0x1C8076B,0xFFBC057D,0xFFB4046E,0xFFAC0362,0xFFA40306,0xFF9C019B,0xFFBC0651,0xFFB4055B,0xFF9C02EE,0xFF580061,0xCFFC0768,0x1EC00A2,0x1EC00A2,0x1EC00A2,0x1EC00A2,0xFFE80062,0xFFE80062,0xFFE80062,0xFFE00022,0xFFE00022,0xFFD80001,0xE5FC00A2,0xE5FC00A2,0xE5FC00A2,0xFFD8003D,0xFFD8003D, -0xFFC80000,0xF3F800A2,0xF3F800A2,0xFF900000,0xF40000A4,0xE5FC00A2,0xE5FC00A2,0xE5FC00A2,0xFFD8003D,0xFFD8003D,0xFFC80000,0xF3F800A2,0xF3F800A2,0xFF900000,0xF40000A4,0xF3F800A2,0xF3F800A2,0xFF900000,0xF40000A4,0xF40000A4,0xFBEC0091,0xFFEC0091,0x1EC00A2,0xF9EC0091,0xFFDC0074,0xFFD80061,0xFFD80050,0xFFD00034,0xF9EC0091,0xFDE40082,0xEFFC00A2,0xFF900000, -0xEFFC00A2,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0x1B00372,0xFF980151,0xFF980151,0xFF980151,0xFF980151,0xFF980151, -0xFF980151,0xFF880002,0xFF880002,0xFF880002,0xED840001,0x89FC0372,0x89FC0372,0x89FC0372,0x89FC0372,0x89FC0372,0x89FC0372,0xFF580061,0xFF580061,0xFF580061,0xED5C0000,0xC5FC0372,0xC5FC0372,0xC5FC0372,0xECD80000,0xD6000374,0xFFAC02F9,0x1B00372,0x1B00372,0xFDA8028A,0xFFA00212,0xFF9801BA,0xFF9801BA,0xFF9400DA,0xFFA002E4,0xFFA00269,0xFF88007D,0xFF580061, -0xB7FC0372,}; -static const uint32_t g_etc1_to_bc7_m6_table160[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x300000,0x300000,0x300000,0x300000,0x2440000,0x2440000,0x2440000,0x8C0000,0x8C0000,0x16000001,0x2440000,0x2440000,0x2440000,0x8C0000,0x8C0000,0x16000001,0x8C0000,0x8C0000,0x16000001,0x16000001,0x2440000,0x2440000,0x2440000,0x8C0000,0x8C0000,0x16000001,0x8C0000,0x8C0000,0x16000001,0x16000001,0x8C0000, -0x8C0000,0x16000001,0x16000001,0x16000001,0x380000,0x340000,0x300000,0x400000,0x500000,0x640000,0x740000,0xAC0000,0x3C0000,0x2440000,0x640000,0x16000001,0x640000,0xA00000,0xEC0000,0x1E40000,0x4E000001,0xEC0000,0x1E40000,0x4E000001,0x1E40000,0x4E000001,0x4E000001,0xEC0000,0x1E40000,0x4E000001,0x1E40000,0x4E000001, 
-0x4E000001,0x1E40000,0x4E000001,0x4E000001,0x4E000001,0xEC0000,0x1E40000,0x4E000001,0x1E40000,0x4E000001,0x4E000001,0x1E40000,0x4E000001,0x4E000001,0x4E000001,0x1E40000,0x4E000001,0x4E000001,0x4E000001,0x4E000001,0xC80000,0xCA80000,0xCA80000,0x10C0000,0x1880000,0x25F00000,0x4E000001,0x4E000001,0xD80000,0x12C0000,0x45DC0000,0x4E000001, -0x1540000,0x3410B0,0xE2100180,0x72100180,0x4E100181,0x9C000620,0x6E0000A9,0x4E000005,0x4C000620,0x4400025D,0x32000622,0x6A000D2B,0x620004E9,0x46000263,0x46000851,0x3C00041A,0x30000732,0x32000D2C,0x32000831,0x2C000A06,0x22000D2B,0x4C10B0,0x4C0007F3,0x4000045C,0x3A000A12,0x3A00058B,0x2E000814,0x2E000E21,0x2C000952,0x26000ABF,0x22000DA4,0x9810B0, -0x26000BDD,0x20000CBA,0x1C000EF4,0x180010B4,0xFE0004A0,0xFA240B7A,0xFE2C0B05,0x9E000409,0x62000449,0x4C00041A,0x440002DE,0x36000555,0xD0000776,0x7C00058B,0x3E00065B,0x26000ABF,0x6C10B0,0x440D2C,0xDA180120,0x70180120,0x4E180121,0x9C000620,0x6E0000A9,0x4E000005,0x4C000620,0x4400025D,0x32000622,0x680D2B,0x620004E9,0x46000263,0x46000851,0x3C00041A, -0x30000732,0xD00D2B,0x32000831,0x2C000A06,0x22000D2B,0x680D2B,0x620004E9,0x46000263,0x46000851,0x3C00041A,0x30000732,0xD00D2B,0x32000831,0x2C000A06,0x22000D2B,0xD00D2B,0x32000831,0x2C000A06,0x22000D2B,0x22000D2B,0xFE0004A0,0xFE2C09C6,0xF63C08B8,0x9E000409,0x62000449,0x4C00041A,0x440002DE,0x36000555,0xD00006CD,0x7C00054B,0x3E00064B,0x2C000A06, -0x940D2B,0x100180,0x100180,0x100180,0x100180,0x48000000,0x48000000,0x48000000,0x22000000,0x22000000,0x16000001,0x24000120,0x24000120,0x24000120,0x1E00006D,0x1E00006D,0x1400003A,0x10000122,0x10000122,0x100000AA,0xA000122,0x180180,0x180180,0x180180,0x180000B1,0x180000B1,0x12000061,0xE000141,0xE000141,0xE0000CE,0xA000132,0x2C0180, -0x2C0180,0xA00010B,0xA000153,0x6000183,0x84000059,0xFA04002C,0x100180,0x42000075,0x3200006D,0x2200006A,0x2200005A,0x16000082,0x420000C5,0x3200009E,0x16000121,0xE0000CE,0x200180,0x180120,0x180120,0x180120,0x180120,0x48000000,0x48000000,0x48000000,0x22000000,0x22000000,0x16000001,0x240120,0x240120,0x240120,0x1E00006D,0x1E00006D, -0x1400003A,0x440120,0x440120,0x100000AA,0xA000122,0x240120,0x240120,0x240120,0x1E00006D,0x1E00006D,0x1400003A,0x440120,0x440120,0x100000AA,0xA000122,0x440120,0x440120,0x100000AA,0xA000122,0xA000122,0x84000059,0xFC080020,0x180120,0x42000075,0x3200006D,0x2200006A,0x2200005A,0x16000082,0x560000B4,0x32000095,0x300120,0x100000AA, -0x300120,0x680620,0xC2300000,0x6A300000,0x4E300001,0x2980620,0x6E0000A9,0x4E000005,0x1380620,0x4400025D,0x32000622,0x2980620,0x6E0000A9,0x4E000005,0x1380620,0x4400025D,0x32000622,0x1380620,0x4400025D,0x32000622,0x32000622,0x2980620,0x6E0000A9,0x4E000005,0x1380620,0x4400025D,0x32000622,0x1380620,0x4400025D,0x32000622,0x32000622,0x1380620, -0x4400025D,0x32000622,0x32000622,0x32000622,0xFE140320,0xE6C0620,0xF65C039D,0xA6000220,0x6A000269,0x54000249,0x46000195,0x3E000305,0xF2000352,0x8C0002A8,0x4C000161,0x32000622,0xDC0620,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table161[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, 
-0x1,0x400000,0x400000,0x400000,0x400000,0x25C0000,0x25C0000,0x25C0000,0xBC0000,0xBC0000,0x1E000001,0x25C0000,0x25C0000,0x25C0000,0xBC0000,0xBC0000,0x1E000001,0xBC0000,0xBC0000,0x1E000001,0x1E000001,0x25C0000,0x25C0000,0x25C0000,0xBC0000,0xBC0000,0x1E000001,0xBC0000,0xBC0000,0x1E000001,0x1E000001,0xBC0000, -0xBC0000,0x1E000001,0x1E000001,0x1E000001,0x4480000,0x440000,0x400000,0x2540000,0x6C0000,0x880000,0x9C0000,0xE80000,0x500000,0x25C0000,0x880000,0x1E000001,0x880000,0xB00000,0x1040000,0x7FC0000,0x56000001,0x1040000,0x7FC0000,0x56000001,0x7FC0000,0x56000001,0x56000001,0x1040000,0x7FC0000,0x56000001,0x7FC0000,0x56000001, -0x56000001,0x7FC0000,0x56000001,0x56000001,0x56000001,0x1040000,0x7FC0000,0x56000001,0x7FC0000,0x56000001,0x56000001,0x7FC0000,0x56000001,0x56000001,0x56000001,0x7FC0000,0x56000001,0x56000001,0x56000001,0x56000001,0xDC0000,0xBC0000,0xBC0000,0x1280000,0x1AC0000,0x2FF00000,0x56000001,0x56000001,0xF00000,0x1480000,0x4DEC0000,0x56000001, -0x1740000,0x3C1430,0xF61402AC,0x7C1402AD,0x561402AD,0xB6000620,0x7A000059,0x5600000A,0x58000620,0x4A0001ED,0x3A000622,0x78000F80,0x680005A9,0x5200030B,0x4C000911,0x46000432,0x3A00078B,0x3A000F80,0x38000989,0x32000B46,0x26000F83,0x581430,0x580009AB,0x4C00058C,0x46000B62,0x40000613,0x340008C4,0x340010D1,0x38000AF2,0x2C000C47,0x2400102B,0xB01430, -0x2C000E45,0x26000EDA,0x200011F7,0x1C001434,0xFC080612,0xFE2C0E0A,0xFE2C0E55,0xA6000431,0x76000461,0x5A000421,0x480002C3,0x420005B3,0xD0000856,0x8600061A,0x46000791,0x2C000C47,0x7C1430,0x500F80,0xEA200200,0x7A200200,0x56200201,0xB6000620,0x7A000059,0x58040006,0x58000620,0x4A0001ED,0x3A000622,0x2740F80,0x680005A9,0x5200030B,0x4C000911,0x46000432, -0x3A00078B,0xF00F80,0x38000989,0x32000B46,0x26000F83,0x2740F80,0x680005A9,0x5200030B,0x4C000911,0x46000432,0x3A00078B,0xF00F80,0x38000989,0x32000B46,0x26000F83,0xF00F80,0x38000989,0x32000B46,0x26000F83,0x26000F83,0xFE0C05F6,0xF63C0BA4,0xFC480AC4,0xA6000431,0x76000461,0x5A000421,0x480002C3,0x420005B3,0xD00007AD,0x900005D9,0x46000781,0x32000B46, -0xA80F80,0x1402AC,0x1402AC,0x1402AC,0x1402AC,0x60000000,0x60000000,0x60000000,0x2E000000,0x2E000000,0x1E000001,0x30000200,0x30000200,0x30000200,0x220000B9,0x220000B9,0x1E000065,0x16000202,0x16000202,0x16000132,0xE000202,0x2002AB,0x2002AB,0x2002AB,0x22000132,0x22000132,0x180000B1,0x12000236,0x12000236,0x1400016E,0xE00021B,0x3C02AB, -0x3C02AB,0x100001D3,0xA000263,0xA0002AB,0xB600009D,0xFE0C00AC,0x1402AC,0x5C0000DA,0x3C0000C1,0x2E0000C1,0x2A00009D,0x220000E8,0x64000161,0x44000125,0x1E000204,0x1400016E,0x2C02AB,0x200200,0x200200,0x200200,0x200200,0x60000000,0x60000000,0x60000000,0x2E000000,0x2E000000,0x1E000001,0x300200,0x300200,0x300200,0x220000B9,0x220000B9, -0x1E000065,0x5C0200,0x5C0200,0x16000132,0xE000202,0x300200,0x300200,0x300200,0x220000B9,0x220000B9,0x1E000065,0x5C0200,0x5C0200,0x16000132,0xE000202,0x5C0200,0x5C0200,0x16000132,0xE000202,0xE000202,0xB600009D,0xFE0C0088,0x200200,0x5C0000DA,0x3C0000C1,0x2E0000C1,0x2A00009D,0x220000E8,0x6400013D,0x4A000112,0x400200,0x16000132, -0x400200,0x780620,0xCA400000,0x72400000,0x56400001,0x2B00620,0x7A000059,0x58080001,0x1680620,0x4A0001ED,0x3A000622,0x2B00620,0x7A000059,0x58080001,0x1680620,0x4A0001ED,0x3A000622,0x1680620,0x4A0001ED,0x3A000622,0x3A000622,0x2B00620,0x7A000059,0x58080001,0x1680620,0x4A0001ED,0x3A000622,0x1680620,0x4A0001ED,0x3A000622,0x3A000622,0x1680620, 
-0x4A0001ED,0x3A000622,0x3A000622,0x3A000622,0xF8280349,0x800620,0xFE6C039D,0xBC0001BD,0x80000209,0x5E0001D4,0x50000131,0x460002B1,0xF8040320,0x9E000239,0x560000E8,0x3A000622,0xFC0620,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table162[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x500000,0x500000,0x500000,0x500000,0x2740000,0x2740000,0x2740000,0xF00000,0xF00000,0x26000001,0x2740000,0x2740000,0x2740000,0xF00000,0xF00000,0x26000001,0xF00000,0xF00000,0x26000001,0x26000001,0x2740000,0x2740000,0x2740000,0xF00000,0xF00000,0x26000001,0xF00000,0xF00000,0x26000001,0x26000001,0xF00000, -0xF00000,0x26000001,0x26000001,0x26000001,0x5C0000,0x2540000,0x500000,0x6C0000,0x840000,0xA80000,0xC00000,0x1240000,0x640000,0x2740000,0xA80000,0x26000001,0xA80000,0xC00000,0x11C0000,0x13FC0000,0x5E000001,0x11C0000,0x13FC0000,0x5E000001,0x13FC0000,0x5E000001,0x5E000001,0x11C0000,0x13FC0000,0x5E000001,0x13FC0000,0x5E000001, -0x5E000001,0x13FC0000,0x5E000001,0x5E000001,0x5E000001,0x11C0000,0x13FC0000,0x5E000001,0x13FC0000,0x5E000001,0x5E000001,0x13FC0000,0x5E000001,0x5E000001,0x5E000001,0x13FC0000,0x5E000001,0x5E000001,0x5E000001,0x5E000001,0xF00000,0xCC0000,0xCC0000,0x3400000,0x1D40000,0x39F00000,0x5E000001,0x5E000001,0x1040000,0x1680000,0x55FC0000,0x5E000001, -0x1980000,0x441830,0xFE1C0435,0x861C042D,0x5E1C042D,0xCE000620,0x8C000025,0x62040035,0x64000620,0x56000195,0x42000622,0x8800122B,0x740006C9,0x580003F3,0x580009E9,0x4C00046A,0x400007EB,0x4200122B,0x3E000B29,0x38000CB6,0x2C00122B,0x641830,0x62000B89,0x52000704,0x52000CF2,0x460006D3,0x3A000994,0x3A0013E9,0x3E000CE2,0x32000E07,0x2A0012FF,0xCC1830, -0x32001115,0x2C00114A,0x26001547,0x20001834,0xFE1407F5,0xFE2C116A,0xF63C11FC,0xC800047D,0x8000049D,0x5E000465,0x520002DE,0x4A00061E,0xF2000977,0x9E0006AD,0x4E0008E5,0x32000E07,0x901830,0x5C122C,0xFA280320,0x84280320,0x5E280321,0xCE000620,0x8C000025,0x6008002A,0x64000620,0x56000195,0x42000622,0x84122B,0x740006C9,0x580003F3,0x580009E9,0x4C00046A, -0x400007EB,0x10C122B,0x3E000B29,0x38000CB6,0x2C00122B,0x84122B,0x740006C9,0x580003F3,0x580009E9,0x4C00046A,0x400007EB,0x10C122B,0x3E000B29,0x38000CB6,0x2C00122B,0x10C122B,0x3E000B29,0x38000CB6,0x2C00122B,0x2C00122B,0xFE140791,0xFC480DB8,0xFE4C0D38,0xC800047D,0x8000049D,0x5E000465,0x520002DE,0x4A00061E,0xFA00088B,0x9E000649,0x4E0008CC,0x38000CB6, -0xBC122B,0x1C042C,0x1C042C,0x1C042C,0x1C042C,0x78000000,0x78000000,0x78000000,0x3A000000,0x3A000000,0x26000001,0x3C000320,0x3C000320,0x3C000320,0x2E000121,0x2E000121,0x22000092,0x1C000322,0x1C000322,0x1C0001E2,0x12000322,0x224042B,0x224042B,0x224042B,0x280001E2,0x280001E2,0x2200010B,0x18000372,0x18000372,0x1A00023E,0x12000346,0x4C042B, -0x4C042B,0x160002E3,0x100003AB,0xC00042B,0xF60000FA,0xFE0C018C,0x1C042C,0x72000151,0x50000131,0x4000013A,0x38000105,0x2A000161,0x8200022D,0x500001C2,0x24000324,0x1A00023E,0x34042B,0x280320,0x280320,0x280320,0x280320,0x78000000,0x78000000,0x78000000,0x3A000000,0x3A000000,0x26000001,0x3C0320,0x3C0320,0x3C0320,0x2E000121,0x2E000121, 
[diff hunks condensed: this span removes only machine-generated lookup-table data — the tail of one table followed by the deleted `static const uint32_t g_etc1_to_bc7_m6_table163[]` through `g_etc1_to_bc7_m6_table177[]` arrays of packed ETC1-to-BC7 (mode 6) translation entries, with the last array continuing past the end of this excerpt; the raw hex values are omitted here as they carry no hand-edited changes]
-0xBFF80000,0x1183680,0xFEF42170,0xFEE415A5,0xDAE4152D,0xFEDC1910,0xFEC008C1,0xE4C00931,0xF2B00C40,0xD4AC068A,0xBEB00C42,0xFEC41E10,0xFE9407E6,0xE49C0735,0xFE700651,0xDA7C0012,0xC0840686,0xDA78152B,0xCA680733,0xB8680933,0xA678152B,0x1A03680,0xFE641953,0xDC78152C,0xFE200E4A,0xDE30064F,0xBE4C0C42,0xF40016C3,0xD4000527,0xBC0006AF,0xA610152B,0x55F83680, -0xBE001AF9,0xAC001532,0x9C001E2C,0x8A003684,0xFEF8257D,0xFF0C3050,0xFF0C31F0,0xFED414FA,0xFEA4073E,0xFE7C001E,0xDE840086,0xD4700082,0xFEDC247F,0xFEB012E5,0xE45C049F,0xBC0006AF,0x27FC3680,0x14C152C,0xFF300A24,0xFF180489,0xDB180481,0xFF1809E8,0xFEF00061,0xDEF0007D,0xE4E80620,0xD0E00132,0xBEE80622,0x1F0152B,0xFEB805F6,0xDACC0481,0xFE640641,0xDA7C000E, -0xBEA00622,0x7DF8152B,0xD60004B9,0xBC0006AB,0xA600152B,0x1F0152B,0xFEB805F6,0xDACC0481,0xFE640641,0xDA7C000E,0xBEA00622,0x7DF8152B,0xD60004B9,0xBC0006AB,0xA600152B,0x7DF8152B,0xD60004B9,0xBC0006AB,0xA600152B,0xA600152B,0xFD300F89,0xFB4412E4,0xFD4812C4,0xFF0009C8,0xFEC803F2,0xFE84000D,0xDC9C0002,0xD84C003B,0xFF180FA2,0xFEE80905,0xE6480481,0xBC0006AB, -0x5BFC152B,0xE4152C,0xE4152C,0xE4152C,0xE4152C,0xFEBC0789,0xFEBC0789,0xFEBC0789,0xC8B00620,0xC8B00620,0xA2B00621,0xFE8C0586,0xFE8C0586,0xFE8C0586,0xD8780011,0xD8780011,0xA68C0131,0xB2780480,0xB2780480,0x9E70007A,0x8A780482,0x150152B,0x150152B,0x150152B,0xEA200621,0xEA200621,0xA2680621,0xCC0004B9,0xCC0004B9,0xA4140006,0x8A2C0482,0x2DF8152B, -0x2DF8152B,0x9600081B,0x8400082B,0x7000152B,0xFECC0D34,0xF6DC11D1,0xE4152C,0xFEAC0709,0xFE9401DD,0xFC7C000E,0xF07C0038,0xC8780001,0xFEB40C89,0xFE980632,0xDA640481,0xA4140006,0x1E0152B,0x1180480,0x1180480,0x1180480,0x1180480,0xFEE80004,0xFEE80004,0xFEE80004,0xBAE80000,0xBAE80000,0xA2E80001,0x1A00480,0x1A00480,0x1A00480,0xCE900001,0xCE900001, -0xA4B80001,0x55F80480,0x55F80480,0xA2280001,0x8A000482,0x1A00480,0x1A00480,0x1A00480,0xCE900001,0xCE900001,0xA4B80001,0x55F80480,0x55F80480,0xA2280001,0x8A000482,0x55F80480,0x55F80480,0xA2280001,0x8A000482,0x8A000482,0xF70002D2,0xFF0C0328,0x1180480,0xFEDC01A5,0xFEAC0061,0xF8880000,0xE0980000,0xCA700000,0xF8EC02F9,0xFEC40188,0x27FC0480,0xA2280001, -0x27FC0480,0x1800620,0xFF5C01CD,0xF7480000,0xDB480001,0x41FC0620,0xFEFC0041,0xDD100001,0xA1FC0620,0xDA640001,0xBE000622,0x41FC0620,0xFEFC0041,0xDD100001,0xA1FC0620,0xDA640001,0xBE000622,0xA1FC0620,0xDA640001,0xBE000622,0xBE000622,0x41FC0620,0xFEFC0041,0xDD100001,0xA1FC0620,0xDA640001,0xBE000622,0xA1FC0620,0xDA640001,0xBE000622,0xBE000622,0xA1FC0620, -0xDA640001,0xBE000622,0xBE000622,0xBE000622,0xF7680515,0x5980620,0xF77C0548,0xFF400372,0xFEFC01E5,0xFE84000D,0xDAA80001,0xDA180001,0xFD5804E4,0xFF200340,0xE4B40000,0xBE000622,0x89FC0620,0xB00620,0xB00620,0xB00620,0xB00620,0xB00620,0xB00620,0xB00620,0xB00620,0xB00620,0xB00620,0xE6780000,0xE6780000,0xE6780000,0xE6780000,0xE6780000, -0xE6780000,0x8E780000,0x8E780000,0x8E780000,0x72780001,0x1040620,0x1040620,0x1040620,0x1040620,0x1040620,0x1040620,0xA6100001,0xA6100001,0xA6100001,0x74400001,0x7FC0620,0x7FC0620,0x7FC0620,0x6C0000DA,0x56000622,0xFAA403F5,0xB00620,0xB00620,0xFE9001D4,0xFC8800A4,0xFC7C000D,0xFC7C000D,0xC4780000,0xFC8C0349,0xFE780190,0xAC680000,0xA6100001, -0x1740620,}; -static const uint32_t g_etc1_to_bc7_m6_table178[] = { -0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0x1980000, 
-0x1980000,0x1980000,0x1980000,0x42000001,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x900000,0x900000,0x900000,0xC80000,0x1200000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0xF80000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000, -0x1700000,0x3DF80000,0x3DF80000,0x3DF80000,0x7A000001,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x3DF80000,0x3DF80000,0x3DF80000,0x7A000001,0x3DF80000,0x3DF80000,0x3DF80000,0x7A000001,0x7A000001,0x1080000,0xF80000,0xF80000,0x1200000,0x5340000,0x1500000,0x1500000,0x1A00000,0x1200000,0x5340000,0x9FC0000,0x3DF80000, -0x9FC0000,0x1580000,0x1580000,0x1580000,0x1580000,0x3FC0000,0x3FC0000,0x3FC0000,0x85F80000,0x85F80000,0xAA000001,0x3FC0000,0x3FC0000,0x3FC0000,0x85F80000,0x85F80000,0xAA000001,0x85F80000,0x85F80000,0xAA000001,0xAA000001,0x3FC0000,0x3FC0000,0x3FC0000,0x85F80000,0x85F80000,0xAA000001,0x85F80000,0x85F80000,0xAA000001,0xAA000001,0x85F80000, -0x85F80000,0xAA000001,0xAA000001,0xAA000001,0x1900000,0xF6C0000,0x1580000,0x1D40000,0x33FC0000,0x63FC0000,0x75F80000,0x8FF80000,0x1B00000,0x3FC0000,0x63FC0000,0xAA000001,0x63FC0000,0x1C80000,0xAFFC0000,0xD7FC0000,0xE2000001,0xAFFC0000,0xD7FC0000,0xE2000001,0xD7FC0000,0xE2000001,0xE2000001,0xAFFC0000,0xD7FC0000,0xE2000001,0xD7FC0000,0xE2000001, -0xE2000001,0xD7FC0000,0xE2000001,0xE2000001,0xE2000001,0xAFFC0000,0xD7FC0000,0xE2000001,0xD7FC0000,0xE2000001,0xE2000001,0xD7FC0000,0xE2000001,0xE2000001,0xE2000001,0xD7FC0000,0xE2000001,0xE2000001,0xE2000001,0xE2000001,0x7DFC0000,0xBE40000,0xBE40000,0xBDFC0000,0xD3F80000,0xDDFC0000,0xE2000001,0xE2000001,0x9BFC0000,0xC7FC0000,0xE3D80000,0xE2000001, -0xCDFC0000,0x1283680,0xFF0C2270,0xFEF81640,0xE2F4152D,0xFEF41A90,0xFED409ED,0xECD00931,0xFAC00C40,0xDCBC068A,0xC6C00C42,0xFED01F3C,0xFEAC0936,0xECAC0735,0xFE880681,0xE28C0012,0xC8940686,0xE288152B,0xD2780733,0xC0780933,0xAE88152B,0x1B83680,0xFE881A5F,0xE488152C,0xFE380F5A,0xE640064F,0xC65C0C42,0xFC00164D,0xDE0004BA,0xC400067B,0xAE20152B,0x61F83680, -0xCA001A21,0xB8001422,0xA6001D6B,0x92003684,0xFF0C269A,0xF9203102,0xFB243245,0xFEDC16B2,0xFEB808F6,0xFE900086,0xE6940086,0xDC800082,0xFEF82596,0xFEC41488,0xEC6C049F,0xC400067B,0x37FC3680,0x15C152C,0xFF440AD1,0xFF2C04B9,0xE3280481,0xFF240A8C,0xFF0400E9,0xE700007D,0xECF80620,0xD8F00132,0xC6F80622,0xDFC152B,0xFECC06B1,0xE2DC0481,0xFE880681,0xE28C000E, -0xC6B00622,0x89F8152B,0xE000049B,0xC4000672,0xAE00152B,0xDFC152B,0xFECC06B1,0xE2DC0481,0xFE880681,0xE28C000E,0xC6B00622,0x89F8152B,0xE000049B,0xC4000672,0xAE00152B,0x89F8152B,0xE000049B,0xC4000672,0xAE00152B,0xAE00152B,0xFF3C1004,0xFF4C1324,0xF5581329,0xFF140A71,0xFEE804D9,0xFE980052,0xE4AC0002,0xE05C003B,0xFF34103A,0xFF0409B3,0xEE580481,0xC4000672, -0x69FC152B,0xF4152C,0xF4152C,0xF4152C,0xF4152C,0xFED007E4,0xFED007E4,0xFED007E4,0xD0C00620,0xD0C00620,0xAAC00621,0xFEA405E6,0xFEA405E6,0xFEA405E6,0xE0880011,0xE0880011,0xAE9C0131,0xBA880480,0xBA880480,0xA680007A,0x92880482,0x168152B,0x168152B,0x168152B,0xF2300621,0xF2300621,0xAA780621,0xD8000491,0xD8000491,0xAC240006,0x923C0482,0x39F8152B, -0x39F8152B,0x9C0007A3,0x8A000793,0x7800152B,0xFED80DAA,0xFEEC11D1,0xF4152C,0xFEC40794,0xFEAC0275,0xFE900022,0xF88C0038,0xD0880001,0xFEBC0D2A,0xFEB006D1,0xE2740481,0xAC240006,0x3FC152B,0x1280480,0x1280480,0x1280480,0x1280480,0xFCFC0014,0xFCFC0014,0xFCFC0014,0xC2F80000,0xC2F80000,0xAAF80001,0x1B80480,0x1B80480,0x1B80480,0xD6A00001,0xD6A00001, 
-0xACC80001,0x61F80480,0x61F80480,0xAA380001,0x92000482,0x1B80480,0x1B80480,0x1B80480,0xD6A00001,0xD6A00001,0xACC80001,0x61F80480,0x61F80480,0xAA380001,0x92000482,0x61F80480,0x61F80480,0xAA380001,0x92000482,0x92000482,0xFF1002D2,0xF9200349,0x1280480,0xFCF401E1,0xFEC80082,0xFC9C0001,0xE8A80000,0xD2800000,0xFEF802FD,0xFEDC01A5,0x37FC0480,0xAA380001, -0x37FC0480,0x1900620,0xFF74022D,0xFF580000,0xE3580001,0x59FC0620,0xFF140089,0xE5200001,0xADFC0620,0xE2740001,0xC6000622,0x59FC0620,0xFF140089,0xE5200001,0xADFC0620,0xE2740001,0xC6000622,0xADFC0620,0xE2740001,0xC6000622,0xC6000622,0x59FC0620,0xFF140089,0xE5200001,0xADFC0620,0xE2740001,0xC6000622,0xADFC0620,0xE2740001,0xC6000622,0xC6000622,0xADFC0620, -0xE2740001,0xC6000622,0xC6000622,0xC6000622,0xFF780515,0xDA80620,0xFF8C0548,0xFF5803C8,0xFF100249,0xFEAC0041,0xE2B80001,0xE2280001,0xFF700515,0xFF40037A,0xECC40000,0xC6000622,0x99FC0620,0xC00620,0xC00620,0xC00620,0xC00620,0xC00620,0xC00620,0xC00620,0xC00620,0xC00620,0xC00620,0xEE880000,0xEE880000,0xEE880000,0xEE880000,0xEE880000, -0xEE880000,0x96880000,0x96880000,0x96880000,0x7A880001,0x11C0620,0x11C0620,0x11C0620,0x11C0620,0x11C0620,0x11C0620,0xAE200001,0xAE200001,0xAE200001,0x7C500001,0x13FC0620,0x13FC0620,0x13FC0620,0x720000A2,0x5E000622,0xFEAC041D,0xC00620,0xC00620,0xFEA001F9,0xFC9C00C8,0xFE900019,0xFE900019,0xCC880000,0xFC9C0372,0xFC9001C2,0xB4780000,0xAE200001, -0x1980620,}; -static const uint32_t g_etc1_to_bc7_m6_table179[] = { -0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0x1CC0000, -0x1CC0000,0x1CC0000,0x1CC0000,0x4A000001,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x8A00000,0x8A00000,0x8A00000,0xE00000,0x1400000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000, -0x1880000,0x49F80000,0x49F80000,0x49F80000,0x82000001,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x49F80000,0x82000001,0x49F80000,0x49F80000,0x49F80000,0x82000001,0x82000001,0x5180000,0x1080000,0x1080000,0x1340000,0x14C0000,0x1680000,0x1680000,0x1BC0000,0x1340000,0x14C0000,0x19FC0000,0x49F80000, -0x19FC0000,0x1680000,0x1680000,0x1680000,0x1680000,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x91F80000,0x91F80000,0xB2000001,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x91F80000,0x91F80000,0xB2000001,0x91F80000,0x91F80000,0xB2000001,0xB2000001,0x1BFC0000,0x1BFC0000,0x1BFC0000,0x91F80000,0x91F80000,0xB2000001,0x91F80000,0x91F80000,0xB2000001,0xB2000001,0x91F80000, -0x91F80000,0xB2000001,0xB2000001,0xB2000001,0x7A00000,0x1800000,0x1680000,0x3E80000,0x47FC0000,0x73FC0000,0x81FC0000,0x99FC0000,0x1C40000,0x1BFC0000,0x73FC0000,0xB2000001,0x73FC0000,0x1D80000,0xC7FC0000,0xE3FC0000,0xEA000001,0xC7FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xEA000001,0xEA000001,0xC7FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xEA000001, -0xEA000001,0xE3FC0000,0xEA000001,0xEA000001,0xEA000001,0xC7FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xEA000001,0xEA000001,0xE3FC0000,0xEA000001,0xEA000001,0xEA000001,0xE3FC0000,0xEA000001,0xEA000001,0xEA000001,0xEA000001,0xA3FC0000,0x1F80000,0x1F80000,0xD1FC0000,0xDFFC0000,0xE7FC0000,0xEA000001,0xEA000001,0xB9FC0000,0xD7FC0000,0xEBE80000,0xEA000001, 
-0xDDF80000,0x1383680,0xFF1823A4,0xFF0816F9,0xEB04152D,0xFF001C08,0xFEE40B51,0xF4E00931,0xFED40C50,0xE4CC068A,0xCED00C42,0xFEE8206C,0xFEC00AD1,0xF4BC0735,0xFEA00711,0xEA9C0012,0xD0A40686,0xEA98152B,0xDA880733,0xC8880933,0xB698152B,0x1D03680,0xFEA01B7F,0xEC98152C,0xFE5810AA,0xEE50064F,0xCE6C0C42,0xFE14169F,0xEA00048A,0xCC10067B,0xB630152B,0x6DF83680, -0xD0001959,0xBE001302,0xAC001C83,0x9A003684,0xFF2027A2,0xFF2C311A,0xFF2C328D,0xFEF0188A,0xFECC0ADE,0xFEAC0156,0xEEA40086,0xE4900082,0xFF0026DC,0xFEDC1672,0xF47C049F,0xCC10067B,0x45FC3680,0x16C152C,0xFF500B91,0xFF380529,0xEB380481,0xFF3C0B4C,0xFF14019D,0xEF10007D,0xF5080620,0xE1000132,0xCF080622,0x25FC152B,0xFEF00771,0xEAEC0481,0xFEAC06F1,0xEA9C000E, -0xCEC00622,0x95F8152B,0xEA000489,0xCE000653,0xB600152B,0x25FC152B,0xFEF00771,0xEAEC0481,0xFEAC06F1,0xEA9C000E,0xCEC00622,0x95F8152B,0xEA000489,0xCE000653,0xB600152B,0x95F8152B,0xEA000489,0xCE000653,0xB600152B,0xB600152B,0xFF501055,0xFD681342,0xFD681329,0xFF300B69,0xFEFC05D5,0xFEC000CD,0xECBC0002,0xE86C003B,0xFF44109B,0xFF180AA9,0xF6680481,0xCE000653, -0x79FC152B,0x104152C,0x104152C,0x104152C,0x104152C,0xFEDC084C,0xFEDC084C,0xFEDC084C,0xD8D00620,0xD8D00620,0xB2D00621,0xFEB00662,0xFEB00662,0xFEB00662,0xE8980011,0xE8980011,0xB6AC0131,0xC2980480,0xC2980480,0xAE90007A,0x9A980482,0x180152B,0x180152B,0x180152B,0xFA400621,0xFA400621,0xB2880621,0xE6040481,0xE6040481,0xB4340006,0x9A4C0482,0x45F8152B, -0x45F8152B,0xA6000749,0x94000722,0x8000152B,0xFEE80E01,0xF6FC122C,0x104152C,0xFED40821,0xFEC00309,0xFEA40058,0xFCA0003D,0xD8980001,0xFED00D73,0xFEBC076A,0xEA840481,0xB4340006,0x13FC152B,0x1380480,0x1380480,0x1380480,0x1380480,0xFF0C0028,0xFF0C0028,0xFF0C0028,0xCB080000,0xCB080000,0xB3080001,0x1D00480,0x1D00480,0x1D00480,0xDEB00001,0xDEB00001, -0xB4D80001,0x6DF80480,0x6DF80480,0xB2480001,0x9A000482,0x1D00480,0x1D00480,0x1D00480,0xDEB00001,0xDEB00001,0xB4D80001,0x6DF80480,0x6DF80480,0xB2480001,0x9A000482,0x6DF80480,0x6DF80480,0xB2480001,0x9A000482,0x9A000482,0xFF2002F9,0xFF2C0355,0x1380480,0xFD040202,0xFEDC00AA,0xFEB40008,0xF0B80000,0xDA900000,0xFD100320,0xFEE801D4,0x45FC0480,0xB2480001, -0x45FC0480,0x1A00620,0xFF8002A1,0xFF6C001D,0xEB680001,0x71FC0620,0xFF3800E9,0xED300001,0xB9FC0620,0xEA840001,0xCE000622,0x71FC0620,0xFF3800E9,0xED300001,0xB9FC0620,0xEA840001,0xCE000622,0xB9FC0620,0xEA840001,0xCE000622,0xCE000622,0x71FC0620,0xFF3800E9,0xED300001,0xB9FC0620,0xEA840001,0xCE000622,0xB9FC0620,0xEA840001,0xCE000622,0xCE000622,0xB9FC0620, -0xEA840001,0xCE000622,0xCE000622,0xCE000622,0xFF8C0548,0x1BC0620,0xF79C057D,0xFF6C03F9,0xFF3C02B1,0xFED40092,0xEAC80001,0xEA380001,0xF7880548,0xFF5803DA,0xF4D40000,0xCE000622,0xA7FC0620,0xD00620,0xD00620,0xD00620,0xD00620,0xD00620,0xD00620,0xD00620,0xD00620,0xD00620,0xD00620,0xF6980000,0xF6980000,0xF6980000,0xF6980000,0xF6980000, -0xF6980000,0x9E980000,0x9E980000,0x9E980000,0x82980001,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0xB6300001,0xB6300001,0xB6300001,0x84600001,0x1FF80620,0x1FF80620,0x1FF80620,0x7E00006A,0x66000622,0xFAC40424,0xD00620,0xD00620,0xFCB80221,0xFEA800E9,0xFCA00034,0xFCA00034,0xD4980000,0xF8B0039D,0xFEA001E1,0xBC880000,0xB6300001, -0x1B80620,}; -static const uint32_t g_etc1_to_bc7_m6_table180[] = { -0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0x3F80000, 
-0x3F80000,0x3F80000,0x3F80000,0x54000000,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0x2B40000,0x2B40000,0x2B40000,0xFC0000,0x1680000,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000, -0x1A40000,0x57F80000,0x57F80000,0x57F80000,0x8C000000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x57F80000,0x57F80000,0x57F80000,0x8C000000,0x57F80000,0x57F80000,0x57F80000,0x8C000000,0x8C000000,0x12C0000,0x1180001,0x1180001,0x1480000,0x3600000,0x1800000,0x1800000,0x1D80000,0x1480000,0x3600000,0x29FC0000,0x57F80000, -0x29FC0000,0x1780001,0x1780001,0x1780001,0x1780001,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0x9DFC0000,0xBC000000,0xBC000000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0x9DFC0000,0xBC000000,0xBC000000,0x9DFC0000, -0x9DFC0000,0xBC000000,0xBC000000,0xBC000000,0x1B80000,0x1940000,0x1780001,0x9FC0000,0x5DFC0000,0x83FC0000,0x91FC0000,0xA5FC0000,0x3D80000,0x37FC0000,0x83FC0000,0xBC000000,0x83FC0000,0x1E80001,0xE3FC0000,0xF1F80000,0xF4000000,0xE3FC0000,0xF1F80000,0xF4000000,0xF1F80000,0xF4000000,0xF4000000,0xE3FC0000,0xF1F80000,0xF4000000,0xF1F80000,0xF4000000, -0xF4000000,0xF1F80000,0xF4000000,0xF4000000,0xF4000000,0xE3FC0000,0xF1F80000,0xF4000000,0xF1F80000,0xF4000000,0xF4000000,0xF1F80000,0xF4000000,0xF4000000,0xF4000000,0xF1F80000,0xF4000000,0xF4000000,0xF4000000,0xF4000000,0xD1FC0000,0x67FC0000,0x67FC0000,0xE7FC0000,0xEFFC0000,0xF3F80000,0xF4000000,0xF4000000,0xDBFC0000,0xEBFC0000,0xF5DC0000,0xF4000000, -0xEDFC0000,0x1483684,0xFF3024F4,0xFF1C182C,0xF514152B,0xFF181DD4,0xFEFC0D2B,0xFCF00933,0xFEE40CC2,0xEEE00686,0xD8E00C42,0xFF0021E4,0xFED80CD7,0xFCCC0733,0xFEB80823,0xF2AC0012,0xD8B4068A,0xF4A8152C,0xE2980735,0xD0980931,0xBEA8152D,0x3E83680,0xFEB81CE9,0xF4AC152B,0xFE7C1242,0xF8640651,0xD87C0C40,0xFE3817A1,0xF20C0485,0xD618067D,0xBE40152D,0x79FC3680, -0xDC001883,0xCA0011EA,0xB8001BA1,0xA4003680,0xFF3428DF,0xFB4431C4,0xFD4832DC,0xFF081AA2,0xFEE80D2F,0xFEC002B7,0xF8B80082,0xECA00086,0xFF1827ED,0xFEF418C3,0xFC90049F,0xD618067D,0x57FC3680,0x180152B,0xFF680C82,0xFF4C05EB,0xF5480482,0xFF500C19,0xFF2C02AA,0xF924007A,0xFD1C0621,0xEB140131,0xD9180621,0x41FC152B,0xFF080866,0xF4FC0480,0xFECC07B9,0xF2AC0011, -0xD8D00620,0xA1FC152B,0xF4000480,0xD6000631,0xBE00152C,0x41FC152B,0xFF080866,0xF4FC0480,0xFECC07B9,0xF2AC0011,0xD8D00620,0xA1FC152B,0xF4000480,0xD6000631,0xBE00152C,0xA1FC152B,0xF4000480,0xD6000631,0xBE00152C,0xBE00152C,0xFF64110D,0xF57813A3,0xF77C138B,0xFF400C54,0xFF100726,0xFEDC01AD,0xF4D00001,0xF2800038,0xFF5C10F2,0xFF2C0BE1,0xFE7C0484,0xD6000631, -0x89FC152B,0x114152B,0x114152B,0x114152B,0x114152B,0xFEF408D3,0xFEF408D3,0xFEF408D3,0xE0E40622,0xE0E40622,0xBCE00622,0xFED006DB,0xFED006DB,0xFED006DB,0xF2AC000E,0xF2AC000E,0xC0C00132,0xCAAC0481,0xCAAC0481,0xB6A0007D,0xA4A80481,0x398152B,0x398152B,0x398152B,0xFE580629,0xFE580629,0xBC980620,0xEE180481,0xEE180481,0xBC480005,0xA45C0480,0x51FC152B, -0x51FC152B,0xB20006E6,0x9C000690,0x8A00152C,0xFF040E83,0xFF0C1233,0x114152B,0xFEE808EA,0xFED403B6,0xFEBC00B2,0xFEB4004B,0xE0A80002,0xFEE40E19,0xFED007FD,0xF4940480,0xBC480005,0x23FC152B,0x1480482,0x1480482,0x1480482,0x1480482,0xFF24004A,0xFF24004A,0xFF24004A,0xD5180001,0xD5180001,0xBD180001,0x3E80480,0x3E80480,0x3E80480,0xE8C00001,0xE8C00001, 
-0xBCEC0000,0x79FC0480,0x79FC0480,0xBC540000,0xA4000480,0x3E80480,0x3E80480,0x3E80480,0xE8C00001,0xE8C00001,0xBCEC0000,0x79FC0480,0x79FC0480,0xBC540000,0xA4000480,0x79FC0480,0x79FC0480,0xBC540000,0xA4000480,0xA4000480,0xFB340322,0xFB440372,0x1480482,0xFF180225,0xFEF000E8,0xFED00022,0xF8CC0000,0xE4A00000,0xF9280349,0xFF040208,0x57FC0480,0xBC540000, -0x57FC0480,0x1B00622,0xFF980321,0xFF7C007A,0xF5780001,0x8DFC0620,0xFF580185,0xF5440000,0xC7FC0620,0xF4940000,0xD8000620,0x8DFC0620,0xFF580185,0xF5440000,0xC7FC0620,0xF4940000,0xD8000620,0xC7FC0620,0xF4940000,0xD8000620,0xD8000620,0x8DFC0620,0xFF580185,0xF5440000,0xC7FC0620,0xF4940000,0xD8000620,0xC7FC0620,0xF4940000,0xD8000620,0xD8000620,0xC7FC0620, -0xF4940000,0xD8000620,0xD8000620,0xD8000620,0xF7A4057D,0xFCC0620,0xFFAC0581,0xFF84045D,0xFF500335,0xFF040112,0xF4D40000,0xF4440000,0xFF98054A,0xFD800451,0xFCE80001,0xD8000620,0xB9FC0620,0xE00622,0xE00622,0xE00622,0xE00622,0xE00622,0xE00622,0xE00622,0xE00622,0xE00622,0xE00622,0xFCAC0001,0xFCAC0001,0xFCAC0001,0xFCAC0001,0xFCAC0001, -0xFCAC0001,0xA8A80001,0xA8A80001,0xA8A80001,0x8CA80001,0x1500620,0x1500620,0x1500620,0x1500620,0x1500620,0x1500620,0xC0400001,0xC0400001,0xC0400001,0x8C740000,0x2DF80620,0x2DF80620,0x2DF80620,0x88000041,0x70000620,0xF4D80451,0xE00622,0xE00622,0xFECC0242,0xFEBC0115,0xFEB4004A,0xFEB4004A,0xDCAC0001,0xFEBC03A9,0xFEB40202,0xC49C0000,0xC0400001, -0x1E00620,}; -static const uint32_t g_etc1_to_bc7_m6_table181[] = { -0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0xFF80000, -0xFF80000,0xFF80000,0xFF80000,0x5C000000,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xAC40000,0xAC40000,0xAC40000,0x1140000,0x18C0000,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000, -0x1BC0000,0x61FC0000,0x61FC0000,0x61FC0000,0x94000000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x61FC0000,0x61FC0000,0x61FC0000,0x94000000,0x61FC0000,0x61FC0000,0x61FC0000,0x94000000,0x94000000,0x73C0000,0x1280001,0x1280001,0x5580000,0x3740000,0x1940000,0x1940000,0x1F40000,0x5580000,0x3740000,0x39FC0000,0x61FC0000, -0x39FC0000,0x1880001,0x1880001,0x1880001,0x1880001,0x4FFC0000,0x4FFC0000,0x4FFC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0xA9FC0000,0xA9FC0000,0xC4000000,0xC4000000,0x4FFC0000,0x4FFC0000,0x4FFC0000,0xA9FC0000,0xA9FC0000,0xC4000000,0xA9FC0000,0xA9FC0000,0xC4000000,0xC4000000,0xA9FC0000, -0xA9FC0000,0xC4000000,0xC4000000,0xC4000000,0x5C80000,0x1A40000,0x1880001,0x27FC0000,0x71FC0000,0x93FC0000,0x9FF80000,0xB1F80000,0x3EC0000,0x4FFC0000,0x93FC0000,0xC4000000,0x93FC0000,0x1F80001,0xFBFC0000,0xFDF80000,0xFC000000,0xFBFC0000,0xFDF80000,0xFC000000,0xFDF80000,0xFC000000,0xFC000000,0xFBFC0000,0xFDF80000,0xFC000000,0xFDF80000,0xFC000000, -0xFC000000,0xFDF80000,0xFC000000,0xFC000000,0xFC000000,0xFBFC0000,0xFDF80000,0xFC000000,0xFDF80000,0xFC000000,0xFC000000,0xFDF80000,0xFC000000,0xFC000000,0xFC000000,0xFDF80000,0xFC000000,0xFC000000,0xFC000000,0xFC000000,0xF7FC0000,0xE7FC0000,0xE7FC0000,0xFBFC0000,0xFDF80000,0xFDF80000,0xFC000000,0xFC000000,0xF9FC0000,0xFBFC0000,0xFDEC0000,0xFC000000, 
-0xFDF80000,0x1583684,0xFF44261B,0xFF2C1953,0xFD24152B,0xFF241F94,0xFF100F1F,0xFF00095F,0xFEFC0D92,0xF6F00686,0xE0F00C42,0xFF18234C,0xFEF00EDF,0xFEDC0767,0xFECC099A,0xFABC0012,0xE0C4068A,0xFCB8152C,0xEAA80735,0xD8A80931,0xC6B8152D,0x7FC3680,0xFED81E87,0xFCBC152B,0xFE9413F2,0xFE700655,0xE08C0C40,0xFE5818EB,0xFA1C0485,0xDE28067D,0xC650152D,0x85FC3680, -0xE60017F0,0xD0001112,0xBE001AD5,0xAC003680,0xFF3C29CC,0xFF4C3204,0xFF4C3354,0xFF181C7A,0xFEFC0F77,0xFED40452,0xFECC0089,0xF4B00086,0xFF342916,0xFF101AAA,0xFEA404D3,0xDE28067D,0x65FC3680,0x190152B,0xFF740D62,0xFF6406C3,0xFD580482,0xFF680D01,0xFF4003E6,0xFF340081,0xFF2C0641,0xF3240131,0xE1280621,0x59FC152B,0xFF200966,0xFD0C0480,0xFEF00889,0xFABC0011, -0xE0E00620,0xADFC152B,0xFC100480,0xE0000624,0xC600152C,0x59FC152B,0xFF200966,0xFD0C0480,0xFEF00889,0xFABC0011,0xE0E00620,0xADFC152B,0xFC100480,0xE0000624,0xC600152C,0xADFC152B,0xFC100480,0xE0000624,0xC600152C,0xC600152C,0xFF781162,0xFD8813A3,0xFF8C138B,0xFF540D51,0xFF28086E,0xFEF002BB,0xFCE00001,0xFA900038,0xFD7411AE,0xFF480C8E,0xFEA004D1,0xE0000624, -0x99FC152B,0x124152B,0x124152B,0x124152B,0x124152B,0xFF00095B,0xFF00095B,0xFF00095B,0xE8F40622,0xE8F40622,0xC4F00622,0xFEDC0763,0xFEDC0763,0xFEDC0763,0xFABC000E,0xFABC000E,0xC8D00132,0xD2BC0481,0xD2BC0481,0xBEB0007D,0xACB80481,0x3B0152B,0x3B0152B,0x3B0152B,0xFE700651,0xFE700651,0xC4A80620,0xF6280481,0xF6280481,0xC4580005,0xAC6C0480,0x5DFC152B, -0x5DFC152B,0xBE0006A6,0xA6000631,0x9200152C,0xFF100EED,0xF9201286,0x124152B,0xFEF80996,0xFEE40481,0xFECC0139,0xFECC0085,0xE8B80002,0xFEF80E66,0xFEE808E2,0xFCA40480,0xC4580005,0x33FC152B,0x1580482,0x1580482,0x1580482,0x1580482,0xFF34007D,0xFF34007D,0xFF34007D,0xDD280001,0xDD280001,0xC5280001,0x7FC0480,0x7FC0480,0x7FC0480,0xF0D00001,0xF0D00001, -0xC4FC0000,0x85FC0480,0x85FC0480,0xC4640000,0xAC000480,0x7FC0480,0x7FC0480,0x7FC0480,0xF0D00001,0xF0D00001,0xC4FC0000,0x85FC0480,0x85FC0480,0xC4640000,0xAC000480,0x85FC0480,0x85FC0480,0xC4640000,0xAC000480,0xAC000480,0xF7480349,0xFF4C0392,0x1580482,0xFD300265,0xFD140120,0xFEEC003D,0xFCE00001,0xECB00000,0xFF34034D,0xFF18022D,0x65FC0480,0xC4640000, -0x65FC0480,0x1C00622,0xFFB003A9,0xFF9400FA,0xFD880001,0xA5FC0620,0xFF70021D,0xFD540000,0xD3FC0620,0xFCA40000,0xE0000620,0xA5FC0620,0xFF70021D,0xFD540000,0xD3FC0620,0xFCA40000,0xE0000620,0xD3FC0620,0xFCA40000,0xE0000620,0xE0000620,0xA5FC0620,0xFF70021D,0xFD540000,0xD3FC0620,0xFCA40000,0xE0000620,0xD3FC0620,0xFCA40000,0xE0000620,0xE0000620,0xD3FC0620, -0xFCA40000,0xE0000620,0xE0000620,0xE0000620,0xFFB4057D,0x1E00620,0xF9C005B2,0xFF9404B2,0xFF7C03A9,0xFF2C01BA,0xFCE40000,0xFC540000,0xFFAC0581,0xFF940488,0xFF10001D,0xE0000620,0xC7FC0620,0xF00622,0xF00622,0xF00622,0xF00622,0xF00622,0xF00622,0xF00622,0xF00622,0xF00622,0xF00622,0xFEBC0005,0xFEBC0005,0xFEBC0005,0xFEBC0005,0xFEBC0005, -0xFEBC0005,0xB0B80001,0xB0B80001,0xB0B80001,0x94B80001,0x1680620,0x1680620,0x1680620,0x1680620,0x1680620,0x1680620,0xC8500001,0xC8500001,0xC8500001,0x94840000,0x39F80620,0x39F80620,0x39F80620,0x90000020,0x78000620,0xFCE80451,0xF00622,0xF00622,0xFED80271,0xFED0013D,0xFEC4006A,0xFEC4006A,0xE4BC0001,0xFCD403C8,0xFAC80244,0xCCAC0000,0xC8500001, -0x3FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table182[] = { -0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x1BF80000, 
-0x1BF80000,0x1BF80000,0x1BF80000,0x64000000,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xD80000,0xD80000,0xD80000,0x12C0000,0x1AC0000,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x1380001,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000, -0x3D00000,0x6DFC0000,0x6DFC0000,0x6DFC0000,0x9C000000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x3D00000,0x6DFC0000,0x6DFC0000,0x6DFC0000,0x9C000000,0x6DFC0000,0x6DFC0000,0x6DFC0000,0x9C000000,0x9C000000,0xF4C0000,0x1380001,0x1380001,0x16C0000,0x3880000,0x1AC0000,0x1AC0000,0xFFC0000,0x16C0000,0x3880000,0x47FC0000,0x6DFC0000, -0x47FC0000,0x1980001,0x1980001,0x1980001,0x1980001,0x69FC0000,0x69FC0000,0x69FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0x69FC0000,0x69FC0000,0x69FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0xB5FC0000,0xB5FC0000,0xCC000000,0xCC000000,0x69FC0000,0x69FC0000,0x69FC0000,0xB5FC0000,0xB5FC0000,0xCC000000,0xB5FC0000,0xB5FC0000,0xCC000000,0xCC000000,0xB5FC0000, -0xB5FC0000,0xCC000000,0xCC000000,0xCC000000,0x1DC0000,0x3B40000,0x1980001,0x45FC0000,0x85FC0000,0xA1FC0000,0xABFC0000,0xBBFC0000,0xBFC0000,0x69FC0000,0xA1FC0000,0xCC000000,0xA1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x16832DC,0xFF50247F,0xFF381933,0xFF34152B,0xFF3C1E14,0xFF200F63,0xFF1809EB,0xFF080CD6,0xF9000656,0xE5000B06,0xFF24213C,0xFF040F22,0xFEF407FB,0xFEE40906,0xFED00012,0xE8D4055E,0xFECC12C5,0xEEBC0679,0xDCBC0799,0xCCC812C5,0x19FC32DC,0xFEE41DCF,0xFED0152B,0xFEB81322,0xFE940691,0xE4A00B04,0xFE701733,0xFE380481,0xE4400525,0xCC6412C4,0x8FF832DC, -0xF200169C,0xDC000E76,0xC4001765,0xB20032DC,0xFF502793,0xF9602F06,0xFB642FFB,0xFF2C1B7E,0xFF0C0F8A,0xFEE804E2,0xFEDC00E2,0xFAC4003B,0xFF3C26EC,0xFF1819CA,0xFEB804ED,0xE4400525,0x71FC32DC,0x19C12C3,0xFF800C56,0xFF7006A3,0xFF680482,0xFF740B9D,0xFF5803DE,0xFF4400A9,0xFF400533,0xF73400C2,0xE53804E5,0x6BFC12C3,0xFF3808E2,0xFF200482,0xFF080785,0xFCD80005, -0xE4F404E4,0xB7F812C3,0xFE380480,0xE41404E4,0xCC0012C4,0x6BFC12C3,0xFF3808E2,0xFF200482,0xFF080785,0xFCD80005,0xE4F404E4,0xB7F812C3,0xFE380480,0xE41404E4,0xCC0012C4,0xB7F812C3,0xFE380480,0xE41404E4,0xCC0012C4,0xCC0012C4,0xFF800FC5,0xFF8C1193,0xF59811A2,0xFF6C0C12,0xFF4807DA,0xFF1002DD,0xFEF8000D,0xFCB00018,0xFF780FAA,0xFF5C0B8A,0xFED004E3,0xE41404E4, -0xA3FC12C3,0x134152B,0x134152B,0x134152B,0x134152B,0xFF1809EB,0xFF1809EB,0xFF1809EB,0xF1040622,0xF1040622,0xCD000622,0xFEF407FB,0xFEF407FB,0xFEF407FB,0xFED00012,0xFED00012,0xD0E00132,0xDACC0481,0xDACC0481,0xC6C0007D,0xB4C80481,0x1C8152B,0x1C8152B,0x1C8152B,0xFE940691,0xFE940691,0xCCB80620,0xFE380481,0xFE380481,0xCC680005,0xB47C0480,0x69FC152B, -0x69FC152B,0xC4000672,0xAC0005E1,0x9A00152C,0xFF200F46,0xFF2C1292,0x134152B,0xFF100A32,0xFEF40542,0xFEE001CB,0xFEDC00E2,0xF0C80002,0xFF140ED2,0xFEF4097A,0xFEB80489,0xCC680005,0x41FC152B,0x1680482,0x1680482,0x1680482,0x1680482,0xFF4400A9,0xFF4400A9,0xFF4400A9,0xE5380001,0xE5380001,0xCD380001,0x1FFC0480,0x1FFC0480,0x1FFC0480,0xF8E00001,0xF8E00001, 
-0xCD0C0000,0x91FC0480,0x91FC0480,0xCC740000,0xB4000480,0x1FFC0480,0x1FFC0480,0x1FFC0480,0xF8E00001,0xF8E00001,0xCD0C0000,0x91FC0480,0x91FC0480,0xCC740000,0xB4000480,0x91FC0480,0x91FC0480,0xCC740000,0xB4000480,0xB4000480,0xFF580349,0xFB64039D,0x1680482,0xFF440288,0xFD280154,0xFEFC006D,0xFEF8000D,0xF4C00000,0xFD4C0372,0xFF300269,0x75FC0480,0xCC740000, -0x75FC0480,0x1CC04E2,0xFFBC0305,0xFFA000EA,0xFF980001,0xB5FC04E2,0xFF8801CD,0xFF680000,0xDBF804E2,0xFECC0000,0xE40004E4,0xB5FC04E2,0xFF8801CD,0xFF680000,0xDBF804E2,0xFECC0000,0xE40004E4,0xDBF804E2,0xFECC0000,0xE40004E4,0xE40004E4,0xB5FC04E2,0xFF8801CD,0xFF680000,0xDBF804E2,0xFECC0000,0xE40004E4,0xDBF804E2,0xFECC0000,0xE40004E4,0xE40004E4,0xDBF804E2, -0xFECC0000,0xE40004E4,0xE40004E4,0xE40004E4,0xF9C40480,0xDE804E2,0xFDC80482,0xFFB003CA,0xFF900305,0xFF480175,0xFF080000,0xFE840000,0xF3C40480,0xFFA003B5,0xFF300028,0xE40004E4,0xD1FC04E2,0x1000622,0x1000622,0x1000622,0x1000622,0x1000622,0x1000622,0x1000622,0x1000622,0x1000622,0x1000622,0xFED00012,0xFED00012,0xFED00012,0xFED00012,0xFED00012, -0xFED00012,0xB8C80001,0xB8C80001,0xB8C80001,0x9CC80001,0x1800620,0x1800620,0x1800620,0x1800620,0x1800620,0x1800620,0xD0600001,0xD0600001,0xD0600001,0x9C940000,0x45F80620,0x45F80620,0x45F80620,0x9A00000D,0x80000620,0xF4F80482,0x1000622,0x1000622,0xFEE8029A,0xFEE4016D,0xFEDC0091,0xFEDC0091,0xECCC0001,0xF8E803F5,0xFED80269,0xD4BC0000,0xD0600001, -0x13FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table183[] = { -0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x27F80000, -0x27F80000,0x27F80000,0x27F80000,0x6C000000,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xE80000,0xE80000,0xE80000,0x1440000,0x1D00000,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000, -0x3E80000,0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0xA4000000,0x1600000,0x1480001,0x1480001,0x1800000,0x39C0000,0x1C00000,0x1C00000,0x23FC0000,0x1800000,0x39C0000,0x57FC0000,0x79FC0000, -0x57FC0000,0x1A80001,0x1A80001,0x1A80001,0x1A80001,0x81FC0000,0x81FC0000,0x81FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0x81FC0000,0x81FC0000,0x81FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xC1FC0000,0xD4000000,0xD4000000,0x81FC0000,0x81FC0000,0x81FC0000,0xC1FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xC1FC0000,0xD4000000,0xD4000000,0xC1FC0000, -0xC1FC0000,0xD4000000,0xD4000000,0xD4000000,0x1F00000,0xBC40000,0x1A80001,0x63FC0000,0x99FC0000,0xB1FC0000,0xB9FC0000,0xC7F40000,0x33FC0000,0x81FC0000,0xB1FC0000,0xD4000000,0xB1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x1742E54,0xFF5C2227,0xFF4C189C,0xFF44152B,0xFF441B96,0xFF2C0F3F,0xFF240A83,0xFF200BB6,0xFD10062A,0xE9100996,0xFF301E64,0xFF140EBF,0xFF0C08B3,0xFEF0080E,0xFEE40042,0xEAE40406,0xFEE00FFC,0xF2D005A9,0xE2C805B5,0xD2D80FD9,0x2BFC2E54,0xFEFC1C6F,0xFEE8152B,0xFECC11E6,0xFEAC06E9,0xE8B40994,0xFE9414BB,0xFE5804A2,0xE854039D,0xD2740FD9,0x97FC2E54, -0xF8001590,0xE0000BBC,0xCA001371,0xB8002E54,0xFF58245E,0xFF6C2AA6,0xFF6C2BC3,0xFF4019B9,0xFF1C0EF6,0xFEFC0522,0xFEF00173,0xFCD8000A,0xFF5023BA,0xFF30184B,0xFED00513,0xE854039D,0x7DF82E54,0x1A40FDB,0xFF8C0AC6,0xFF7C063B,0xFF780482,0xFF8009BD,0xFF64037E,0xFF5C00E1,0xFF5803EB,0xF944005E,0xE9480375,0x7BFC0FD8,0xFF4C0811,0xFF380482,0xFF200615,0xFEF00005, -0xE9080374,0xBFF80FD8,0xFE6C0480,0xE8340374,0xD2000FD8,0x7BFC0FD8,0xFF4C0811,0xFF380482,0xFF200615,0xFEF00005,0xE9080374,0xBFF80FD8,0xFE6C0480,0xE8340374,0xD2000FD8,0xBFF80FD8,0xFE6C0480,0xE8340374,0xD2000FD8,0xD2000FD8,0xFF940D34,0xF9A00ECD,0xFBA40ED6,0xFF780A71,0xFF5C06D6,0xFF2C0293,0xFF10002D,0xFECC0002,0xFF880D43,0xFF680A04,0xFEE404D1,0xE8340374, -0xADFC0FD8,0x144152B,0x144152B,0x144152B,0x144152B,0xFF240A83,0xFF240A83,0xFF240A83,0xF9140622,0xF9140622,0xD5100622,0xFF0C08B3,0xFF0C08B3,0xFF0C08B3,0xFEE40042,0xFEE40042,0xD8F00132,0xE2DC0481,0xE2DC0481,0xCED0007D,0xBCD80481,0x1E0152B,0x1E0152B,0x1E0152B,0xFEAC06E9,0xFEAC06E9,0xD4C80620,0xFE5804A2,0xFE5804A2,0xD4780005,0xBC8C0480,0x75FC152B, -0x75FC152B,0xD0000642,0xB8000581,0xA200152C,0xFF2C0FD1,0xF94012E3,0x144152B,0xFF180AF6,0xFF080612,0xFEF8026E,0xFEF00173,0xF8D80002,0xFF200F41,0xFF100A41,0xFED004C2,0xD4780005,0x51FC152B,0x1780482,0x1780482,0x1780482,0x1780482,0xFF5C00E1,0xFF5C00E1,0xFF5C00E1,0xED480001,0xED480001,0xD5480001,0x37FC0480,0x37FC0480,0x37FC0480,0xFCF80005,0xFCF80005, -0xD51C0000,0x9DFC0480,0x9DFC0480,0xD4840000,0xBC000480,0x37FC0480,0x37FC0480,0x37FC0480,0xFCF80005,0xFCF80005,0xD51C0000,0x9DFC0480,0x9DFC0480,0xD4840000,0xBC000480,0x9DFC0480,0x9DFC0480,0xD4840000,0xBC000480,0xBC000480,0xFB6C0372,0xFF6C03C5,0x1780482,0xFD5802AD,0xFF400188,0xFF2000A4,0xFF10002D,0xFCD00000,0xF564039D,0xFF48028A,0x83FC0480,0xD4840000, -0x83FC0480,0x1D40372,0xFFC40212,0xFFB400A9,0xFFA80001,0xC1FC0372,0xFF940145,0xFF800000,0xE1F80372,0xFEFC0000,0xE8000374,0xC1FC0372,0xFF940145,0xFF800000,0xE1F80372,0xFEFC0000,0xE8000374,0xE1F80372,0xFEFC0000,0xE8000374,0xE8000374,0xC1FC0372,0xFF940145,0xFF800000,0xE1F80372,0xFEFC0000,0xE8000374,0xE1F80372,0xFEFC0000,0xE8000374,0xE8000374,0xE1F80372, -0xFEFC0000,0xE8000374,0xE8000374,0xE8000374,0xFDCC0320,0x1F40372,0xFFCC0332,0xFFC002AD,0xFFA80221,0xFF5C0110,0xFF300000,0xFEC00000,0xF7CC0320,0xFFAC02A8,0xFF50001A,0xE8000374,0xD9FC0372,0x1100622,0x1100622,0x1100622,0x1100622,0x1100622,0x1100622,0x1100622,0x1100622,0x1100622,0x1100622,0xFEE0002D,0xFEE0002D,0xFEE0002D,0xFEE0002D,0xFEE0002D, -0xFEE0002D,0xC0D80001,0xC0D80001,0xC0D80001,0xA4D80001,0x1980620,0x1980620,0x1980620,0x1980620,0x1980620,0x1980620,0xD8700001,0xD8700001,0xD8700001,0xA4A40000,0x51F80620,0x51F80620,0x51F80620,0xA4000004,0x88000620,0xFD080482,0x1100622,0x1100622,0xFD0002D2,0xFEF801A5,0xFEE800C1,0xFEE800C1,0xF4DC0001,0xFEF403F9,0xFEE402A8,0xDCCC0000,0xD8700001, -0x21FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table184[] = { -0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x33FC0000, 
-0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xFC0000,0xFC0000,0xFC0000,0x35C0000,0x1F40000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x15C0000,0x9FC0000,0x9FC0000,0x9FC0000,0x9FC0000,0x9FC0000, -0x9FC0000,0x87FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x9FC0000,0x9FC0000,0x9FC0000,0x9FC0000,0x9FC0000,0x9FC0000,0x87FC0000,0x87FC0000,0x87FC0000,0xAC000001,0x87FC0000,0x87FC0000,0x87FC0000,0xAC000001,0xAC000001,0x1740000,0x15C0000,0x15C0000,0x1940000,0x1B40000,0x1D80000,0x1D80000,0x39FC0000,0x1940000,0x1B40000,0x67FC0000,0x87FC0000, -0x67FC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xDC000001,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xDC000001,0xCFF80000,0xCFF80000,0xDC000001,0xDC000001,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xDC000001,0xCFF80000,0xCFF80000,0xDC000001,0xDC000001,0xCFF80000, -0xCFF80000,0xDC000001,0xDC000001,0xDC000001,0x17FC0000,0x5D80000,0x1BC0000,0x85FC0000,0xAFFC0000,0xC1FC0000,0xC9F80000,0xD3F80000,0x5FFC0000,0x9BFC0000,0xC1FC0000,0xDC000001,0xC1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x18029AD,0xFF681FD0,0xFF581805,0xFF58152C,0xFF5C195D,0xFF400F2A,0xFF3C0B40,0xFF2C0AC5,0xFF240629,0xEF240841,0xFF441B46,0xFF2C0E8C,0xFF180984,0xFF080739,0xFEFC00B5,0xF0FC02D1,0xFEF80D65,0xF6E40502,0xE8E003EA,0xD8EC0CF6,0x41FC29AD,0xFF141B10,0xFF04152B,0xFEE41095,0xFECC0789,0xEEC80841,0xFEAC126A,0xFE7C04F9,0xEC740236,0xD88C0CF6,0xA1FC29AD, -0xFE00152B,0xE6000953,0xD4000F97,0xC00029AF,0xFF64212B,0xF77C26D9,0xF98027C5,0xFF5417DD,0xFF300E79,0xFF1805B7,0xFF080236,0xFEEC0009,0xFF5C209B,0xFF40169E,0xFEF0056C,0xEC740236,0x89FC29AD,0x1B00CF9,0xFFA40924,0xFF9405C1,0xFF8C0480,0xFF8C07FD,0xFF7C0332,0xFF740139,0xFF6402C1,0xFD580018,0xEF5C0221,0x8DFC0CF6,0xFF640723,0xFF540480,0xFF3804C5,0xFF14001D, -0xEF1C0221,0xC7FC0CF6,0xFEA00480,0xEE500221,0xD8000CF6,0x8DFC0CF6,0xFF640723,0xFF540480,0xFF3804C5,0xFF14001D,0xEF1C0221,0xC7FC0CF6,0xFEA00480,0xEE500221,0xD8000CF6,0xC7FC0CF6,0xFEA00480,0xEE500221,0xD8000CF6,0xD8000CF6,0xFFA00AE3,0xFFAC0BFD,0xFFAC0C24,0xFF9408AE,0xFF6805CD,0xFF48026C,0xFF280068,0xFEF40008,0xFF980AF2,0xFF840840,0xFF1004B3,0xEE500221, -0xB9FC0CF6,0x158152C,0x158152C,0x158152C,0x158152C,0xFF3C0B40,0xFF3C0B40,0xFF3C0B40,0xFF240629,0xFF240629,0xDD240621,0xFF180984,0xFF180984,0xFF180984,0xFEFC00B5,0xFEFC00B5,0xE1000131,0xECEC0480,0xECEC0480,0xD8E4007A,0xC4EC0482,0x1FC152B,0x1FC152B,0x1FC152B,0xFECC0789,0xFECC0789,0xDCDC0621,0xFE7C04F9,0xFE7C04F9,0xDE880006,0xC4A00482,0x83F8152B, -0x83F8152B,0xDC000629,0xC200053B,0xAA00152B,0xFD48103D,0xFF4C1304,0x158152C,0xFF340BC5,0xFF1C0715,0xFF100379,0xFF080236,0xFEEC0009,0xFF340FA9,0xFF240B2D,0xFEF0052C,0xDE880006,0x61FC152B,0x18C0480,0x18C0480,0x18C0480,0x18C0480,0xFF740139,0xFF740139,0xFF740139,0xF55C0000,0xF55C0000,0xDD5C0001,0x53FC0480,0x53FC0480,0x53FC0480,0xFF14001D,0xFF14001D, 
-0xDF2C0001,0xABF80480,0xABF80480,0xDC9C0001,0xC4000482,0x53FC0480,0x53FC0480,0x53FC0480,0xFF14001D,0xFF14001D,0xDF2C0001,0xABF80480,0xABF80480,0xDC9C0001,0xC4000482,0xABF80480,0xABF80480,0xDC9C0001,0xC4000482,0xC4000482,0xF780039D,0xFD8803C8,0x18C0480,0xFB7002F9,0xFF5401CA,0xFF3800E9,0xFF280068,0xFEF40008,0xFD74039D,0xFF5C02B9,0x95FC0480,0xDC9C0001, -0x95FC0480,0x1DC0221,0xFFD00145,0xFFC00068,0xFFBC0000,0xCDFC0221,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xFF340000,0xEE000221,0xCDFC0221,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xFF340000,0xEE000221,0xE7F80221,0xFF340000,0xEE000221,0xEE000221,0xCDFC0221,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xFF340000,0xEE000221,0xE7F80221,0xFF340000,0xEE000221,0xEE000221,0xE7F80221, -0xFF340000,0xEE000221,0xEE000221,0xEE000221,0xFFD001ED,0x1FC0221,0xF7DC0200,0xFFC001A8,0xFFA80152,0xFF8400A0,0xFF5C0000,0xFF040000,0xFBD401E1,0xFFC00190,0xFF700010,0xEE000221,0xDFFC0221,0x1240620,0x1240620,0x1240620,0x1240620,0x1240620,0x1240620,0x1240620,0x1240620,0x1240620,0x1240620,0xFEF40050,0xFEF40050,0xFEF40050,0xFEF40050,0xFEF40050, -0xFEF40050,0xC8EC0000,0xC8EC0000,0xC8EC0000,0xACEC0001,0x3B00620,0x3B00620,0x3B00620,0x3B00620,0x3B00620,0x3B00620,0xE0840001,0xE0840001,0xE0840001,0xAEB40001,0x5DFC0620,0x5DFC0620,0x5DFC0620,0xAC080001,0x90000622,0xF71C04B1,0x1240620,0x1240620,0xFF1002FD,0xFF0C01E1,0xFF0000F4,0xFF0000F4,0xFEEC0000,0xFD0C0422,0xFF0002D4,0xE6DC0000,0xE0840001, -0x33FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table185[] = { -0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3FFC0000, -0x3FFC0000,0x3FFC0000,0x3FFC0000,0x7C000001,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0x10C0000,0x10C0000,0x10C0000,0x3740000,0xDFC0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x23FC0000,0x23FC0000,0x23FC0000,0x23FC0000,0x23FC0000, -0x23FC0000,0x93FC0000,0x93FC0000,0x93FC0000,0xB4000001,0x23FC0000,0x23FC0000,0x23FC0000,0x23FC0000,0x23FC0000,0x23FC0000,0x93FC0000,0x93FC0000,0x93FC0000,0xB4000001,0x93FC0000,0x93FC0000,0x93FC0000,0xB4000001,0xB4000001,0x1840000,0x16C0000,0x16C0000,0x1A80000,0x1C80000,0x1F00000,0x1F00000,0x4DFC0000,0x1A80000,0x1C80000,0x77FC0000,0x93FC0000, -0x77FC0000,0x1CC0000,0x1CC0000,0x1CC0000,0x1CC0000,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xDBF80000,0xDBF80000,0xE4000001,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xDBF80000,0xDBF80000,0xE4000001,0xDBF80000,0xDBF80000,0xE4000001,0xE4000001,0xB5FC0000,0xB5FC0000,0xB5FC0000,0xDBF80000,0xDBF80000,0xE4000001,0xDBF80000,0xDBF80000,0xE4000001,0xE4000001,0xDBF80000, -0xDBF80000,0xE4000001,0xE4000001,0xE4000001,0x51FC0000,0xDE80000,0x1CC0000,0xA3FC0000,0xC3FC0000,0xD1FC0000,0xD5FC0000,0xDDFC0000,0x87FC0000,0xB5FC0000,0xD1FC0000,0xE4000001,0xD1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x18C25F1,0xFF801DF0,0xFF70177D,0xFF68152C,0xFF68179D,0xFF540F30,0xFF500BE9,0xFF400A4D,0xFF380659,0xF3340759,0xFF5C1916,0xFF3C0E80,0xFF300A44,0xFF2006D1,0xFF100159,0xF30C0209,0xFF080B86,0xFAF804A6,0xECF402A6,0xDEFC0AC2,0x53FC25F1,0xFF2C19F0,0xFF1C152B,0xFEFC0FAD,0xFEE40821,0xF2DC0759,0xFECC10EA,0xFE940579,0xF0880142,0xDE9C0AC2,0xABF825F1, -0xFE2C152B,0xEC0007C7,0xDA000C9B,0xC60025F3,0xFF781EA2,0xFD88233D,0xFF8C2439,0xFF5C168D,0xFF440E39,0xFF240649,0xFF1C031D,0xFF040051,0xFF701E24,0xFF501536,0xFEFC05F2,0xF0880142,0x95FC25F1,0x1BC0AC1,0xFFB007EC,0xFFA00569,0xFF9C0480,0xFFA406AD,0xFF88030E,0xFF800185,0xFF7C0209,0xFF6C0001,0xF36C0139,0x9BFC0AC1,0xFF7C066B,0xFF6C0480,0xFF5803FE,0xFF2C0055, -0xF3300139,0xCFF80AC1,0xFED40480,0xF2700139,0xDE000AC2,0x9BFC0AC1,0xFF7C066B,0xFF6C0480,0xFF5803FE,0xFF2C0055,0xF3300139,0xCFF80AC1,0xFED40480,0xF2700139,0xDE000AC2,0xCFF80AC1,0xFED40480,0xF2700139,0xDE000AC2,0xDE000AC2,0xFFB4091E,0xF5B80A25,0xF7BC0A41,0xFF98075D,0xFF7C0521,0xFF5C026D,0xFF4800B4,0xFF0C0032,0xFFA4093E,0xFF900722,0xFF3004A6,0xF2700139, -0xC1FC0AC1,0x168152C,0x168152C,0x168152C,0x168152C,0xFF500BE9,0xFF500BE9,0xFF500BE9,0xFF380659,0xFF380659,0xE5340621,0xFF300A44,0xFF300A44,0xFF300A44,0xFF100159,0xFF100159,0xE9100131,0xF4FC0480,0xF4FC0480,0xE0F4007A,0xCCFC0482,0x19FC152B,0x19FC152B,0x19FC152B,0xFEE40821,0xFEE40821,0xE4EC0621,0xFE940579,0xFE940579,0xE6980006,0xCCB00482,0x8FF8152B, -0x8FF8152B,0xE4000621,0xCA0004F6,0xB200152B,0xFF5810A1,0xFB641341,0x168152C,0xFF440C7A,0xFF300805,0xFF240465,0xFF1C031D,0xFF040051,0xFF501035,0xFF300BE1,0xFEFC05B2,0xE6980006,0x71FC152B,0x19C0480,0x19C0480,0x19C0480,0x19C0480,0xFF800185,0xFF800185,0xFF800185,0xFD6C0000,0xFD6C0000,0xE56C0001,0x6BFC0480,0x6BFC0480,0x6BFC0480,0xFF2C0055,0xFF2C0055, -0xE73C0001,0xB7F80480,0xB7F80480,0xE4AC0001,0xCC000482,0x6BFC0480,0x6BFC0480,0x6BFC0480,0xFF2C0055,0xFF2C0055,0xE73C0001,0xB7F80480,0xB7F80480,0xE4AC0001,0xCC000482,0xB7F80480,0xB7F80480,0xE4AC0001,0xCC000482,0xCC000482,0xFF90039D,0xF59803F5,0x19C0480,0xFD840320,0xFF700221,0xFF500151,0xFF4800B4,0xFF0C0032,0xFD8803C8,0xFF7402FD,0xA3FC0480,0xE4AC0001, -0xA3FC0480,0x1E40139,0xFFDC00B9,0xFFCC0040,0xFFCC0000,0xD9FC0139,0xFFC00074,0xFFB40000,0xEDF80139,0xFF640000,0xF2000139,0xD9FC0139,0xFFC00074,0xFFB40000,0xEDF80139,0xFF640000,0xF2000139,0xEDF80139,0xFF640000,0xF2000139,0xF2000139,0xD9FC0139,0xFFC00074,0xFFB40000,0xEDF80139,0xFF640000,0xF2000139,0xEDF80139,0xFF640000,0xF2000139,0xF2000139,0xEDF80139, -0xFF640000,0xF2000139,0xF2000139,0xF2000139,0xF5E40120,0x37FC0139,0xFBE40120,0xFFD800F2,0xFFBC00C2,0xFFA40061,0xFF840000,0xFF400000,0xFFDC0109,0xFFCC00E9,0xFF940009,0xF2000139,0xE7FC0139,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0x1340620,0xFF0C0080,0xFF0C0080,0xFF0C0080,0xFF0C0080,0xFF0C0080, -0xFF0C0080,0xD0FC0000,0xD0FC0000,0xD0FC0000,0xB4FC0001,0x1C80620,0x1C80620,0x1C80620,0x1C80620,0x1C80620,0x1C80620,0xE8940001,0xE8940001,0xE8940001,0xB6C40001,0x69FC0620,0x69FC0620,0x69FC0620,0xB4180001,0x98000622,0xFF2C04B1,0x1340620,0x1340620,0xFF200328,0xFF140220,0xFF100128,0xFF100128,0xFF000008,0xFD1C0451,0xFF1402F9,0xEEEC0000,0xE8940001, -0x41FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table186[] = { -0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x4BFC0000, 
-0x4BFC0000,0x4BFC0000,0x4BFC0000,0x84000001,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x71C0000,0x71C0000,0x71C0000,0x38C0000,0x1DF80000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x17C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000, -0x3BFC0000,0x9FF80000,0x9FF80000,0x9FF80000,0xBC000001,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x9FF80000,0x9FF80000,0x9FF80000,0xBC000001,0x9FF80000,0x9FF80000,0x9FF80000,0xBC000001,0xBC000001,0x3940000,0x17C0000,0x17C0000,0x5B80000,0x1DC0000,0xDFC0000,0xDFC0000,0x61FC0000,0x5B80000,0x1DC0000,0x85FC0000,0x9FF80000, -0x85FC0000,0x1DC0000,0x1DC0000,0x1DC0000,0x1DC0000,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xE7F80000,0xE7F80000,0xEC000001,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xE7F80000,0xE7F80000,0xEC000001,0xE7F80000,0xE7F80000,0xEC000001,0xEC000001,0xCDFC0000,0xCDFC0000,0xCDFC0000,0xE7F80000,0xE7F80000,0xEC000001,0xE7F80000,0xE7F80000,0xEC000001,0xEC000001,0xE7F80000, -0xE7F80000,0xEC000001,0xEC000001,0xEC000001,0x89FC0000,0x1FC0000,0x1DC0000,0xC1FC0000,0xD7FC0000,0xDFFC0000,0xE3FC0000,0xE9F40000,0xADFC0000,0xCDFC0000,0xDFFC0000,0xEC000001,0xDFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1982295,0xFF8C1C24,0xFF7C16F9,0xFF78152C,0xFF74162D,0xFF640F5A,0xFF5C0C99,0xFF4C0A19,0xFF4C06D0,0xF74406B1,0xFF68172A,0xFF500E8F,0xFF440B02,0xFF2C06C9,0xFF200231,0xF7200185,0xFF200A26,0xFF0C0482,0xF30001AE,0xE30C08E2,0x65FC2295,0xFF401914,0xFF34152B,0xFF140F05,0xFEFC08D9,0xF6F006B1,0xFEE40F9A,0xFEB80619,0xF49C0096,0xE2B008E2,0xB3FC2295, -0xFE60152B,0xF40006C7,0xE0000A17,0xCC002297,0xFF801C65,0xFF8C2049,0xF598216C,0xFF701581,0xFF540E23,0xFF3806FD,0xFF340421,0xFF1400E4,0xFF781BC6,0xFF5C147B,0xFF1806AC,0xF49C0096,0x9FFC2295,0x1C808E1,0xFFBC06E4,0xFFAC0529,0xFFAC0480,0xFFB00599,0xFF9C0300,0xFF9801E5,0xFF880191,0xFF7C0010,0xF77C0091,0xAFFC08E1,0xFF9405D3,0xFF840480,0xFF700356,0xFF4C00A9, -0xF7440091,0xD7FC08E1,0xFF040480,0xF6900091,0xE20008E2,0xAFFC08E1,0xFF9405D3,0xFF840480,0xFF700356,0xFF4C00A9,0xF7440091,0xD7FC08E1,0xFF040480,0xF6900091,0xE20008E2,0xD7FC08E1,0xFF040480,0xF6900091,0xE20008E2,0xE20008E2,0xFFBC07A1,0xFBC4084D,0xFBC40879,0xFFB00662,0xFF90049D,0xFF740269,0xFF640104,0xFF300074,0xFFB407AA,0xFFA40632,0xFF500499,0xF6900091, -0xCDFC08E1,0x178152C,0x178152C,0x178152C,0x178152C,0xFF5C0C99,0xFF5C0C99,0xFF5C0C99,0xFF4C06D0,0xFF4C06D0,0xED440621,0xFF440B02,0xFF440B02,0xFF440B02,0xFF200231,0xFF200231,0xF1200131,0xFD0C0480,0xFD0C0480,0xE904007A,0xD50C0482,0x31FC152B,0x31FC152B,0x31FC152B,0xFEFC08D9,0xFEFC08D9,0xECFC0621,0xFEB80619,0xFEB80619,0xEEA80006,0xD4C00482,0x9BF8152B, -0x9BF8152B,0xEC100621,0xD20004D2,0xBA00152B,0xFF6810FE,0xFF6C1369,0x178152C,0xFF540D39,0xFF440905,0xFF340565,0xFF340421,0xFF1400E4,0xFF5810BE,0xFF500C92,0xFF18067B,0xEEA80006,0x7FFC152B,0x1AC0480,0x1AC0480,0x1AC0480,0x1AC0480,0xFF9801E5,0xFF9801E5,0xFF9801E5,0xFF7C0010,0xFF7C0010,0xED7C0001,0x83FC0480,0x83FC0480,0x83FC0480,0xFF4C00A9,0xFF4C00A9, 
-0xEF4C0001,0xC3F80480,0xC3F80480,0xECBC0001,0xD4000482,0x83FC0480,0x83FC0480,0x83FC0480,0xFF4C00A9,0xFF4C00A9,0xEF4C0001,0xC3F80480,0xC3F80480,0xECBC0001,0xD4000482,0xC3F80480,0xC3F80480,0xECBC0001,0xD4000482,0xD4000482,0xFFA003CA,0xFDA803F5,0x1AC0480,0xFF980349,0xFF840269,0xFF6C019A,0xFF640104,0xFF300074,0xFF9403DA,0xFF840340,0xB3FC0480,0xECBC0001, -0xB3FC0480,0x1EC0091,0xFFE80055,0xFFE0001D,0xFFDC0000,0xE5FC0091,0xFFD80034,0xFFCC0000,0xF3F80091,0xFF940000,0xF6000091,0xE5FC0091,0xFFD80034,0xFFCC0000,0xF3F80091,0xFF940000,0xF6000091,0xF3F80091,0xFF940000,0xF6000091,0xF6000091,0xE5FC0091,0xFFD80034,0xFFCC0000,0xF3F80091,0xFF940000,0xF6000091,0xF3F80091,0xFF940000,0xF6000091,0xF6000091,0xF3F80091, -0xFF940000,0xF6000091,0xF6000091,0xF6000091,0xF9EC0080,0x77FC0091,0xFFEC0080,0xFDE80071,0xFFD0005A,0xFFC00028,0xFFA80000,0xFF7C0000,0xFFE80080,0xFFE00071,0xFFB40004,0xF6000091,0xEFFC0091,0x1440620,0x1440620,0x1440620,0x1440620,0x1440620,0x1440620,0x1440620,0x1440620,0x1440620,0x1440620,0xFF1800B4,0xFF1800B4,0xFF1800B4,0xFF1800B4,0xFF1800B4, -0xFF1800B4,0xD90C0000,0xD90C0000,0xD90C0000,0xBD0C0001,0x1E00620,0x1E00620,0x1E00620,0x1E00620,0x1E00620,0x1E00620,0xF0A40001,0xF0A40001,0xF0A40001,0xBED40001,0x75FC0620,0x75FC0620,0x75FC0620,0xBC280001,0xA0000622,0xF73C04E4,0x1440620,0x1440620,0xFF2C0371,0xFF280254,0xFF240171,0xFF240171,0xFF140020,0xF9300480,0xFF200332,0xF6FC0000,0xF0A40001, -0x51FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table187[] = { -0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x57FC0000, -0x57FC0000,0x57FC0000,0x57FC0000,0x8C000001,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0xF2C0000,0xF2C0000,0xF2C0000,0x3A40000,0x2BFC0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000, -0x53FC0000,0xABF80000,0xABF80000,0xABF80000,0xC4000001,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xABF80000,0xC4000001,0xABF80000,0xABF80000,0xABF80000,0xC4000001,0xC4000001,0xBA40000,0x18C0000,0x18C0000,0x1CC0000,0x1F00000,0x2BFC0000,0x2BFC0000,0x73FC0000,0x1CC0000,0x1F00000,0x95FC0000,0xABF80000, -0x95FC0000,0x1EC0000,0x1EC0000,0x1EC0000,0x1EC0000,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xF3F80000,0xF3F80000,0xF4000001,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xF3F80000,0xF3F80000,0xF4000001,0xF3F80000,0xF3F80000,0xF4000001,0xF4000001,0xE5FC0000,0xE5FC0000,0xE5FC0000,0xF3F80000,0xF3F80000,0xF4000001,0xF3F80000,0xF3F80000,0xF4000001,0xF4000001,0xF3F80000, -0xF3F80000,0xF4000001,0xF4000001,0xF4000001,0xC3FC0000,0x77FC0000,0x1EC0000,0xDFFC0000,0xEBFC0000,0xEFFC0000,0xF1FC0000,0xF3FC0000,0xD5FC0000,0xE5FC0000,0xEFFC0000,0xF4000001,0xEFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x1A41F99,0xFF981A90,0xFF881695,0xFF88152C,0xFF80150D,0xFF740F8A,0xFF680D79,0xFF640A11,0xFF58076C,0xFB540649,0xFF7415A6,0xFF600EC8,0xFF5C0BF2,0xFF40071A,0xFF380339,0xFD2C0139,0xFF2C0946,0xFF200496,0xF71400FA,0xE91C0756,0x77FC1F99,0xFF58182C,0xFF4C152B,0xFF2C0E9D,0xFF1409B1,0xFB040649,0xFF080EA6,0xFED80706,0xFAB0002A,0xE8C40756,0xBDF81F99, -0xFE90152B,0xFA080649,0xE600080B,0xD2001F9B,0xFF901A71,0xF9A01DC1,0xFBA41E94,0xFF80147F,0xFF640E3A,0xFF4C0805,0xFF3C0548,0xFF3001B8,0xFF9019F6,0xFF7413B6,0xFF300766,0xFAB0002A,0xABFC1F99,0x1D00759,0xFFC40609,0xFFC004EC,0xFFBC0480,0xFFBC04CD,0xFFB0030E,0xFFA40249,0xFFA00169,0xFF940050,0xFB8C0029,0xBDFC0756,0xFFAC055B,0xFF9C0480,0xFF8802EE,0xFF700115, -0xFB580029,0xDFF80756,0xFF340480,0xFAB00029,0xE8000756,0xBDFC0756,0xFFAC055B,0xFF9C0480,0xFF8802EE,0xFF700115,0xFB580029,0xDFF80756,0xFF340480,0xFAB00029,0xE8000756,0xDFF80756,0xFF340480,0xFAB00029,0xE8000756,0xE8000756,0xFDCC067A,0xFFCC06D5,0xFFCC0711,0xFFC0058C,0xFFA40441,0xFF8402B0,0xFF7C0172,0xFF5000DD,0xFFC4067D,0xFFB0057E,0xFF700490,0xFAB00029, -0xD7FC0756,0x188152C,0x188152C,0x188152C,0x188152C,0xFF680D79,0xFF680D79,0xFF680D79,0xFF58076C,0xFF58076C,0xF5540621,0xFF5C0BF2,0xFF5C0BF2,0xFF5C0BF2,0xFF380339,0xFF380339,0xF9300131,0xFF200496,0xFF200496,0xF114007A,0xDD1C0482,0x49FC152B,0x49FC152B,0x49FC152B,0xFF1409B1,0xFF1409B1,0xF50C0621,0xFED80706,0xFED80706,0xF6B80006,0xDCD00482,0xA7F8152B, -0xA7F8152B,0xF4200621,0xDC0004A6,0xC200152B,0xFF741194,0xFB8413A0,0x188152C,0xFF680E19,0xFF580A15,0xFF4C06C1,0xFF3C0548,0xFF3001B8,0xFF741148,0xFF5C0D82,0xFF300742,0xF6B80006,0x8FFC152B,0x1BC0480,0x1BC0480,0x1BC0480,0x1BC0480,0xFFA40249,0xFFA40249,0xFFA40249,0xFF940050,0xFF940050,0xF58C0001,0x9BFC0480,0x9BFC0480,0x9BFC0480,0xFF700115,0xFF700115, -0xF75C0001,0xCFF80480,0xCFF80480,0xF4CC0001,0xDC000482,0x9BFC0480,0x9BFC0480,0x9BFC0480,0xFF700115,0xFF700115,0xF75C0001,0xCFF80480,0xCFF80480,0xF4CC0001,0xDC000482,0xCFF80480,0xCFF80480,0xF4CC0001,0xDC000482,0xDC000482,0xFBB403F5,0xF5B80424,0x1BC0480,0xFFA40384,0xFF9802B9,0xFF7C020A,0xFF7C0172,0xFF5000DD,0xFDB003F5,0xFF980371,0xC1FC0480,0xF4CC0001, -0xC1FC0480,0x1F40029,0xFFF40019,0xFFF00008,0xFFEC0000,0xF1FC0029,0xFFE40010,0xFFE40000,0xF9F80029,0xFFC80000,0xFA000029,0xF1FC0029,0xFFE40010,0xFFE40000,0xF9F80029,0xFFC80000,0xFA000029,0xF9F80029,0xFFC80000,0xFA000029,0xFA000029,0xF1FC0029,0xFFE40010,0xFFE40000,0xF9F80029,0xFFC80000,0xFA000029,0xF9F80029,0xFFC80000,0xFA000029,0xFA000029,0xF9F80029, -0xFFC80000,0xFA000029,0xFA000029,0xFA000029,0xFDF40020,0xB7FC0029,0xF3F40029,0xFFEC001D,0xFFE80014,0xFFDC000A,0xFFD00000,0xFFB80000,0xFFF00022,0xFFF00020,0xFFD80001,0xFA000029,0xF7FC0029,0x1540620,0x1540620,0x1540620,0x1540620,0x1540620,0x1540620,0x1540620,0x1540620,0x1540620,0x1540620,0xFF3000F4,0xFF3000F4,0xFF3000F4,0xFF3000F4,0xFF3000F4, -0xFF3000F4,0xE11C0000,0xE11C0000,0xE11C0000,0xC51C0001,0x1F80620,0x1F80620,0x1F80620,0x1F80620,0x1F80620,0x1F80620,0xF8B40001,0xF8B40001,0xF8B40001,0xC6E40001,0x81FC0620,0x81FC0620,0x81FC0620,0xC4380001,0xA8000622,0xFF4C04E4,0x1540620,0x1540620,0xFD48039D,0xFF3C0290,0xFF3401B1,0xFF3401B1,0xFF240050,0xFF3C0488,0xFF340355,0xFF0C0000,0xF8B40001, -0x5FFC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table188[] = { -0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x65F80000, 
-0x65F80000,0x65F80000,0x65F80000,0x96000000,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x9400000,0x9400000,0x9400000,0x1C00000,0x3DF80000,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x19C0001,0x6FFC0000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0x6FFC0000, -0x6FFC0000,0xB9F80000,0xB9F80000,0xB9F80000,0xCE000000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0x6FFC0000,0xB9F80000,0xB9F80000,0xB9F80000,0xCE000000,0xB9F80000,0xB9F80000,0xB9F80000,0xCE000000,0xCE000000,0x5B80000,0x19C0001,0x19C0001,0x3E00000,0x15FC0000,0x4DFC0000,0x4DFC0000,0x8BFC0000,0x3E00000,0x15FC0000,0xA5FC0000,0xB9F80000, -0xA5FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1B01CB4,0xFFA4190F,0xFF9C1633,0xFF98152B,0xFF98140A,0xFF8C0FFF,0xFF800E56,0xFF700A74,0xFF700863,0xFF640622,0xFF8C144D,0xFF740F22,0xFF680D11,0xFF5807AB,0xFF4C04BA,0xFF44013D,0xFF4C08DC,0xFF380519,0xFB28008D,0xEF2C0601,0x8DFC1CB0,0xFF701751,0xFF68152C,0xFF4C0EB2,0xFF380AC2,0xFF1C0620,0xFF200DF9,0xFEFC0829,0xFECC0005,0xEED80600,0xC7FC1CB0, -0xFEC8152B,0xFE2C0620,0xEC00064C,0xD8001CB0,0xFFA0185A,0xFFAC1AFA,0xFFAC1BE7,0xFF9413DF,0xFF780E8B,0xFF680924,0xFF5C06CB,0xFF400302,0xFF98183A,0xFF801312,0xFF3C08BF,0xFECC0005,0xB9FC1CB0,0x1DC0603,0xFFD40556,0xFFCC04C2,0xFFCC0482,0xFFD00433,0xFFC40336,0xFFBC02C5,0xFFAC0183,0xFFAC00DA,0xFF9C0001,0xCFFC0600,0xFFC004F6,0xFFB80480,0xFFA002C2,0xFF8801A9, -0xFF700000,0xE7FC0600,0xFF6C0480,0xFED80000,0xEE000600,0xCFFC0600,0xFFC004F6,0xFFB80480,0xFFA002C2,0xFF8801A9,0xFF700000,0xE7FC0600,0xFF6C0480,0xFED80000,0xEE000600,0xE7FC0600,0xFF6C0480,0xFED80000,0xEE000600,0xEE000600,0xFFD80565,0xF7DC05C1,0xF7DC05E2,0xFFC404EB,0xFFC003FE,0xFFA802E1,0xFFA00212,0xFF7C0172,0xFFD00575,0xFFC804AC,0xFF980489,0xFED80000, -0xE1FC0600,0x198152B,0x198152B,0x198152B,0x198152B,0xFF800E56,0xFF800E56,0xFF800E56,0xFF700863,0xFF700863,0xFF640622,0xFF680D11,0xFF680D11,0xFF680D11,0xFF4C04BA,0xFF4C04BA,0xFF44013D,0xFF380519,0xFF380519,0xF924007D,0xE72C0481,0x65FC152B,0x65FC152B,0x65FC152B,0xFF380AC2,0xFF380AC2,0xFF1C0620,0xFEFC0829,0xFEFC0829,0xFECC0005,0xE6E00480,0xB3FC152B, -0xB3FC152B,0xFE2C0620,0xE6000490,0xCC00152C,0xFF901212,0xFF8C13EB,0x198152B,0xFF800EF9,0xFF6C0B5E,0xFF5C082E,0xFF5C06CB,0xFF400302,0xFF8011D1,0xFF740E66,0xFF3C089B,0xFECC0005,0x9FFC152B,0x1CC0482,0x1CC0482,0x1CC0482,0x1CC0482,0xFFBC02C5,0xFFBC02C5,0xFFBC02C5,0xFFAC00DA,0xFFAC00DA,0xFF9C0001,0xB7FC0480,0xB7FC0480,0xB7FC0480,0xFF8801A9,0xFF8801A9, 
-0xFF700000,0xDDF40480,0xDDF40480,0xFED80000,0xE6000480,0xB7FC0480,0xB7FC0480,0xB7FC0480,0xFF8801A9,0xFF8801A9,0xFF700000,0xDDF40480,0xDDF40480,0xFED80000,0xE6000480,0xDDF40480,0xDDF40480,0xFED80000,0xE6000480,0xE6000480,0xFDC80422,0xFFCC0422,0x1CC0482,0xFDC003CA,0xFFAC0321,0xFFA0028D,0xFFA00212,0xFF7C0172,0xFFC40422,0xFFB003C5,0xD3FC0480,0xFED80000, -0xD3FC0480,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1640622,0x1640622,0x1640622,0x1640622,0x1640622,0x1640622,0x1640622,0x1640622,0x1640622,0x1640622,0xFF44013D,0xFF44013D,0xFF44013D,0xFF44013D,0xFF44013D, -0xFF44013D,0xEB2C0001,0xEB2C0001,0xEB2C0001,0xCF2C0001,0x19FC0620,0x19FC0620,0x19FC0620,0x19FC0620,0x19FC0620,0x19FC0620,0xFECC0005,0xFECC0005,0xFECC0005,0xCEF80000,0x8FF80620,0x8FF80620,0x8FF80620,0xCE480000,0xB2000620,0xF9600515,0x1640622,0x1640622,0xFF5803CA,0xFF5002DA,0xFF4C0202,0xFF4C0202,0xFF400091,0xFD5404B1,0xFF50039D,0xFF24000D,0xFECC0005, -0x71FC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table189[] = { -0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x71F80000, -0x71F80000,0x71F80000,0x71F80000,0x9E000000,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x1540000,0x1540000,0x1540000,0x1D80000,0x4BFC0000,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x1AC0001,0x87FC0000,0x87FC0000,0x87FC0000,0x87FC0000,0x87FC0000, -0x87FC0000,0xC5F80000,0xC5F80000,0xC5F80000,0xD6000000,0x87FC0000,0x87FC0000,0x87FC0000,0x87FC0000,0x87FC0000,0x87FC0000,0xC5F80000,0xC5F80000,0xC5F80000,0xD6000000,0xC5F80000,0xC5F80000,0xC5F80000,0xD6000000,0xD6000000,0xDC80000,0x1AC0001,0x1AC0001,0x1F40000,0x3DFC0000,0x6BFC0000,0x6BFC0000,0x9DFC0000,0x1F40000,0x3DFC0000,0xB5FC0000,0xC5F80000, -0xB5FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x1B81834,0xFFB01547,0xFFA812FF,0xFFA4122B,0xFFA4114A,0xFF980E07,0xFF8C0CB6,0xFF880994,0xFF7C07EB,0xFF740622,0xFF981115,0xFF800CE2,0xFF800B29,0xFF7006D3,0xFF64046A,0xFF500195,0xFF580704,0xFF4C03F3,0xFD380035,0xF13C042D,0x99FC1830,0xFF7C13E9,0xFF78122B,0xFF580CF2,0xFF4C09E9,0xFF340620,0xFF380B89,0xFF1406C9,0xFEE40025,0xF0EC042D,0xCDFC1830, -0xFEEC122B,0xFE600620,0xF0000435,0xDC001830,0xFFA814D6,0xFFAC171A,0xF5B817AC,0xFF9810D6,0xFF880C93,0xFF700806,0xFF68061E,0xFF5802DE,0xFFA014B2,0xFF901042,0xFF58074F,0xFEE40025,0xBFFC1830,0x1E4042B,0xFFDC03AB,0xFFD80346,0xFFD40322,0xFFD002E3,0xFFC8023E,0xFFC401E2,0xFFB8010B,0xFFB80092,0xFFAC0001,0xD9FC042B,0xFFCC0372,0xFFC00322,0xFFAC01E2,0xFFA00121, -0xFF880000,0xEDF8042B,0xFF840320,0xFF080000,0xF000042C,0xD9FC042B,0xFFCC0372,0xFFC00322,0xFFAC01E2,0xFFA00121,0xFF880000,0xEDF8042B,0xFF840320,0xFF080000,0xF000042C,0xEDF8042B,0xFF840320,0xFF080000,0xF000042C,0xF000042C,0xFFD803C5,0xF9E003F9,0xFBE40412,0xFFD40355,0xFFC802BD,0xFFB801F5,0xFFA80161,0xFF8C0105,0xFFDC03C5,0xFFC8034C,0xFFA80329,0xFF080000, -0xE7FC042B,0x1A4122B,0x1A4122B,0x1A4122B,0x1A4122B,0xFF8C0CB6,0xFF8C0CB6,0xFF8C0CB6,0xFF7C07EB,0xFF7C07EB,0xFF740622,0xFF800B29,0xFF800B29,0xFF800B29,0xFF64046A,0xFF64046A,0xFF500195,0xFF4C03F3,0xFF4C03F3,0xFB3C002A,0xEB3C0321,0x77FC122B,0x77FC122B,0x77FC122B,0xFF4C09E9,0xFF4C09E9,0xFF340620,0xFF1406C9,0xFF1406C9,0xFEE40025,0xEAF40320,0xBDF8122B, -0xBDF8122B,0xFE600620,0xEA040320,0xD000122C,0xFD9C0FA9,0xF9A01122,0x1A4122B,0xFF900D02,0xFF8009FE,0xFF700742,0xFF68061E,0xFF5802DE,0xFF940F4C,0xFF800CA1,0xFF580736,0xFEE40025,0xABFC122B,0x1D40322,0x1D40322,0x1D40322,0x1D40322,0xFFC401E2,0xFFC401E2,0xFFC401E2,0xFFB80092,0xFFB80092,0xFFAC0001,0xC3FC0320,0xC3FC0320,0xC3FC0320,0xFFA00121,0xFFA00121, -0xFF880000,0xE1FC0320,0xE1FC0320,0xFF080000,0xEA000320,0xC3FC0320,0xC3FC0320,0xC3FC0320,0xFFA00121,0xFFA00121,0xFF880000,0xE1FC0320,0xE1FC0320,0xFF080000,0xEA000320,0xE1FC0320,0xE1FC0320,0xFF080000,0xEA000320,0xEA000320,0xFBD002D4,0xFFCC02F2,0x1D40322,0xFFC4029A,0xFFC00225,0xFFB001BD,0xFFA80161,0xFF8C0105,0xFDCC02D4,0xFFC80288,0xDBFC0320,0xFF080000, -0xDBFC0320,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1740622,0x1740622,0x1740622,0x1740622,0x1740622,0x1740622,0x1740622,0x1740622,0x1740622,0x1740622,0xFF500195,0xFF500195,0xFF500195,0xFF500195,0xFF500195, -0xFF500195,0xF33C0001,0xF33C0001,0xF33C0001,0xD73C0001,0x31FC0620,0x31FC0620,0x31FC0620,0x31FC0620,0x31FC0620,0x31FC0620,0xFEE40025,0xFEE40025,0xFEE40025,0xD7080000,0x9BF80620,0x9BF80620,0x9BF80620,0xD6580000,0xBA000620,0xFF6C0521,0x1740622,0x1740622,0xFF6803F9,0xFF640322,0xFF5C024A,0xFF5C024A,0xFF4C00DA,0xF96804E2,0xFF5803E8,0xFF38002D,0xFEE40025, -0x7FFC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table190[] = { -0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x7DF80000, 
-0x7DF80000,0x7DF80000,0x7DF80000,0xA6000000,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x1640000,0x1640000,0x1640000,0x1F00000,0x5BFC0000,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x1BC0001,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000, -0x9FFC0000,0xD1F80000,0xD1F80000,0xD1F80000,0xDE000000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0x9FFC0000,0xD1F80000,0xD1F80000,0xD1F80000,0xDE000000,0xD1F80000,0xD1F80000,0xD1F80000,0xDE000000,0xDE000000,0x1DC0000,0x1BC0001,0x1BC0001,0x1FFC0000,0x63FC0000,0x89FC0000,0x89FC0000,0xB1FC0000,0x1FFC0000,0x63FC0000,0xC3FC0000,0xD1F80000, -0xC3FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1C01434,0xFFBC11F7,0xFFB4102B,0xFFAC0F83,0xFFB00EDA,0xFFA40C47,0xFF980B46,0xFF9408C4,0xFF88078B,0xFF840622,0xFFA40E45,0xFF8C0AF2,0xFF8C0989,0xFF7C0613,0xFF700432,0xFF6801ED,0xFF64058C,0xFF58030B,0xFD4C000A,0xF54C02AD,0xA5FC1430,0xFF9410D1,0xFF880F80,0xFF700B62,0xFF640911,0xFF4C0620,0xFF4C09AB,0xFF2C05A9,0xFF080059,0xF50002AD,0xD3FC1430, -0xFF080F80,0xFE900620,0xF40C02AC,0xE0001430,0xFFB4117B,0xF9C01344,0xF9C013BC,0xFFAC0E6E,0xFF940AD2,0xFF7C073E,0xFF7805B3,0xFF6C02C3,0xFFB4111B,0xFF980DEE,0xFF70060B,0xFF080059,0xC7FC1430,0x1E802AB,0xFDE40263,0xFFE0021B,0xFFDC0202,0xFFDC01D3,0xFFD4016E,0xFFD00132,0xFFCC00B1,0xFFC00065,0xFFBC0001,0xDFFC02AB,0xFFD80236,0xFFCC0202,0xFFB80132,0xFFAC00B9, -0xFFA00000,0xEFFC02AB,0xFF9C0200,0xFF380000,0xF40002AC,0xDFFC02AB,0xFFD80236,0xFFCC0202,0xFFB80132,0xFFAC00B9,0xFFA00000,0xEFFC02AB,0xFF9C0200,0xFF380000,0xF40002AC,0xEFFC02AB,0xFF9C0200,0xFF380000,0xF40002AC,0xF40002AC,0xFFE4026D,0xFDE80281,0xFDE80296,0xFFDC0215,0xFFD401C2,0xFFBC0152,0xFFB800E8,0xFFA8009D,0xFFE00272,0xFFDC020E,0xFFB80204,0xFF380000, -0xEBFC02AB,0x1AC0F83,0x1AC0F83,0x1AC0F83,0x1AC0F83,0xFF980B46,0xFF980B46,0xFF980B46,0xFF88078B,0xFF88078B,0xFF840622,0xFF8C0989,0xFF8C0989,0xFF8C0989,0xFF700432,0xFF700432,0xFF6801ED,0xFF58030B,0xFF58030B,0xFD4C0006,0xEF4C0201,0x87FC0F80,0x87FC0F80,0x87FC0F80,0xFF640911,0xFF640911,0xFF4C0620,0xFF2C05A9,0xFF2C05A9,0xFF080059,0xEF080200,0xC5F80F80, -0xC5F80F80,0xFE900620,0xEE280200,0xD6000F80,0xFFA00D61,0xFFAC0E96,0x1AC0F83,0xFF980B42,0xFF8C08EE,0xFF7C0695,0xFF7805B3,0xFF6C02C3,0xFFA00D29,0xFF900AF9,0xFF7005FB,0xFF080059,0xB5FC0F80,0x1DC0202,0x1DC0202,0x1DC0202,0x1DC0202,0xFFD00132,0xFFD00132,0xFFD00132,0xFFC00065,0xFFC00065,0xFFBC0001,0xCFFC0200,0xCFFC0200,0xCFFC0200,0xFFAC00B9,0xFFAC00B9, 
-0xFFA00000,0xE7FC0200,0xE7FC0200,0xFF380000,0xEE000200,0xCFFC0200,0xCFFC0200,0xCFFC0200,0xFFAC00B9,0xFFAC00B9,0xFFA00000,0xE7FC0200,0xE7FC0200,0xFF380000,0xEE000200,0xE7FC0200,0xE7FC0200,0xFF380000,0xEE000200,0xEE000200,0xFFD801C4,0xF7DC01E1,0x1DC0202,0xFDD801A5,0xFFC80164,0xFFBC0121,0xFFB800E8,0xFFA8009D,0xFFD001D4,0xFBD401A5,0xE1FC0200,0xFF380000, -0xE1FC0200,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1840622,0x1840622,0x1840622,0x1840622,0x1840622,0x1840622,0x1840622,0x1840622,0x1840622,0x1840622,0xFF6801ED,0xFF6801ED,0xFF6801ED,0xFF6801ED,0xFF6801ED, -0xFF6801ED,0xFB4C0001,0xFB4C0001,0xFB4C0001,0xDF4C0001,0x49FC0620,0x49FC0620,0x49FC0620,0x49FC0620,0x49FC0620,0x49FC0620,0xFF080059,0xFF080059,0xFF080059,0xDF180000,0xA7F80620,0xA7F80620,0xA7F80620,0xDE680000,0xC2000620,0xF980054A,0x1840622,0x1840622,0xFF740442,0xFF780372,0xFF7002B1,0xFF7002B1,0xFF5C0131,0xFF7404EA,0xFD740422,0xFF500075,0xFF080059, -0x8FFC0620,}; -static const uint32_t g_etc1_to_bc7_m6_table191[] = { -0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0x89F80000, -0x89F80000,0x89F80000,0x89F80000,0xAE000000,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x3740000,0x3740000,0x3740000,0xDFC0000,0x69FC0000,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000, -0xB7FC0000,0xDDF40000,0xDDF40000,0xDDF40000,0xE6000000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xDDF40000,0xE6000000,0xDDF40000,0xDDF40000,0xDDF40000,0xE6000000,0xE6000000,0x1EC0000,0x1CC0001,0x1CC0001,0x57FC0000,0x8BFC0000,0xA7FC0000,0xA7FC0000,0xC5FC0000,0x57FC0000,0x8BFC0000,0xD3FC0000,0xDDF40000, -0xD3FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x1C810B4,0xFFC40EF4,0xFFB80DA4,0xFFB80D2B,0xFFB00CBA,0xFFB00ABF,0xFFA40A06,0xFFA00814,0xFF9C0732,0xFF940622,0xFFB00BDD,0xFF980952,0xFF980831,0xFF88058B,0xFF84041A,0xFF74025D,0xFF7C045C,0xFF700263,0xFF600005,0xF75C0181,0xB1FC10B0,0xFFA00E21,0xFF980D2C,0xFF7C0A12,0xFF700851,0xFF640620,0xFF6407F3,0xFF3804E9,0xFF2000A9,0xF7180180,0xD9FC10B0, -0xFF280D2B,0xFEC00620,0xF6380180,0xE40010B0,0xFFBC0E86,0xFDC80FE4,0xFDC8104C,0xFFAC0C2E,0xFFA00963,0xFF90068E,0xFF900555,0xFF7402DE,0xFFB40E4B,0xFFB00BC7,0xFF7C0521,0xFF2000A9,0xCFFC10B0,0x1EC0183,0xFFE80153,0xFFE40132,0xFFE40122,0xFFE8010B,0xFFDC00CE,0xFFDC00AA,0xFFD80061,0xFFD4003A,0xFFCC0001,0xE9FC0180,0xFFE00141,0xFFD80122,0xFFCC00B1,0xFFC0006D, -0xFFB80000,0xF3FC0180,0xFFB40120,0xFF6C0000,0xF6000180,0xE9FC0180,0xFFE00141,0xFFD80122,0xFFCC00B1,0xFFC0006D,0xFFB80000,0xF3FC0180,0xFFB40120,0xFF6C0000,0xF6000180,0xF3FC0180,0xFFB40120,0xFF6C0000,0xF6000180,0xF6000180,0xFBEC0161,0xFFEC0161,0xFFEC0172,0xFFE00143,0xFFDC00F9,0xFFD000C3,0xFFD00082,0xFFB8005A,0xF9EC0161,0xFFDC012E,0xFFD00123,0xFF6C0000, -0xF1FC0180,0x1B80D2B,0x1B80D2B,0x1B80D2B,0x1B80D2B,0xFFA40A06,0xFFA40A06,0xFFA40A06,0xFF9C0732,0xFF9C0732,0xFF940622,0xFF980831,0xFF980831,0xFF980831,0xFF84041A,0xFF84041A,0xFF74025D,0xFF700263,0xFF700263,0xFF600005,0xF35C0121,0x95FC0D2B,0x95FC0D2B,0x95FC0D2B,0xFF700851,0xFF700851,0xFF640620,0xFF3804E9,0xFF3804E9,0xFF2000A9,0xF31C0120,0xCBFC0D2B, -0xCBFC0D2B,0xFEC00620,0xF2480120,0xDC000D2C,0xFFAC0B7E,0xF5B80CA3,0x1B80D2B,0xFFA409EB,0xFF9407DE,0xFF900615,0xFF900555,0xFF7402DE,0xFDB00B5E,0xFF9809A5,0xFF7C0511,0xFF2000A9,0xBFF80D2B,0x1E40122,0x1E40122,0x1E40122,0x1E40122,0xFFDC00AA,0xFFDC00AA,0xFFDC00AA,0xFFD4003A,0xFFD4003A,0xFFCC0001,0xDBFC0120,0xDBFC0120,0xDBFC0120,0xFFC0006D,0xFFC0006D, -0xFFB80000,0xEDFC0120,0xEDFC0120,0xFF6C0000,0xF2000120,0xDBFC0120,0xDBFC0120,0xDBFC0120,0xFFC0006D,0xFFC0006D,0xFFB80000,0xEDFC0120,0xEDFC0120,0xFF6C0000,0xF2000120,0xEDFC0120,0xEDFC0120,0xFF6C0000,0xF2000120,0xF2000120,0xF7E40109,0xFBE40109,0x1E40122,0xFFDC00E1,0xFFD400C1,0xFFD000AA,0xFFD00082,0xFFB8005A,0xF5E40109,0xFFDC00DD,0xE9FC0120,0xFF6C0000, -0xE9FC0120,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1940622,0x1940622,0x1940622,0x1940622,0x1940622,0x1940622,0x1940622,0x1940622,0x1940622,0x1940622,0xFF74025D,0xFF74025D,0xFF74025D,0xFF74025D,0xFF74025D, -0xFF74025D,0xFF600005,0xFF600005,0xFF600005,0xE75C0001,0x63FC0620,0x63FC0620,0x63FC0620,0x63FC0620,0x63FC0620,0x63FC0620,0xFF2000A9,0xFF2000A9,0xFF2000A9,0xE7280000,0xB3F80620,0xB3F80620,0xB3F80620,0xE6780000,0xCA000620,0xFF8C055A,0x1940622,0x1940622,0xFF840479,0xFF8003C5,0xFF800305,0xFF800305,0xFF700195,0xFD8C0515,0xFF80045D,0xFF6000CD,0xFF2000A9, -0x9FF80620,}; -static const uint32_t g_etc1_to_bc7_m6_table192[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, 
-0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x400001,0x400001,0x400001,0x400001,0x600000,0x600000,0x600000,0xC40000,0xC40000,0x20000000,0x600000,0x600000,0x600000,0xC40000,0xC40000,0x20000000,0xC40000,0xC40000,0x20000000,0x20000000,0x600000,0x600000,0x600000,0xC40000,0xC40000,0x20000000,0xC40000,0xC40000,0x20000000,0x20000000,0xC40000, -0xC40000,0x20000000,0x20000000,0x20000000,0x4C0000,0xA440000,0x400001,0x580000,0x26C0000,0x8C0000,0xA00000,0xF00000,0x4500000,0x600000,0x8C0000,0x20000000,0x8C0000,0xD40000,0x13C0000,0x21FC0000,0x68000001,0x13C0000,0x21FC0000,0x68000001,0x21FC0000,0x68000001,0x68000001,0x13C0000,0x21FC0000,0x68000001,0x21FC0000,0x68000001, -0x68000001,0x21FC0000,0x68000001,0x68000001,0x68000001,0x13C0000,0x21FC0000,0x68000001,0x21FC0000,0x68000001,0x68000001,0x21FC0000,0x68000001,0x68000001,0x68000001,0x21FC0000,0x68000001,0x68000001,0x68000001,0x68000001,0x3080000,0x8E00000,0x8E00000,0x1640000,0x5FC0000,0x45F40000,0x68000001,0x68000001,0x1200000,0x18C0000,0x61D40000,0x68000001, -0x1C00000,0x441D49,0xFE1C039A,0x981402D9,0x681402DA,0xD0000A69,0x92000112,0x68000002,0x66000A69,0x560003DA,0x44000A69,0x8A0016FD,0x7A000882,0x62000432,0x58000E66,0x520006FB,0x40000C4A,0x440016FD,0x44000E46,0x38001145,0x2E0016FE,0x681D47,0x68000DD2,0x580007C3,0x52001187,0x4C000994,0x40000E03,0x400018C6,0x3E001027,0x360012AA,0x2E0017DF,0xD01D47, -0x380014C2,0x3200161D,0x26001A42,0x22001D47,0xFE140B92,0xF4381671,0xF8401611,0xC80006E0,0x8A000732,0x660006FD,0x5A0004DA,0x4A000929,0xF2000CF6,0x9E00098E,0x52000B32,0x360012AA,0x941D47,0x5C16FD,0xFE240289,0x94200222,0x68200222,0xD0000A69,0x92000112,0x68000002,0x66000A69,0x560003DA,0x44000A69,0x8816FD,0x7A000882,0x62000432,0x58000E66,0x520006FB, -0x40000C4A,0x11416FD,0x44000E46,0x38001145,0x2E0016FE,0x8816FD,0x7A000882,0x62000432,0x58000E66,0x520006FB,0x40000C4A,0x11416FD,0x44000E46,0x38001145,0x2E0016FE,0x11416FD,0x44000E46,0x38001145,0x2E0016FE,0x2E0016FE,0xFE140B2E,0xFC481271,0xFE4C1121,0xC80006E0,0x8A000732,0x660006FD,0x5A0004DA,0x4A000929,0xFA000BFE,0x9E00092A,0x52000B19,0x38001145, -0xC016FD,0x1402D9,0x1402D9,0x1402D9,0x1402D9,0x64000000,0x64000000,0x64000000,0x30000000,0x30000000,0x20000000,0x30000221,0x30000221,0x30000221,0x280000C2,0x280000C2,0x1E000068,0x18000221,0x18000221,0x16000145,0x10000221,0x2002D6,0x2002D6,0x2002D6,0x22000143,0x22000143,0x180000C0,0x1200025D,0x1200025D,0x14000185,0xE00023E,0x3C02D6, -0x3C02D6,0x100001F2,0xE000289,0xA0002D6,0xC40000A9,0xFE0C00C1,0x1402D9,0x640000E8,0x3C0000D0,0x2E0000D0,0x2E0000A9,0x220000F5,0x64000178,0x44000138,0x1E000225,0x14000185,0x2C02D6,0x200221,0x200221,0x200221,0x200221,0x64000000,0x64000000,0x64000000,0x30000000,0x30000000,0x20000000,0x300221,0x300221,0x300221,0x280000C2,0x280000C2, -0x1E000068,0x5C0221,0x5C0221,0x16000145,0x10000221,0x300221,0x300221,0x300221,0x280000C2,0x280000C2,0x1E000068,0x5C0221,0x5C0221,0x16000145,0x10000221,0x5C0221,0x5C0221,0x16000145,0x10000221,0x10000221,0xC40000A9,0xFE0C009D,0x200221,0x640000E8,0x3C0000D0,0x2E0000D0,0x2E0000A9,0x220000F5,0x74000151,0x4A000121,0x400221,0x16000145, -0x400221,0x8C0A69,0xFC440001,0x8C440001,0x68400002,0xCC0A69,0x92000112,0x68000002,0x1A00A69,0x560003DA,0x44000A69,0xCC0A69,0x92000112,0x68000002,0x1A00A69,0x560003DA,0x44000A69,0x1A00A69,0x560003DA,0x44000A69,0x44000A69,0xCC0A69,0x92000112,0x68000002,0x1A00A69,0x560003DA,0x44000A69,0x1A00A69,0x560003DA,0x44000A69,0x44000A69,0x1A00A69, 
-0x560003DA,0x44000A69,0x44000A69,0x44000A69,0xFE3406B2,0x940A69,0xF8800745,0xDE000385,0x96000410,0x720003D4,0x5E00028A,0x54000502,0xFE180659,0xBA000454,0x64000232,0x44000A69,0x1240A69,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table193[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x500001,0x500001,0x500001,0x500001,0x780000,0x780000,0x780000,0xF40000,0xF40000,0x28000000,0x780000,0x780000,0x780000,0xF40000,0xF40000,0x28000000,0xF40000,0xF40000,0x28000000,0x28000000,0x780000,0x780000,0x780000,0xF40000,0xF40000,0x28000000,0xF40000,0xF40000,0x28000000,0x28000000,0xF40000, -0xF40000,0x28000000,0x28000000,0x28000000,0x600000,0x580000,0x500001,0x700000,0x880000,0xAC0000,0xC80000,0x12C0000,0x4640000,0x780000,0xAC0000,0x28000000,0xAC0000,0xE40000,0x3500000,0x2DFC0000,0x70000001,0x3500000,0x2DFC0000,0x70000001,0x2DFC0000,0x70000001,0x70000001,0x3500000,0x2DFC0000,0x70000001,0x2DFC0000,0x70000001, -0x70000001,0x2DFC0000,0x70000001,0x70000001,0x70000001,0x3500000,0x2DFC0000,0x70000001,0x2DFC0000,0x70000001,0x70000001,0x2DFC0000,0x70000001,0x70000001,0x70000001,0x2DFC0000,0x70000001,0x70000001,0x70000001,0x70000001,0x31C0000,0xF40000,0xF40000,0x1800000,0x13F80000,0x4FF40000,0x70000001,0x70000001,0x3340000,0x1AC0000,0x69E40000,0x70000001, -0x1E40000,0x4C21E1,0xFE2405DD,0xA21C0461,0x701C0462,0xEA000A69,0xA20000A0,0x7200000A,0x72000A69,0x60000361,0x4C000A69,0x9C001A0D,0x860009AA,0x68000506,0x68000F41,0x58000723,0x46000CCE,0x4C001A0D,0x4A001006,0x3E0012F1,0x32001A0E,0x7421DF,0x7400101A,0x62000932,0x62001345,0x52000A54,0x46000EDF,0x46001C46,0x44001257,0x3C0014AA,0x30001B22,0xE821DF, -0x3E0017F2,0x320018ED,0x2C001E16,0x260021DF,0xFE140E42,0xF8401A29,0xFC481A39,0xDE000716,0x9A000789,0x7000072D,0x5E0004BB,0x560009B3,0xFE000ECD,0xB6000A4B,0x56000C76,0x3C0014AA,0xA421DF,0x681A0D,0xFE300425,0x9E28034A,0x7028034A,0xEA000A69,0xA20000A0,0x72040009,0x72000A69,0x60000361,0x4C000A69,0x2981A0D,0x860009AA,0x68000506,0x68000F41,0x58000723, -0x46000CCE,0x1381A0D,0x4A001006,0x3E0012F1,0x32001A0E,0x2981A0D,0x860009AA,0x68000506,0x68000F41,0x58000723,0x46000CCE,0x1381A0D,0x4A001006,0x3E0012F1,0x32001A0E,0x1381A0D,0x4A001006,0x3E0012F1,0x32001A0E,0x32001A0E,0xFE200D73,0xFE4C1521,0xF860142A,0xDE000716,0x9A000789,0x7000072D,0x5E0004BB,0x560009B3,0xFE000DCD,0xBC0009CE,0x56000C5D,0x3E0012F1, -0xDC1A0D,0x1C0461,0x1C0461,0x1C0461,0x1C0461,0x7C000000,0x7C000000,0x7C000000,0x3C000000,0x3C000000,0x28000000,0x3C000349,0x3C000349,0x3C000349,0x2E000132,0x2E000132,0x2200009D,0x1E000349,0x1E000349,0x1C0001F9,0x14000349,0x280461,0x280461,0x280461,0x280001FB,0x280001FB,0x22000116,0x180003A1,0x180003A1,0x1A000259,0x12000371,0x500461, -0x500461,0x16000306,0x100003DA,0xC000462,0xF6000105,0xFE0C01B1,0x1C0461,0x7A000161,0x50000140,0x40000145,0x38000112,0x2A000172,0x82000248,0x500001DD,0x2400034D,0x1A000259,0x380461,0x280349,0x280349,0x280349,0x280349,0x7C000000,0x7C000000,0x7C000000,0x3C000000,0x3C000000,0x28000000,0x3C0349,0x3C0349,0x3C0349,0x2E000132,0x2E000132, 
-0x2200009D,0x740349,0x740349,0x1C0001F9,0x14000349,0x3C0349,0x3C0349,0x3C0349,0x2E000132,0x2E000132,0x2200009D,0x740349,0x740349,0x1C0001F9,0x14000349,0x740349,0x740349,0x1C0001F9,0x14000349,0x14000349,0xF6000105,0xF4180154,0x280349,0x7A000161,0x50000140,0x40000145,0x38000112,0x2A000172,0x82000208,0x5A0001BD,0x540349,0x1C0001F9, -0x540349,0x9C0A69,0xFE540005,0x94540001,0x70500002,0xE40A69,0xA20000A0,0x700C0001,0x1D00A69,0x60000361,0x4C000A69,0xE40A69,0xA20000A0,0x700C0001,0x1D00A69,0x60000361,0x4C000A69,0x1D00A69,0x60000361,0x4C000A69,0x4C000A69,0xE40A69,0xA20000A0,0x700C0001,0x1D00A69,0x60000361,0x4C000A69,0x1D00A69,0x60000361,0x4C000A69,0x4C000A69,0x1D00A69, -0x60000361,0x4C000A69,0x4C000A69,0x4C000A69,0xFE5006CD,0xA40A69,0xFE8C0749,0xF4000304,0xA0000384,0x7C000335,0x66000209,0x5C000492,0xFE2C0694,0xCC0003E8,0x6E0001A8,0x4C000A69,0x1480A69,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table194[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x600001,0x600001,0x600001,0x600001,0x900000,0x900000,0x900000,0x1240000,0x1240000,0x30000000,0x900000,0x900000,0x900000,0x1240000,0x1240000,0x30000000,0x1240000,0x1240000,0x30000000,0x30000000,0x900000,0x900000,0x900000,0x1240000,0x1240000,0x30000000,0x1240000,0x1240000,0x30000000,0x30000000,0x1240000, -0x1240000,0x30000000,0x30000000,0x30000000,0x2700000,0x680000,0x600001,0x840000,0xA40000,0xD00000,0xEC0000,0x1680000,0x4780000,0x900000,0xD00000,0x30000000,0xD00000,0xF40000,0x3680000,0x39FC0000,0x78000001,0x3680000,0x39FC0000,0x78000001,0x39FC0000,0x78000001,0x78000001,0x3680000,0x39FC0000,0x78000001,0x39FC0000,0x78000001, -0x78000001,0x39FC0000,0x78000001,0x78000001,0x78000001,0x3680000,0x39FC0000,0x78000001,0x39FC0000,0x78000001,0x78000001,0x39FC0000,0x78000001,0x78000001,0x78000001,0x39FC0000,0x78000001,0x78000001,0x78000001,0x78000001,0x3300000,0x1040000,0x1040000,0x1980000,0x1FFC0000,0x59F40000,0x78000001,0x78000001,0x14C0000,0x1C80000,0x71F40000,0x78000001, -0x5FC0000,0x5426F9,0xFE3008D5,0xAC200642,0x78200642,0xFE000A6D,0xAE000050,0x7A04003A,0x7E000A69,0x6C0002E9,0x54000A69,0xAC001D72,0x92000B12,0x6E000632,0x6E001055,0x62000755,0x52000D4E,0x54001D72,0x5000120E,0x440014CD,0x38001D72,0x8026F7,0x7A0012C2,0x68000B0A,0x62001515,0x5E000B3C,0x4C000FDB,0x4C00202E,0x4A0014D7,0x440016DE,0x36001EC2,0x10026F7, -0x44001B8A,0x38001C09,0x32002262,0x2A0026F7,0xFE201173,0xFC481E61,0xFE4C1EE5,0xF4000768,0xA20007D3,0x7C000746,0x660004D2,0x60000A46,0xFE00117D,0xC6000B19,0x5E000E2A,0x440016DE,0xB426F7,0x701D75,0xFE3C0631,0xA83004B2,0x783004B2,0xFE000A6D,0xAE000050,0x7C080032,0x7E000A69,0x6C0002E9,0x54000A69,0xA81D72,0x92000B12,0x6E000632,0x6E001055,0x62000755, -0x52000D4E,0x1581D72,0x5000120E,0x440014CD,0x38001D72,0xA81D72,0x92000B12,0x6E000632,0x6E001055,0x62000755,0x52000D4E,0x1581D72,0x5000120E,0x440014CD,0x38001D72,0x1581D72,0x5000120E,0x440014CD,0x38001D72,0x38001D72,0xFE340FF2,0xF8601819,0xFC68174A,0xF4000768,0xA20007D3,0x7C000746,0x660004D2,0x60000A46,0xFE08103E,0xC6000A89,0x5E000E06,0x440014CD, 
-0xF01D72,0x200641,0x200641,0x200641,0x200641,0x94000000,0x94000000,0x94000000,0x48000000,0x48000000,0x30000000,0x480004B1,0x480004B1,0x480004B1,0x3A0001BA,0x3A0001BA,0x2E0000E5,0x220004B1,0x220004B1,0x200002E4,0x180004B1,0x300641,0x300641,0x300641,0x2E0002E3,0x2E0002E3,0x28000192,0x2200052A,0x2200052A,0x1C00035D,0x180004F1,0x5C0641, -0x5C0641,0x1C000462,0x1400058D,0x10000642,0xF60001A5,0xF4180304,0x200641,0x900001F4,0x5E0001CD,0x480001CD,0x44000184,0x2E00022D,0x9600034E,0x720002BB,0x2C0004BA,0x1C00035D,0x400641,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x94000000,0x94000000,0x94000000,0x48000000,0x48000000,0x30000000,0x4804B1,0x4804B1,0x4804B1,0x3A0001BA,0x3A0001BA, -0x2E0000E5,0x8C04B1,0x8C04B1,0x200002E4,0x180004B1,0x4804B1,0x4804B1,0x4804B1,0x3A0001BA,0x3A0001BA,0x2E0000E5,0x8C04B1,0x8C04B1,0x200002E4,0x180004B1,0x8C04B1,0x8C04B1,0x200002E4,0x180004B1,0x180004B1,0xF60001A5,0xF8200244,0x3004B1,0x900001F4,0x5E0001CD,0x480001CD,0x44000184,0x2E00022D,0xA40002F2,0x7200028A,0x6404B1,0x200002E4, -0x6404B1,0xAC0A69,0xFE680012,0x9C640001,0x78600002,0xFC0A69,0xAE000050,0x781C0001,0x3F80A69,0x6C0002E9,0x54000A69,0xFC0A69,0xAE000050,0x781C0001,0x3F80A69,0x6C0002E9,0x54000A69,0x3F80A69,0x6C0002E9,0x54000A69,0x54000A69,0xFC0A69,0xAE000050,0x781C0001,0x3F80A69,0x6C0002E9,0x54000A69,0x3F80A69,0x6C0002E9,0x54000A69,0x54000A69,0x3F80A69, -0x6C0002E9,0x54000A69,0x54000A69,0x54000A69,0xF6680708,0xB80A69,0xF8A00782,0xFC0002AD,0xB600031A,0x860002D5,0x70000195,0x6400042A,0xF44806CD,0xDE000361,0x76000128,0x54000A69,0x1680A69,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table195[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x700001,0x700001,0x700001,0x700001,0xA80000,0xA80000,0xA80000,0x1580000,0x1580000,0x38000000,0xA80000,0xA80000,0xA80000,0x1580000,0x1580000,0x38000000,0x1580000,0x1580000,0x38000000,0x38000000,0xA80000,0xA80000,0xA80000,0x1580000,0x1580000,0x38000000,0x1580000,0x1580000,0x38000000,0x38000000,0x1580000, -0x1580000,0x38000000,0x38000000,0x38000000,0x840000,0x4780000,0x700001,0x2980000,0xC00000,0xF00000,0x1140000,0x1A40000,0x48C0000,0xA80000,0xF00000,0x38000000,0xF00000,0x1040000,0x3800000,0x45FC0000,0x80000001,0x3800000,0x45FC0000,0x80000001,0x45FC0000,0x80000001,0x80000001,0x3800000,0x45FC0000,0x80000001,0x45FC0000,0x80000001, -0x80000001,0x45FC0000,0x80000001,0x80000001,0x80000001,0x3800000,0x45FC0000,0x80000001,0x45FC0000,0x80000001,0x80000001,0x45FC0000,0x80000001,0x80000001,0x80000001,0x45FC0000,0x80000001,0x80000001,0x80000001,0x80000001,0x3440000,0x3140000,0x3140000,0x1B40000,0x2DFC0000,0x63F40000,0x80000001,0x80000001,0x1600000,0x1E80000,0x7BC80000,0x80000001, -0x15FC0000,0x5C2C91,0xFE300C45,0xB6280879,0x8024087A,0xFE0C0AED,0xBA000020,0x8208009A,0x8A000A69,0x72000271,0x5C000A69,0xBC00212D,0xA2000C99,0x7A000792,0x7A001185,0x680007B9,0x58000DDA,0x5C00212D,0x5600145E,0x4A0016D9,0x3E00212E,0x8C2C8F,0x860015BA,0x6E000D42,0x6E001735,0x62000C25,0x520010F7,0x5200247E,0x500017A7,0x4A00194A,0x3A0022B7,0x1182C8F, 
-0x44001F8A,0x3E001F75,0x38002726,0x2E002C8F,0xFE28156D,0xFE4C2325,0xFE4C2475,0xFE00082D,0xB6000833,0x880007C9,0x740004FA,0x60000AF6,0xFE08151E,0xDA000C0E,0x6600103A,0x4A00194A,0xC82C8F,0x7C212D,0xFE4408A6,0xB238065A,0x8038065A,0xFE0C0AC9,0xBA000020,0x840C007E,0x8A000A69,0x72000271,0x5C000A69,0xB8212D,0xA2000C99,0x7A000792,0x7A001185,0x680007B9, -0x58000DDA,0x174212D,0x5600145E,0x4A0016D9,0x3E00212E,0xB8212D,0xA2000C99,0x7A000792,0x7A001185,0x680007B9,0x58000DDA,0x174212D,0x5600145E,0x4A0016D9,0x3E00212E,0x174212D,0x5600145E,0x4A0016D9,0x3E00212E,0x3E00212E,0xFE3412F2,0xFE6C1B49,0xFE6C1AEE,0xFE00082D,0xB6000833,0x880007C9,0x740004FA,0x60000AF6,0xFE18134D,0xDA000B4A,0x66001016,0x4A0016D9, -0x108212D,0x240879,0x240879,0x240879,0x240879,0xAC000000,0xAC000000,0xAC000000,0x54000000,0x54000000,0x38000000,0x54000659,0x54000659,0x54000659,0x40000262,0x40000262,0x34000131,0x28000659,0x28000659,0x260003E8,0x1C000659,0x380876,0x380876,0x380876,0x3A0003F3,0x3A0003F3,0x2E000226,0x220006FA,0x220006FA,0x24000491,0x1A0006AE,0x700876, -0x700876,0x1C0005E2,0x16000776,0x12000876,0xFE0402B1,0xF61C0498,0x240879,0xAC0002B9,0x72000275,0x5600028A,0x4C000209,0x380002F2,0xB400047A,0x720003AB,0x34000662,0x24000491,0x500876,0x380659,0x380659,0x380659,0x380659,0xAC000000,0xAC000000,0xAC000000,0x54000000,0x54000000,0x38000000,0x540659,0x540659,0x540659,0x40000262,0x40000262, -0x34000131,0xA40659,0xA40659,0x260003E8,0x1C000659,0x540659,0x540659,0x540659,0x40000262,0x40000262,0x34000131,0xA40659,0xA40659,0x260003E8,0x1C000659,0xA40659,0xA40659,0x260003E8,0x1C000659,0x1C000659,0xFA0802AD,0xFC280374,0x380659,0xAC0002B9,0x72000275,0x5600028A,0x4C000209,0x380002F2,0xC20003FA,0x7C000371,0x740659,0x260003E8, -0x740659,0xBC0A69,0xFE78002D,0xA4740001,0x80700002,0x1140A69,0xBA000020,0x802C0001,0xFF80A69,0x72000271,0x5C000A69,0x1140A69,0xBA000020,0x802C0001,0xFF80A69,0x72000271,0x5C000A69,0xFF80A69,0x72000271,0x5C000A69,0x5C000A69,0x1140A69,0xBA000020,0x802C0001,0xFF80A69,0x72000271,0x5C000A69,0xFF80A69,0x72000271,0x5C000A69,0x5C000A69,0xFF80A69, -0x72000271,0x5C000A69,0x5C000A69,0x5C000A69,0xFE780708,0xC80A69,0xFEAC078A,0xFE1402D4,0xC000029A,0x90000254,0x7A000131,0x720003BA,0xFC5806CD,0xF40002F2,0x820000DA,0x5C000A69,0x18C0A69,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table196[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x840000,0x840000,0x840000,0x840000,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x40000001,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x40000001,0x18C0000,0x18C0000,0x40000001,0x40000001,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x40000001,0x18C0000,0x18C0000,0x40000001,0x40000001,0x18C0000, -0x18C0000,0x40000001,0x40000001,0x40000001,0x2980000,0x8C0000,0x840000,0xB40000,0xDC0000,0x1180000,0x1400000,0x1E80000,0xA40000,0xC40000,0x1180000,0x40000001,0x1180000,0x1140001,0x19C0000,0x53FC0000,0x8A000000,0x19C0000,0x53FC0000,0x8A000000,0x53FC0000,0x8A000000,0x8A000000,0x19C0000,0x53FC0000,0x8A000000,0x53FC0000,0x8A000000, 
-0x8A000000,0x53FC0000,0x8A000000,0x8A000000,0x8A000000,0x19C0000,0x53FC0000,0x8A000000,0x53FC0000,0x8A000000,0x8A000000,0x53FC0000,0x8A000000,0x8A000000,0x8A000000,0x53FC0000,0x8A000000,0x8A000000,0x8A000000,0x8A000000,0x15C0000,0x1280000,0x1280000,0x1D40000,0x3DF80000,0x6FF00000,0x8A000000,0x8A000000,0x3780000,0x9FC0000,0x83F80000,0x8A000000, -0x25FC0000,0x683375,0xFE3C10E1,0xC22C0B59,0x8A2C0B58,0xFE0C0C55,0xCC000002,0x8E080130,0x98000A69,0x7E0001F9,0x66000A69,0xCE0025C5,0xA8000EBB,0x86000984,0x86001301,0x74000831,0x62000E81,0x640025C5,0x5C001758,0x50001969,0x440025C6,0x983373,0x92001984,0x7A001024,0x7A0019E1,0x6E000D81,0x5E00125B,0x5E0029BE,0x5C001B19,0x50001C42,0x4000279F,0x1303373, -0x5000247C,0x440023B5,0x38002CEA,0x32003373,0xFE341A28,0xFE4C299D,0xF8602B08,0xFE0009DD,0xC00008C1,0x92000865,0x7C000545,0x6C000BD8,0xFE081A0A,0xDE000D62,0x7400128E,0x50001C42,0xD83373,0x8825C5,0xFE500BF4,0xBE400884,0x8A400884,0xFE180BD5,0xCC000002,0x8E140104,0x98000A69,0x7E0001F9,0x66000A69,0xC825C5,0xA8000EBB,0x86000984,0x86001301,0x74000831, -0x62000E81,0x19825C5,0x5C001758,0x50001969,0x440025C6,0xC825C5,0xA8000EBB,0x86000984,0x86001301,0x74000831,0x62000E81,0x19825C5,0x5C001758,0x50001969,0x440025C6,0x19825C5,0x5C001758,0x50001969,0x440025C6,0x440025C6,0xFE5016DA,0xFE6C1FA5,0xF8801F85,0xFE0009DD,0xC00008C1,0x92000865,0x7C000545,0x6C000BD8,0xFE24174D,0xE8000C89,0x7400125D,0x50001969, -0x12025C5,0x2C0B58,0x2C0B58,0x2C0B58,0x2C0B58,0xC8000000,0xC8000000,0xC8000000,0x60000001,0x60000001,0x40000001,0x64000882,0x64000882,0x64000882,0x4C000335,0x4C000335,0x3A00019A,0x30000882,0x30000882,0x2C00053D,0x20000882,0x400B58,0x400B58,0x400B58,0x40000556,0x40000556,0x340002EB,0x2800095D,0x2800095D,0x2A00061E,0x1E0008EE,0x800B58, -0x800B58,0x20000809,0x1C000A0B,0x14000B5B,0xFC0C044E,0xF82006C9,0x2C0B58,0xC200039D,0x7C000352,0x6200034D,0x5A0002D0,0x400003E8,0xC200060D,0x900004DE,0x3E000892,0x2A00061E,0x5C0B58,0x400884,0x400884,0x400884,0x400884,0xC8000000,0xC8000000,0xC8000000,0x60000001,0x60000001,0x40000001,0x600882,0x600882,0x600882,0x4C000335,0x4C000335, -0x3A00019A,0xC40882,0xC40882,0x2C00053D,0x20000882,0x600882,0x600882,0x600882,0x4C000335,0x4C000335,0x3A00019A,0xC40882,0xC40882,0x2C00053D,0x20000882,0xC40882,0xC40882,0x2C00053D,0x20000882,0x20000882,0xFE100422,0xFE2C052D,0x400884,0xC200039D,0x7C000352,0x6200034D,0x5A0002D0,0x400003E8,0xD6000568,0x9000048D,0x8C0882,0x2C00053D, -0x8C0882,0xCC0A69,0xFE8C0050,0xAE840000,0x8A840000,0x1300A69,0xCC000002,0x8A3C0000,0x1DF40A69,0x7E0001F9,0x66000A69,0x1300A69,0xCC000002,0x8A3C0000,0x1DF40A69,0x7E0001F9,0x66000A69,0x1DF40A69,0x7E0001F9,0x66000A69,0x66000A69,0x1300A69,0xCC000002,0x8A3C0000,0x1DF40A69,0x7E0001F9,0x66000A69,0x1DF40A69,0x7E0001F9,0x66000A69,0x66000A69,0x1DF40A69, -0x7E0001F9,0x66000A69,0x66000A69,0x66000A69,0xFC900745,0xDC0A69,0xFAC407C1,0xFE300322,0xD6000232,0x9E0001D4,0x840000CD,0x7A000340,0xFE700708,0xFC0402AD,0x8E00007D,0x66000A69,0x1B00A69,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table197[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x180000,0x180000,0x180000,0x180000,0x180000, 
-0x180000,0x2C0000,0x2C0000,0x2C0000,0x6000001,0x180000,0x180000,0x180000,0x180000,0x180000,0x180000,0x2C0000,0x2C0000,0x2C0000,0x6000001,0x2C0000,0x2C0000,0x2C0000,0x6000001,0x6000001,0x100000,0x100000,0x100000,0x4100000,0x140000,0x140000,0x140000,0x180000,0x4100000,0x140000,0x200000,0x2C0000, -0x200000,0x940000,0x940000,0x940000,0x940000,0xDC0000,0xDC0000,0xDC0000,0x1BC0000,0x1BC0000,0x48000001,0xDC0000,0xDC0000,0xDC0000,0x1BC0000,0x1BC0000,0x48000001,0x1BC0000,0x1BC0000,0x48000001,0x48000001,0xDC0000,0xDC0000,0xDC0000,0x1BC0000,0x1BC0000,0x48000001,0x1BC0000,0x1BC0000,0x48000001,0x48000001,0x1BC0000, -0x1BC0000,0x48000001,0x48000001,0x48000001,0xAC0000,0x69C0000,0x940000,0xC80000,0xF80000,0x1380000,0x1680000,0x9F80000,0xB80000,0xDC0000,0x1380000,0x48000001,0x1380000,0x1240001,0x1B40000,0x5FF80000,0x92000000,0x1B40000,0x5FF80000,0x92000000,0x5FF80000,0x92000000,0x92000000,0x1B40000,0x5FF80000,0x92000000,0x5FF80000,0x92000000, -0x92000000,0x5FF80000,0x92000000,0x92000000,0x92000000,0x1B40000,0x5FF80000,0x92000000,0x5FF80000,0x92000000,0x92000000,0x5FF80000,0x92000000,0x92000000,0x92000000,0x5FF80000,0x92000000,0x92000000,0x92000000,0x92000000,0x1700000,0x5380000,0x5380000,0x3EC0000,0x49FC0000,0x79F00000,0x92000000,0x92000000,0x1900000,0x19FC0000,0x8DCC0000,0x92000000, -0x35FC0000,0x7436D9,0xFE4413A8,0xCC380CE4,0x92380CE4,0xFE180DB5,0xD8080020,0x981001A8,0xA2080A89,0x860401D9,0x6E080A89,0xE60025C5,0xBA000D5B,0x8C0008D8,0x920011B1,0x80000671,0x68000D49,0x700025C5,0x6C001626,0x5C001831,0x4C0025C6,0xA836D7,0xA20019F4,0x800010AC,0x86001A01,0x74000CC5,0x620011B1,0x68002A9E,0x66001AB6,0x56001BC6,0x48002826,0x15836D7, -0x560025F8,0x500024DD,0x44002EAA,0x380036D7,0xFE3C1CD5,0xFA642CB1,0xFE6C2E38,0xFE080A6B,0xD000070E,0x9E000678,0x880003A5,0x760009F9,0xFE181C4A,0xFC000B9A,0x7E00111A,0x56001BC6,0xF036D7,0x9825C5,0xFE680C84,0xC6500884,0x92500884,0xFE300C45,0xD4100002,0x96240104,0xA0100A69,0x860401D5,0x6E100A69,0xE025C5,0xBA000D5B,0x8C0008D8,0x920011B1,0x80000671, -0x68000D49,0x1CC25C5,0x6C001626,0x5C001831,0x4C0025C6,0xE025C5,0xBA000D5B,0x8C0008D8,0x920011B1,0x80000671,0x68000D49,0x1CC25C5,0x6C001626,0x5C001831,0x4C0025C6,0x1CC25C5,0x6C001626,0x5C001831,0x4C0025C6,0x4C0025C6,0xFE581771,0xFC881FBD,0xFE8C1F95,0xFE080A5B,0xD000070E,0x9E000678,0x880003A5,0x760009F9,0xFE3417D5,0xFC000A9A,0x7E0010DA,0x5C001831, -0x14025C5,0x380CE4,0x380CE4,0x380CE4,0x380CE4,0xD8080020,0xD8080020,0xD8080020,0x6A080021,0x6A080021,0x48080021,0x7C000882,0x7C000882,0x7C000882,0x62000271,0x62000271,0x460000EA,0x3C000882,0x3C000882,0x380004A5,0x28000882,0x500CE3,0x500CE3,0x500CE3,0x4C000576,0x4C000576,0x400002B3,0x340009D5,0x340009D5,0x320005D6,0x2800092B,0xA00CE3, -0xA00CE3,0x260008B1,0x20000AE6,0x1A000CE3,0xFE1004E2,0xFE2C0801,0x380CE4,0xE40002E9,0x9A00028A,0x6E00029A,0x6A0001F4,0x4C000332,0xF40005C5,0xB200046A,0x4C00089B,0x320005D6,0x700CE3,0x500884,0x500884,0x500884,0x500884,0xD0100000,0xD0100000,0xD0100000,0x68100001,0x68100001,0x48100001,0x780882,0x780882,0x780882,0x62000271,0x62000271, -0x460000EA,0xF40882,0xF40882,0x380004A5,0x28000882,0x780882,0x780882,0x780882,0x62000271,0x62000271,0x460000EA,0xF40882,0xF40882,0x380004A5,0x28000882,0xF40882,0xF40882,0x380004A5,0x28000882,0x28000882,0xFE200451,0xFA440548,0x500884,0xE40002E9,0x9A00028A,0x6E00029A,0x6A0001F4,0x4C000332,0xF40004E4,0xB20003F1,0xAC0882,0x380004A5, 
-0xAC0882,0xDC0A69,0xFEA40080,0xB6940000,0x92940000,0x1480A69,0xD8080000,0x924C0000,0x27FC0A69,0x840001A5,0x6E000A69,0x1480A69,0xD8080000,0x924C0000,0x27FC0A69,0x840001A5,0x6E000A69,0x27FC0A69,0x840001A5,0x6E000A69,0x6E000A69,0x1480A69,0xD8080000,0x924C0000,0x27FC0A69,0x840001A5,0x6E000A69,0x27FC0A69,0x840001A5,0x6E000A69,0x6E000A69,0x27FC0A69, -0x840001A5,0x6E000A69,0x6E000A69,0x6E000A69,0xFE940781,0xEC0A69,0xFECC07D9,0xFE400361,0xE00001CA,0xA8000190,0x8C000088,0x820002E4,0xF6880745,0xFE1402E4,0x9600003D,0x6E000A69,0x1D40A69,0x80020,0x80020,0x80020,0x80020,0x80020,0x80020,0x80020,0x80020,0x80020,0x80020,0x16000000,0x16000000,0x16000000,0x16000000,0x16000000, -0x16000000,0xA000001,0xA000001,0xA000001,0x6000001,0xC0020,0xC0020,0xC0020,0xC0020,0xC0020,0xC0020,0x600000D,0x600000D,0x600000D,0x6000005,0x140020,0x140020,0x140020,0x4000012,0x2000022,0x78000000,0x80020,0x80020,0x36000000,0x24000000,0x1C000000,0x1C000000,0x12000000,0x36000009,0x24000004,0xE000001,0x600000D, -0x100020,}; -static const uint32_t g_etc1_to_bc7_m6_table198[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x200000,0x300000,0x300000,0x300000,0x300000,0x300000, -0x300000,0x5C0000,0x5C0000,0x5C0000,0xE000001,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x5C0000,0x5C0000,0x5C0000,0xE000001,0x5C0000,0x5C0000,0x5C0000,0xE000001,0xE000001,0x8200000,0x200000,0x200000,0x240000,0x280000,0x2C0000,0x2C0000,0x340000,0x240000,0x280000,0x400000,0x5C0000, -0x400000,0xA40000,0xA40000,0xA40000,0xA40000,0xF40000,0xF40000,0xF40000,0x1F00000,0x1F00000,0x50000001,0xF40000,0xF40000,0xF40000,0x1F00000,0x1F00000,0x50000001,0x1F00000,0x1F00000,0x50000001,0x50000001,0xF40000,0xF40000,0xF40000,0x1F00000,0x1F00000,0x50000001,0x1F00000,0x1F00000,0x50000001,0x50000001,0x1F00000, -0x1F00000,0x50000001,0x50000001,0x50000001,0x6BC0000,0xEAC0000,0xA40000,0x2DC0000,0x1140000,0x15C0000,0x1900000,0x15F40000,0x2CC0000,0xF40000,0x15C0000,0x50000001,0x15C0000,0x1340001,0x1CC0000,0x6BF80000,0x9A000000,0x1CC0000,0x6BF80000,0x9A000000,0x6BF80000,0x9A000000,0x9A000000,0x1CC0000,0x6BF80000,0x9A000000,0x6BF80000,0x9A000000, -0x9A000000,0x6BF80000,0x9A000000,0x9A000000,0x9A000000,0x1CC0000,0x6BF80000,0x9A000000,0x6BF80000,0x9A000000,0x9A000000,0x6BF80000,0x9A000000,0x9A000000,0x9A000000,0x6BF80000,0x9A000000,0x9A000000,0x9A000000,0x9A000000,0x1840000,0xD480000,0xD480000,0xBFC0000,0x57FC0000,0x83F00000,0x9A000000,0x9A000000,0x3A40000,0x2BFC0000,0x95DC0000,0x9A000000, -0x43FC0000,0x803A9D,0xFE5016F4,0xD6400EC4,0x9A400EC4,0xFE300F95,0xE4100082,0x9E1C0268,0xAA100AE9,0x900C020D,0x76100AE9,0xFE0025C5,0xCC000C2B,0x98000888,0xA2001052,0x860004F1,0x74000C51,0x7C0025C5,0x720014E2,0x66001742,0x540025C6,0xBC3A9B,0xAE001B0C,0x8C00119C,0x92001A61,0x80000C45,0x6E001179,0x74002BBE,0x6C001A56,0x60001B66,0x520028A3,0x17C3A9B, -0x5C0027DC,0x56002631,0x4A00308E,0x3E003A9B,0xFE501FDA,0xFE6C3029,0xFE6C3258,0xFE140B8E,0xE2000578,0xAE0004FD,0x9200026A,0x8000085D,0xFE241F6E,0xFC000ADA,0x86000FF4,0x60001B66,0x10C3A9B,0xA825C5,0xFE740D24,0xCE600884,0x9A600884,0xFE440CB5,0xDC200002,0x9E340104,0xA8200A69,0x8E1401D5,0x76200A69,0xF825C5,0xCC000C2B,0x98000888,0xA2001052,0x860004F1, 
-0x74000C51,0x1FC25C5,0x720014E2,0x66001742,0x540025C6,0xF825C5,0xCC000C2B,0x98000888,0xA2001052,0x860004F1,0x74000C51,0x1FC25C5,0x720014E2,0x66001742,0x540025C6,0x1FC25C5,0x720014E2,0x66001742,0x540025C6,0x540025C6,0xFC74181E,0xFE8C2035,0xF8A02004,0xFE140B2A,0xE2000578,0xAE0004FD,0x9200026A,0x8000085D,0xFE44186E,0xFC0009DA,0x86000FB4,0x66001742, -0x16425C5,0x400EC4,0x400EC4,0x400EC4,0x400EC4,0xE8100080,0xE8100080,0xE8100080,0x74100081,0x74100081,0x50100081,0x94000882,0x94000882,0x94000882,0x680001BD,0x680001BD,0x4C00006A,0x48000882,0x48000882,0x3E00040D,0x30000882,0x600EC3,0x600EC3,0x600EC3,0x580005D6,0x580005D6,0x460002BB,0x40000A6D,0x40000A6D,0x3A0005BA,0x2E00096B,0xC40EC3, -0xC40EC3,0x32000989,0x26000BEE,0x20000EC3,0xFE2005E1,0xF43809E0,0x400EC4,0xFA00024A,0xB20001E1,0x880001E1,0x8000015D,0x6000028A,0xFE000614,0xC40003F1,0x5C0008A6,0x3A0005BA,0x8C0EC3,0x600884,0x600884,0x600884,0x600884,0xD8200000,0xD8200000,0xD8200000,0x70200001,0x70200001,0x50200001,0x900882,0x900882,0x900882,0x680001BD,0x680001BD, -0x4C00006A,0x1240882,0x1240882,0x3E00040D,0x30000882,0x900882,0x900882,0x900882,0x680001BD,0x680001BD,0x4C00006A,0x1240882,0x1240882,0x3E00040D,0x30000882,0x1240882,0x1240882,0x3E00040D,0x30000882,0x30000882,0xFA340480,0xFE4C0568,0x600884,0xFA00024A,0xB20001E1,0x880001E1,0x8000015D,0x6000028A,0xFE0C04E2,0xD000034D,0xD00882,0x3E00040D, -0xD00882,0xEC0A69,0xFEB000B4,0xBEA40000,0x9AA40000,0x1600A69,0xE0180000,0x9A5C0000,0x33FC0A69,0x90000145,0x76000A69,0x1600A69,0xE0180000,0x9A5C0000,0x33FC0A69,0x90000145,0x76000A69,0x33FC0A69,0x90000145,0x76000A69,0x76000A69,0x1600A69,0xE0180000,0x9A5C0000,0x33FC0A69,0x90000145,0x76000A69,0x33FC0A69,0x90000145,0x76000A69,0x76000A69,0x33FC0A69, -0x90000145,0x76000A69,0x76000A69,0x76000A69,0xFEB40782,0xFC0A69,0xFAE40800,0xFE5803B5,0xF6000184,0xB6000132,0x94000048,0x8A000290,0xFE980745,0xFE2C0340,0xA200001D,0x76000A69,0x1F40A69,0x100080,0x100080,0x100080,0x100080,0x100080,0x100080,0x100080,0x100080,0x100080,0x100080,0x30000000,0x30000000,0x30000000,0x30000000,0x30000000, -0x30000000,0x16000001,0x16000001,0x16000001,0xE000001,0x180080,0x180080,0x180080,0x180080,0x180080,0x180080,0x1200002D,0x1200002D,0x1200002D,0xC000019,0x2C0080,0x2C0080,0x2C0080,0xA00004A,0x6000082,0xF8000000,0x100080,0x100080,0x6E000000,0x4C000000,0x3A000000,0x3A000000,0x26000000,0x60000028,0x42000014,0x1E000004,0x1200002D, -0x200080,}; -static const uint32_t g_etc1_to_bc7_m6_table199[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x300000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000, -0x2440000,0x8C0000,0x8C0000,0x8C0000,0x16000001,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x2440000,0x8C0000,0x8C0000,0x8C0000,0x16000001,0x8C0000,0x8C0000,0x8C0000,0x16000001,0x16000001,0x340000,0x300000,0x300000,0x380000,0x3C0000,0x400000,0x400000,0x500000,0x380000,0x3C0000,0x640000,0x8C0000, -0x640000,0xB40000,0xB40000,0xB40000,0xB40000,0x10C0000,0x10C0000,0x10C0000,0xBF80000,0xBF80000,0x58000001,0x10C0000,0x10C0000,0x10C0000,0xBF80000,0xBF80000,0x58000001,0xBF80000,0xBF80000,0x58000001,0x58000001,0x10C0000,0x10C0000,0x10C0000,0xBF80000,0xBF80000,0x58000001,0xBF80000,0xBF80000,0x58000001,0x58000001,0xBF80000, 
-0xBF80000,0x58000001,0x58000001,0x58000001,0xD00000,0xC00000,0xB40000,0xF40000,0x32C0000,0x17C0000,0x1B80000,0x1FF80000,0x2E00000,0x10C0000,0x17C0000,0x58000001,0x17C0000,0x1440001,0x1E40000,0x77F80000,0xA2000000,0x1E40000,0x77F80000,0xA2000000,0x77F80000,0xA2000000,0xA2000000,0x1E40000,0x77F80000,0xA2000000,0x77F80000,0xA2000000, -0xA2000000,0x77F80000,0xA2000000,0xA2000000,0xA2000000,0x1E40000,0x77F80000,0xA2000000,0x77F80000,0xA2000000,0xA2000000,0x77F80000,0xA2000000,0xA2000000,0xA2000000,0x77F80000,0xA2000000,0xA2000000,0xA2000000,0xA2000000,0x1980000,0x15C0000,0x15C0000,0x1DFC0000,0x65FC0000,0x8DF00000,0xA2000000,0xA2000000,0x1BC0000,0x3BFC0000,0x9DEC0000,0xA2000000, -0x53FC0000,0x8C3EC1,0xFE5C1AB8,0xDE4C10F9,0xA24C10F8,0xFE3C1205,0xEC180130,0xA8240370,0xB4180B89,0x98100285,0x7E180B89,0xFE0C2621,0xE2000B28,0xA204088E,0xAE000F3A,0x92000391,0x7A000B81,0x880025C5,0x7E0013B2,0x6C00162E,0x5C0025C6,0xCC3EBF,0xBA001C84,0x9800130C,0xA2001ACB,0x8C000C25,0x74001185,0x80002CFE,0x76001A25,0x66001B26,0x5800292F,0x1A03EBF, -0x66002A4B,0x5C0027D5,0x500032AA,0x44003EBF,0xFE582361,0xF8803495,0xFA843661,0xFE180DAD,0xF6000408,0xBC000398,0x9E000164,0x8A0006D0,0xFE342275,0xFE000B6D,0x94000EAC,0x66001B26,0x1243EBF,0xB825C5,0xFE8C0DD4,0xD6700884,0xA2700884,0xFE500D49,0xE4300002,0xA6440104,0xB0300A69,0x962401D5,0x7E300A69,0x11025C5,0xE2000B28,0xA2080884,0xAE000F3A,0x92000391, -0x7A000B81,0xDFC25C5,0x7E0013B2,0x6C00162E,0x5C0025C6,0x11025C5,0xE2000B28,0xA2080884,0xAE000F3A,0x92000391,0x7A000B81,0xDFC25C5,0x7E0013B2,0x6C00162E,0x5C0025C6,0xDFC25C5,0x7E0013B2,0x6C00162E,0x5C0025C6,0x5C0025C6,0xFE78188A,0xFCA82039,0xFEAC2018,0xFE300C02,0xF6000408,0xBC000398,0x9E000164,0x8A0006D0,0xFE5C18B6,0xFE040A46,0x94000E5B,0x6C00162E, -0x18825C5,0x4C10F8,0x4C10F8,0x4C10F8,0x4C10F8,0xF8180120,0xF8180120,0xF8180120,0x7E180121,0x7E180121,0x58180121,0xAC000882,0xAC000882,0xAC000882,0x7A000131,0x7A000131,0x5800001A,0x54000882,0x54000882,0x4A000385,0x38000882,0x7010F8,0x7010F8,0x7010F8,0x62000651,0x62000651,0x52000313,0x4C000B25,0x4C000B25,0x420005AE,0x340009C3,0xE410F8, -0xE410F8,0x38000AA9,0x2C000D26,0x240010FB,0xFE2C0756,0xFA440BCC,0x4C10F8,0xFE08025A,0xC6000151,0x9A000161,0x8C0000CD,0x6A0001E2,0xFE000734,0xDA000398,0x660008AE,0x420005AE,0xA010F8,0x700884,0x700884,0x700884,0x700884,0xE0300000,0xE0300000,0xE0300000,0x78300001,0x78300001,0x58300001,0xA80882,0xA80882,0xA80882,0x7A000131,0x7A000131, -0x5800001A,0x1580882,0x1580882,0x4A000385,0x38000882,0xA80882,0xA80882,0xA80882,0x7A000131,0x7A000131,0x5800001A,0x1580882,0x1580882,0x4A000385,0x38000882,0x1580882,0x1580882,0x4A000385,0x38000882,0x38000882,0xFE3C04A0,0xFA64057D,0x700884,0xFE08024A,0xC6000151,0x9A000161,0x8C0000CD,0x6A0001E2,0xFE140514,0xEC0002D0,0xF00882,0x4A000385, -0xF00882,0xFC0A69,0xFEC400E9,0xC6B40000,0xA2B40000,0x1780A69,0xE8280000,0xA26C0000,0x3FFC0A69,0x9A000104,0x7E000A69,0x1780A69,0xE8280000,0xA26C0000,0x3FFC0A69,0x9A000104,0x7E000A69,0x3FFC0A69,0x9A000104,0x7E000A69,0x7E000A69,0x1780A69,0xE8280000,0xA26C0000,0x3FFC0A69,0x9A000104,0x7E000A69,0x3FFC0A69,0x9A000104,0x7E000A69,0x7E000A69,0x3FFC0A69, -0x9A000104,0x7E000A69,0x7E000A69,0x7E000A69,0xFAC807C1,0x10C0A69,0xFEEC0820,0xFE6C03FA,0xFA040152,0xBE0000F2,0x9E000020,0x9400022D,0xFEAC0784,0xFE4C037A,0xAC000005,0x7E000A69,0xDFC0A69,0x180120,0x180120,0x180120,0x180120,0x180120,0x180120,0x180120,0x180120,0x180120,0x180120,0x48000000,0x48000000,0x48000000,0x48000000,0x48000000, 
-0x48000000,0x22000000,0x22000000,0x22000000,0x16000001,0x240120,0x240120,0x240120,0x240120,0x240120,0x240120,0x1E00006D,0x1E00006D,0x1E00006D,0x1400003A,0x440120,0x440120,0x440120,0x100000AA,0xA000122,0xFC080020,0x180120,0x180120,0xA8000000,0x74000000,0x58000000,0x58000000,0x3A000000,0x84000059,0x6400002D,0x2C000009,0x1E00006D, -0x300120,}; -static const uint32_t g_etc1_to_bc7_m6_table200[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x400001,0x400001,0x400001,0x400001,0x400001,0x400001,0x400001,0x400001,0x400001,0x400001,0x600000,0x600000,0x600000,0x600000,0x600000, -0x600000,0xC40000,0xC40000,0xC40000,0x20000000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0xC40000,0xC40000,0xC40000,0x20000000,0xC40000,0xC40000,0xC40000,0x20000000,0x20000000,0xA440000,0x400001,0x400001,0x4C0000,0x4500000,0x580000,0x580000,0x26C0000,0x4C0000,0x4500000,0x8C0000,0xC40000, -0x8C0000,0xC40001,0xC40001,0xC40001,0xC40001,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x62000000,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x62000000,0x17FC0000,0x17FC0000,0x62000000,0x62000000,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x62000000,0x17FC0000,0x17FC0000,0x62000000,0x62000000,0x17FC0000, -0x17FC0000,0x62000000,0x62000000,0x62000000,0x4E40000,0xD40000,0xC40001,0x10C0000,0x14C0000,0x1A40000,0x1E40000,0x2BF80000,0xF80000,0x3240000,0x1A40000,0x62000000,0x1A40000,0x1580000,0x3FC0000,0x85F80000,0xAA000001,0x3FC0000,0x85F80000,0xAA000001,0x85F80000,0xAA000001,0xAA000001,0x3FC0000,0x85F80000,0xAA000001,0x85F80000,0xAA000001, -0xAA000001,0x85F80000,0xAA000001,0xAA000001,0xAA000001,0x3FC0000,0x85F80000,0xAA000001,0x85F80000,0xAA000001,0xAA000001,0x85F80000,0xAA000001,0xAA000001,0xAA000001,0x85F80000,0xAA000001,0xAA000001,0xAA000001,0xAA000001,0x1B00000,0xF6C0000,0xF6C0000,0x33FC0000,0x75F80000,0x97F80000,0xAA000001,0xAA000001,0x1D40000,0x4FFC0000,0xA7E00000,0xAA000001, -0x63FC0000,0x9843DA,0xFE741F87,0xEA5813DA,0xAA5813DB,0xFE441546,0xF824023F,0xB43004F3,0xC0200C8A,0xA020035E,0x86200C8A,0xFE1C2759,0xEE000A1B,0xAC0C08D9,0xC0000E29,0x9E000252,0x86000AE2,0x960025C5,0x8A001275,0x7800150D,0x640025C5,0xE043DA,0xCC001E91,0xA200151E,0xA8001BBA,0x92000C52,0x800011F2,0x8C002E81,0x80001A11,0x72001AE5,0x620029D5,0x1CC43DA, -0x6C002D36,0x66002A52,0x56003551,0x4A0043DE,0xFE6427D8,0xFE8C3942,0xFE8C3B6A,0xFE301079,0xFC000338,0xC8000260,0xA6000092,0x96000559,0xFE4426F1,0xFE040DC1,0x9E000D7F,0x72001AE5,0x14043DA,0xCC25C6,0xFE980EA3,0xDE840883,0xAA840883,0xFE680DF2,0xEE400003,0xAE540103,0xB8400A6A,0xA03801D6,0x86400A6A,0x12C25C5,0xEE000A1B,0xAA1C0883,0xC0000E29,0x9E000252, -0x86000AE2,0x1BF825C5,0x8A001275,0x7800150D,0x640025C5,0x12C25C5,0xEE000A1B,0xAA1C0883,0xC0000E29,0x9E000252,0x86000AE2,0x1BF825C5,0x8A001275,0x7800150D,0x640025C5,0x1BF825C5,0x8A001275,0x7800150D,0x640025C5,0x640025C5,0xFE941919,0xF6BC20B2,0xFAC42082,0xFE440D24,0xFC000338,0xC8000260,0xA6000092,0x96000559,0xFE781975,0xFE240B83,0x9E000D1B,0x7800150D, -0x1AC25C5,0x5813DA,0x5813DA,0x5813DA,0x5813DA,0xFE24022E,0xFE24022E,0xFE24022E,0x8A200221,0x8A200221,0x62200221,0xC8000882,0xC8000882,0xC8000882,0x860000B9,0x860000B9,0x62000001,0x60000884,0x60000884,0x500002FD,0x40000884,0x28013DA,0x28013DA,0x28013DA,0x7400074D,0x7400074D,0x580003BD,0x58000C13,0x58000C13,0x4C0005BE,0x40000A3D,0x10813DA, 
-0x10813DA,0x3E000C45,0x32000EC8,0x2A0013DD,0xFC380952,0xFE4C0E5E,0x5813DA,0xFE100315,0xDE0000D0,0xAE0000CD,0xA2000068,0x78000164,0xFE1408DB,0xFC000344,0x7C0008C2,0x4C0005BE,0xB813DA,0x840882,0x840882,0x840882,0x840882,0xE6440001,0xE6440001,0xE6440001,0x82400001,0x82400001,0x62400001,0xC40882,0xC40882,0xC40882,0x860000B9,0x860000B9, -0x62000001,0x18C0882,0x18C0882,0x500002FD,0x40000884,0xC40882,0xC40882,0xC40882,0x860000B9,0x860000B9,0x62000001,0x18C0882,0x18C0882,0x500002FD,0x40000884,0x18C0882,0x18C0882,0x500002FD,0x40000884,0x40000884,0xFE5804B1,0xF47805B2,0x840882,0xFE240288,0xDE0000D0,0xAE0000CD,0xA2000068,0x78000164,0xFE340515,0xFC000244,0x1180882,0x500002FD, -0x1180882,0x1100A69,0xFEDC013D,0xCEC80001,0xAAC40002,0x1900A69,0xF03C0000,0xAA800001,0x4DFC0A69,0xA40000C1,0x86000A69,0x1900A69,0xF03C0000,0xAA800001,0x4DFC0A69,0xA40000C1,0x86000A69,0x4DFC0A69,0xA40000C1,0x86000A69,0x86000A69,0x1900A69,0xF03C0000,0xAA800001,0x4DFC0A69,0xA40000C1,0x86000A69,0x4DFC0A69,0xA40000C1,0x86000A69,0x86000A69,0x4DFC0A69, -0xA40000C1,0x86000A69,0x86000A69,0x86000A69,0xFED007FD,0x1200A69,0xFD080841,0xFE940451,0xFE100172,0xD00000A9,0xA800000A,0x9C0001E1,0xF6CC07C1,0xFE6403E8,0xB6080000,0x86000A69,0x1FF80A69,0x200221,0x200221,0x200221,0x200221,0x200221,0x200221,0x200221,0x200221,0x200221,0x200221,0x64000000,0x64000000,0x64000000,0x64000000,0x64000000, -0x64000000,0x30000000,0x30000000,0x30000000,0x20000000,0x300221,0x300221,0x300221,0x300221,0x300221,0x300221,0x280000C2,0x280000C2,0x280000C2,0x1E000068,0x5C0221,0x5C0221,0x5C0221,0x16000145,0x10000221,0xFE0C009D,0x200221,0x200221,0xE8000000,0xA0000000,0x7A000000,0x7A000000,0x50000000,0xC40000A9,0x96000055,0x3E000010,0x280000C2, -0x400221,}; -static const uint32_t g_etc1_to_bc7_m6_table201[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x500001,0x500001,0x500001,0x500001,0x500001,0x500001,0x500001,0x500001,0x500001,0x500001,0x780000,0x780000,0x780000,0x780000,0x780000, -0x780000,0xF40000,0xF40000,0xF40000,0x28000000,0x780000,0x780000,0x780000,0x780000,0x780000,0x780000,0xF40000,0xF40000,0xF40000,0x28000000,0xF40000,0xF40000,0xF40000,0x28000000,0x28000000,0x580000,0x500001,0x500001,0x600000,0x4640000,0x700000,0x700000,0x880000,0x600000,0x4640000,0xAC0000,0xF40000, -0xAC0000,0xD40001,0xD40001,0xD40001,0xD40001,0x33C0000,0x33C0000,0x33C0000,0x23FC0000,0x23FC0000,0x6A000000,0x33C0000,0x33C0000,0x33C0000,0x23FC0000,0x23FC0000,0x6A000000,0x23FC0000,0x23FC0000,0x6A000000,0x6A000000,0x33C0000,0x33C0000,0x33C0000,0x23FC0000,0x23FC0000,0x6A000000,0x23FC0000,0x23FC0000,0x6A000000,0x6A000000,0x23FC0000, -0x23FC0000,0x6A000000,0x6A000000,0x6A000000,0xF80000,0xE40000,0xD40001,0x1240000,0x1680000,0x1C80000,0x7F80000,0x37F40000,0x10C0000,0x33C0000,0x1C80000,0x6A000000,0x1C80000,0x1680000,0x1BFC0000,0x91F80000,0xB2000001,0x1BFC0000,0x91F80000,0xB2000001,0x91F80000,0xB2000001,0xB2000001,0x1BFC0000,0x91F80000,0xB2000001,0x91F80000,0xB2000001, -0xB2000001,0x91F80000,0xB2000001,0xB2000001,0xB2000001,0x1BFC0000,0x91F80000,0xB2000001,0x91F80000,0xB2000001,0xB2000001,0x91F80000,0xB2000001,0xB2000001,0xB2000001,0x91F80000,0xB2000001,0xB2000001,0xB2000001,0xB2000001,0x1C40000,0x1800000,0x1800000,0x47FC0000,0x81FC0000,0xA1FC0000,0xB2000001,0xB2000001,0x3E80000,0x61FC0000,0xAFF00000,0xB2000001, 
-0x73FC0000,0xA448CA,0xFE802443,0xF26416C6,0xB26416C7,0xFE5018EA,0xFE300393,0xBA3C069F,0xCA280DB2,0xA824045E,0x8E280DB2,0xFE30290A,0xFA00097B,0xB6140951,0xCC000D49,0xA800016E,0x8E000A8D,0xA20025C6,0x9000116D,0x7E001419,0x6C0025C5,0xF448CA,0xE20020BF,0xAE00178E,0xB4001CE2,0xA2000CB3,0x86001292,0x98003001,0x8A001A29,0x7A001AD5,0x68002A81,0x1F048CA, -0x7800301E,0x6C002C86,0x5C0037E9,0x500048CE,0xFE782C61,0xFE8C3E62,0xF8A040BB,0xFE401395,0xFE0403A8,0xD6000182,0xB0000031,0xA0000422,0xFE542B17,0xFE1810AE,0xA6000CB9,0x7A001AD5,0x15C48CA,0xDC25C6,0xFEB00F63,0xE6940883,0xB2940883,0xFE800E9A,0xF6500003,0xB6640103,0xC0500A6A,0xA84801D6,0x8E500A6A,0x14425C5,0xFA00097B,0xB22C0883,0xCC000D49,0xA800016E, -0x8E000A8D,0x27F825C5,0x9000116D,0x7E001419,0x6C0025C5,0x14425C5,0xFA00097B,0xB22C0883,0xCC000D49,0xA800016E,0x8E000A8D,0x27F825C5,0x9000116D,0x7E001419,0x6C0025C5,0x27F825C5,0x9000116D,0x7E001419,0x6C0025C5,0x6C0025C5,0xFEA019C8,0xFECC20B2,0xFECC20B2,0xFE5C0DFE,0xFE100393,0xD6000182,0xB0000031,0xA0000422,0xFE8819FE,0xFE400C4D,0xAC000C4E,0x7E001419, -0x1D025C5,0x6416C6,0x6416C6,0x6416C6,0x6416C6,0xFE300392,0xFE300392,0xFE300392,0x94280349,0x94280349,0x6A280349,0xE0000882,0xE0000882,0xE0000882,0x98000061,0x98000061,0x6C040018,0x6C000884,0x6C000884,0x5C00027D,0x48000884,0x9016C5,0x9016C5,0x9016C5,0x80000875,0x80000875,0x62000491,0x62000CE4,0x62000CE4,0x540005F4,0x46000AA5,0x12416C5, -0x12416C5,0x44000E01,0x3E001058,0x300016C5,0xFE3C0B76,0xF458115D,0x6416C6,0xFE18042D,0xFC000074,0xC0000074,0xAE000022,0x840000FA,0xFE200AF6,0xFE00038A,0x860008CA,0x540005F4,0xD016C5,0x940882,0x940882,0x940882,0x940882,0xEE540001,0xEE540001,0xEE540001,0x8A500001,0x8A500001,0x6A500001,0xDC0882,0xDC0882,0xDC0882,0x98000061,0x98000061, -0x6A100001,0x1BC0882,0x1BC0882,0x5C00027D,0x48000884,0xDC0882,0xDC0882,0xDC0882,0x98000061,0x98000061,0x6A100001,0x1BC0882,0x1BC0882,0x5C00027D,0x48000884,0x1BC0882,0x1BC0882,0x5C00027D,0x48000884,0x48000884,0xFA6C04E2,0xFC8805B2,0x940882,0xFE3402B1,0xFC000074,0xC0000074,0xAE000022,0x840000FA,0xFA48054A,0xFC140265,0x1380882,0x5C00027D, -0x1380882,0x1200A69,0xFEE80195,0xD6D80001,0xB2D40002,0x1A80A69,0xF84C0000,0xB2900001,0x59FC0A69,0xAC000082,0x8E000A69,0x1A80A69,0xF84C0000,0xB2900001,0x59FC0A69,0xAC000082,0x8E000A69,0x59FC0A69,0xAC000082,0x8E000A69,0x8E000A69,0x1A80A69,0xF84C0000,0xB2900001,0x59FC0A69,0xAC000082,0x8E000A69,0x59FC0A69,0xAC000082,0x8E000A69,0x8E000A69,0x59FC0A69, -0xAC000082,0x8E000A69,0x8E000A69,0x8E000A69,0xFAF00802,0x1300A69,0xFF0C087D,0xFEAC04B1,0xFE3C01C4,0xDA00006A,0xB2040001,0xA800019A,0xFEDC07C1,0xFE880424,0xBE180000,0x8E000A69,0x2DFC0A69,0x280349,0x280349,0x280349,0x280349,0x280349,0x280349,0x280349,0x280349,0x280349,0x280349,0x7C000000,0x7C000000,0x7C000000,0x7C000000,0x7C000000, -0x7C000000,0x3C000000,0x3C000000,0x3C000000,0x28000000,0x3C0349,0x3C0349,0x3C0349,0x3C0349,0x3C0349,0x3C0349,0x2E000132,0x2E000132,0x2E000132,0x2200009D,0x740349,0x740349,0x740349,0x1C0001F9,0x14000349,0xF4180154,0x280349,0x280349,0xFE04000D,0xC8000000,0x98000000,0x98000000,0x64000000,0xF6000105,0xB4000089,0x46000019,0x2E000132, -0x540349,}; -static const uint32_t g_etc1_to_bc7_m6_table202[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x600001,0x900000,0x900000,0x900000,0x900000,0x900000, 
-0x900000,0x1240000,0x1240000,0x1240000,0x30000000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x1240000,0x1240000,0x1240000,0x30000000,0x1240000,0x1240000,0x1240000,0x30000000,0x30000000,0x680000,0x600001,0x600001,0x2700000,0x4780000,0x840000,0x840000,0xA40000,0x2700000,0x4780000,0xD00000,0x1240000, -0xD00000,0xE40001,0xE40001,0xE40001,0xE40001,0x1540000,0x1540000,0x1540000,0x2FFC0000,0x2FFC0000,0x72000000,0x1540000,0x1540000,0x1540000,0x2FFC0000,0x2FFC0000,0x72000000,0x2FFC0000,0x2FFC0000,0x72000000,0x72000000,0x1540000,0x1540000,0x1540000,0x2FFC0000,0x2FFC0000,0x72000000,0x2FFC0000,0x2FFC0000,0x72000000,0x72000000,0x2FFC0000, -0x2FFC0000,0x72000000,0x72000000,0x72000000,0x10C0000,0x2F40000,0xE40001,0x1380000,0x3800000,0x1E80000,0x15F80000,0x41F80000,0x1200000,0x1540000,0x1E80000,0x72000000,0x1E80000,0x1780000,0x35FC0000,0x9DF40000,0xBA000001,0x35FC0000,0x9DF40000,0xBA000001,0x9DF40000,0xBA000001,0xBA000001,0x35FC0000,0x9DF40000,0xBA000001,0x9DF40000,0xBA000001, -0xBA000001,0x9DF40000,0xBA000001,0xBA000001,0xBA000001,0x35FC0000,0x9DF40000,0xBA000001,0x9DF40000,0xBA000001,0xBA000001,0x9DF40000,0xBA000001,0xBA000001,0xBA000001,0x9DF40000,0xBA000001,0xBA000001,0xBA000001,0xBA000001,0x1D80000,0x1900000,0x1900000,0x5BFC0000,0x8FFC0000,0xABFC0000,0xBA000001,0xBA000001,0x5FC0000,0x71FC0000,0xB9C40000,0xBA000001, -0x81FC0000,0xB04E1A,0xFE8C2977,0xFC6C1A06,0xBA6C1A07,0xFE5C1D1E,0xFE3C0597,0xC24408A6,0xD4300F1A,0xB22C059E,0x96300F1A,0xFE3C2B5E,0xFE080993,0xC01C0A01,0xD8000C89,0xB40000C6,0x96000A6A,0xAE0025C6,0x9C001065,0x8A001331,0x740025C5,0x1044E1A,0xE800231B,0xB4001A6E,0xC0001E4A,0xA8000D5F,0x8C001392,0xA200315F,0x92001A55,0x82001ADB,0x6E002B4D,0x7FC4E1A, -0x7E003356,0x72002F0A,0x66003AC6,0x56004E1E,0xFE803152,0xFAA443AA,0xFEAC45DF,0xFE441768,0xFE100503,0xE20000CE,0xBC00000B,0xAA00032A,0xFE5C2FFA,0xFE241464,0xB6000BBD,0x82001ADB,0x1744E1A,0xEC25C6,0xFEC41026,0xEEA40883,0xBAA40883,0xFE980F62,0xFE600003,0xBE740103,0xC8600A6A,0xB05801D6,0x96600A6A,0x15C25C5,0xFE080983,0xBA3C0883,0xD8000C89,0xB40000C6, -0x96000A6A,0x33F825C5,0x9C001065,0x8A001331,0x740025C5,0x15C25C5,0xFE080983,0xBA3C0883,0xD8000C89,0xB40000C6,0x96000A6A,0x33F825C5,0x9C001065,0x8A001331,0x740025C5,0x33F825C5,0x9C001065,0x8A001331,0x740025C5,0x740025C5,0xFEB41A2D,0xF8E0212E,0xFAE42103,0xFE780F0E,0xFE24046B,0xE20000CE,0xBC00000B,0xAA00032A,0xFE981A99,0xFE480D53,0xB6000B44,0x8A001331, -0x1F025C5,0x6C1A06,0x6C1A06,0x6C1A06,0x6C1A06,0xFE3C0566,0xFE3C0566,0xFE3C0566,0x9E3004B1,0x9E3004B1,0x723004B1,0xF8000882,0xF8000882,0xF8000882,0xA8000025,0xA8000025,0x76080051,0x78000884,0x78000884,0x66000220,0x50000884,0xA41A05,0xA41A05,0xA41A05,0x8C0009DD,0x8C0009DD,0x680005C9,0x6E000DF4,0x6E000DF4,0x5E000632,0x4C000B25,0x14C1A05, -0x14C1A05,0x50001001,0x44001218,0x36001A05,0xFE4C0DF1,0xFA641455,0x6C1A06,0xFE2C05C9,0xFE040078,0xD200003A,0xC0000004,0x960000A0,0xFC300D65,0xFE0004CA,0x960008DB,0x5E000632,0xE81A05,0xA40882,0xA40882,0xA40882,0xA40882,0xF6640001,0xF6640001,0xF6640001,0x92600001,0x92600001,0x72600001,0xF40882,0xF40882,0xF40882,0xA8000025,0xA8000025, -0x72200001,0x1F00882,0x1F00882,0x66000220,0x50000884,0xF40882,0xF40882,0xF40882,0xA8000025,0xA8000025,0x72200001,0x1F00882,0x1F00882,0x66000220,0x50000884,0x1F00882,0x1F00882,0x66000220,0x50000884,0x50000884,0xFE740502,0xF49805E9,0xA40882,0xFE4402E4,0xFE040074,0xD200003A,0xC0000004,0x960000A0,0xFE50057A,0xFE24028A,0x15C0882,0x66000220, 
-0x15C0882,0x1300A69,0xFF0001ED,0xDEE80001,0xBAE40002,0x1C00A69,0xFE600002,0xBAA00001,0x65F80A69,0xB6000059,0x96000A69,0x1C00A69,0xFE600002,0xBAA00001,0x65F80A69,0xB6000059,0x96000A69,0x65F80A69,0xB6000059,0x96000A69,0x96000A69,0x1C00A69,0xFE600002,0xBAA00001,0x65F80A69,0xB6000059,0x96000A69,0x65F80A69,0xB6000059,0x96000A69,0x96000A69,0x65F80A69, -0xB6000059,0x96000A69,0x96000A69,0x96000A69,0xFEF80832,0x1440A69,0xFD280882,0xFEC004EA,0xFE500220,0xE6000050,0xBA140001,0xB2000151,0xFEF00800,0xFEA00488,0xC6280000,0x96000A69,0x3DF80A69,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x3004B1,0x94000000,0x94000000,0x94000000,0x94000000,0x94000000, -0x94000000,0x48000000,0x48000000,0x48000000,0x30000000,0x4804B1,0x4804B1,0x4804B1,0x4804B1,0x4804B1,0x4804B1,0x3A0001BA,0x3A0001BA,0x3A0001BA,0x2E0000E5,0x8C04B1,0x8C04B1,0x8C04B1,0x200002E4,0x180004B1,0xF8200244,0x3004B1,0x3004B1,0xFC0C0064,0xEE000000,0xB6000000,0xB6000000,0x78000000,0xF60001A5,0xD60000C2,0x56000022,0x3A0001BA, -0x6404B1,}; -static const uint32_t g_etc1_to_bc7_m6_table203[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0x700001,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000, -0xA80000,0x1580000,0x1580000,0x1580000,0x38000000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0xA80000,0x1580000,0x1580000,0x1580000,0x38000000,0x1580000,0x1580000,0x1580000,0x38000000,0x38000000,0x4780000,0x700001,0x700001,0x840000,0x48C0000,0x2980000,0x2980000,0xC00000,0x840000,0x48C0000,0xF00000,0x1580000, -0xF00000,0xF40001,0xF40001,0xF40001,0xF40001,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x3BFC0000,0x3BFC0000,0x7A000000,0x7A000000,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x3BFC0000,0x3BFC0000,0x7A000000,0x7A000000,0x3BFC0000, -0x3BFC0000,0x7A000000,0x7A000000,0x7A000000,0x51C0000,0xB040000,0xF40001,0x34C0000,0x19C0000,0x7FC0000,0x21FC0000,0x4BFC0000,0x1340000,0x16C0000,0x7FC0000,0x7A000000,0x7FC0000,0x1880000,0x4DFC0000,0xA7FC0000,0xC2000001,0x4DFC0000,0xA7FC0000,0xC2000001,0xA7FC0000,0xC2000001,0xC2000001,0x4DFC0000,0xA7FC0000,0xC2000001,0xA7FC0000,0xC2000001, -0xC2000001,0xA7FC0000,0xC2000001,0xC2000001,0xC2000001,0x4DFC0000,0xA7FC0000,0xC2000001,0xA7FC0000,0xC2000001,0xC2000001,0xA7FC0000,0xC2000001,0xC2000001,0xC2000001,0xA7FC0000,0xC2000001,0xC2000001,0xC2000001,0xC2000001,0x1EC0000,0x9A00000,0x9A00000,0x6FFC0000,0x9DF80000,0xB5FC0000,0xC2000001,0xC2000001,0x23FC0000,0x83FC0000,0xC1D40000,0xC2000001, -0x91FC0000,0xBC53CA,0xFE982F23,0xFE781DB3,0xC2781D9B,0xFE6821E2,0xFE480853,0xCC4C0AE2,0xDE3810C2,0xBA34072F,0x9E3810C2,0xFE442E39,0xFE140A9F,0xCA240AE9,0xE2000BDE,0xBE000056,0x9E040A76,0xBA0025C6,0xA6000F93,0x9000124D,0x7C0025C5,0x11853CA,0xF400260B,0xC0001DBE,0xCC001FF2,0xB4000E7F,0x980014B2,0xA8003317,0x9E001AAD,0x8C001AFD,0x7A002C15,0x11F853CA, -0x840036F6,0x780031DE,0x6C003DB6,0x5C0053CE,0xFE8C36AE,0xFEAC491A,0xFEAC4BCF,0xFE541B8E,0xFE24073F,0xEE000058,0xC6080023,0xB6000243,0xFE703553,0xFE301879,0xBE000B25,0x8C001AFD,0x19053CA,0xFC25C6,0xFED010EE,0xF6B40883,0xC2B40883,0xFEA41022,0xFE780023,0xC6840103,0xD0700A6A,0xB86801D6,0x9E700A6A,0x17425C5,0xFE200A0B,0xC24C0883,0xE2000BDE,0xBE000056, 
-0x9E0C0A6A,0x3FF825C5,0xA6000F93,0x9000124D,0x7C0025C5,0x17425C5,0xFE200A0B,0xC24C0883,0xE2000BDE,0xBE000056,0x9E0C0A6A,0x3FF825C5,0xA6000F93,0x9000124D,0x7C0025C5,0x3FF825C5,0xA6000F93,0x9000124D,0x7C0025C5,0x7C0025C5,0xFED01ACD,0xFEEC2132,0xFEEC213B,0xFE940FE5,0xFE38056B,0xEE000058,0xC410000B,0xB6000243,0xFEB41B45,0xFE680E81,0xBE000A95,0x9000124D, -0xBFC25C5,0x781D9A,0x781D9A,0x781D9A,0x781D9A,0xFE440795,0xFE440795,0xFE440795,0xA8380659,0xA8380659,0x7A380659,0xFE0C08AE,0xFE0C08AE,0xFE0C08AE,0xB4000005,0xB4000005,0x7E0C00AD,0x84000884,0x84000884,0x6C0001C4,0x58000884,0x2B01D9A,0x2B01D9A,0x2B01D9A,0x98000B85,0x98000B85,0x74000731,0x74000F18,0x74000F18,0x6800068C,0x52000BBD,0x1681D9A, -0x1681D9A,0x56001235,0x4A001408,0x3A001D9D,0xFE5810F1,0xFE6C17AD,0x781D9A,0xFE3407B5,0xFE140116,0xE200000D,0xD0000002,0xA0000059,0xFE341005,0xFE100696,0xA60008E8,0x6800068C,0xFC1D9A,0xB40882,0xB40882,0xB40882,0xB40882,0xFE740001,0xFE740001,0xFE740001,0x9A700001,0x9A700001,0x7A700001,0x10C0882,0x10C0882,0x10C0882,0xB4000005,0xB4000005, -0x7A300001,0xBF80882,0xBF80882,0x6C0001C4,0x58000884,0x10C0882,0x10C0882,0x10C0882,0xB4000005,0xB4000005,0x7A300001,0xBF80882,0xBF80882,0x6C0001C4,0x58000884,0xBF80882,0xBF80882,0x6C0001C4,0x58000884,0x58000884,0xFE900515,0xFCA805E9,0xB40882,0xFE5C02FD,0xFE1C009D,0xE200000D,0xCC080000,0xA0000059,0xFA70057D,0xFA4002D2,0x17C0882,0x6C0001C4, -0x17C0882,0x1400A69,0xFF0C025D,0xE6F80001,0xC2F40002,0x1D80A69,0xFE7C0014,0xC2B00001,0x71F80A69,0xBE000032,0x9E000A69,0x1D80A69,0xFE7C0014,0xC2B00001,0x71F80A69,0xBE000032,0x9E000A69,0x71F80A69,0xBE000032,0x9E000A69,0x9E000A69,0x1D80A69,0xFE7C0014,0xC2B00001,0x71F80A69,0xBE000032,0x9E000A69,0x71F80A69,0xBE000032,0x9E000A69,0x9E000A69,0x71F80A69, -0xBE000032,0x9E000A69,0x9E000A69,0x9E000A69,0xFF140845,0x1540A69,0xF53808C5,0xFED80550,0xFE68028A,0xF2000028,0xC2240001,0xBA000115,0xF7080841,0xFEC004E2,0xCE380000,0x9E000A69,0x4BFC0A69,0x380659,0x380659,0x380659,0x380659,0x380659,0x380659,0x380659,0x380659,0x380659,0x380659,0xAC000000,0xAC000000,0xAC000000,0xAC000000,0xAC000000, -0xAC000000,0x54000000,0x54000000,0x54000000,0x38000000,0x540659,0x540659,0x540659,0x540659,0x540659,0x540659,0x40000262,0x40000262,0x40000262,0x34000131,0xA40659,0xA40659,0xA40659,0x260003E8,0x1C000659,0xFC280374,0x380659,0x380659,0xFE100104,0xFA040014,0xD4000000,0xD4000000,0x8C000000,0xFA0802AD,0xF4000112,0x66000028,0x40000262, -0x740659,}; -static const uint32_t g_etc1_to_bc7_m6_table204[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0x840000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000, -0xC40000,0x18C0000,0x18C0000,0x18C0000,0x40000001,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0xC40000,0x18C0000,0x18C0000,0x18C0000,0x40000001,0x18C0000,0x18C0000,0x18C0000,0x40000001,0x40000001,0x8C0000,0x840000,0x840000,0x2980000,0xA40000,0xB40000,0xB40000,0xDC0000,0x2980000,0xA40000,0x1180000,0x18C0000, -0x1180000,0x1080000,0x1080000,0x1080000,0x1080000,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x82000001,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x82000001,0x49F80000,0x49F80000,0x82000001,0x82000001,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x82000001,0x49F80000,0x49F80000,0x82000001,0x82000001,0x49F80000, 
-0x49F80000,0x82000001,0x82000001,0x82000001,0x1340000,0x5180000,0x1080000,0x1680000,0x1BC0000,0x19FC0000,0x31FC0000,0x59F40000,0x14C0000,0x1880000,0x19FC0000,0x82000001,0x19FC0000,0x1980001,0x69FC0000,0xB5FC0000,0xCC000000,0x69FC0000,0xB5FC0000,0xCC000000,0xB5FC0000,0xCC000000,0xCC000000,0x69FC0000,0xB5FC0000,0xCC000000,0xB5FC0000,0xCC000000, -0xCC000000,0xB5FC0000,0xCC000000,0xCC000000,0xCC000000,0x69FC0000,0xB5FC0000,0xCC000000,0xB5FC0000,0xCC000000,0xCC000000,0xB5FC0000,0xCC000000,0xCC000000,0xCC000000,0xB5FC0000,0xCC000000,0xCC000000,0xCC000000,0xCC000000,0xBFC0000,0x3B40000,0x3B40000,0x85FC0000,0xABFC0000,0xC1F40000,0xCC000000,0xCC000000,0x45FC0000,0x95FC0000,0xCBC80000,0xCC000000, -0xA1FC0000,0xC85AA5,0xFEA4360C,0xFE88228D,0xCC842208,0xFE8027E1,0xFE540C44,0xD8580DC5,0xE84412ED,0xC43C0938,0xA84012ED,0xFE503266,0xFE200CC4,0xD6240C2A,0xF4000B45,0xCA000015,0xAA080AB6,0xC60025C5,0xB2000E98,0x9C001162,0x860025C6,0x12C5AA3,0xFA002A18,0xCC00220B,0xE20021ED,0xBA00100A,0xA2001647,0xBA00351E,0xA8001B25,0x92001B46,0x80002CFE,0x1BF85AA3, -0x90003B5B,0x84003543,0x7200414F,0x64005AA3,0xFEA03D3B,0xF8C05071,0xFAC452C9,0xFE6C216E,0xFE280A9E,0xFC00000E,0xD00C0071,0xC0000184,0xFE783BC2,0xFE441DA3,0xCE000A96,0x92001B46,0x1AC5AA3,0x10C25C5,0xFEE811FD,0xFEC80885,0xCCC40884,0xFEBC1115,0xFE880086,0xD0980104,0xDA840A69,0xC07801D5,0xA8840A69,0x38C25C5,0xFE380AD4,0xCC5C0884,0xF4000B45,0xCA000015, -0xA8200A69,0x4BFC25C5,0xB2000E98,0x9C001162,0x860025C6,0x38C25C5,0xFE380AD4,0xCC5C0884,0xF4000B45,0xCA000015,0xA8200A69,0x4BFC25C5,0xB2000E98,0x9C001162,0x860025C6,0x4BFC25C5,0xB2000E98,0x9C001162,0x860025C6,0x860025C6,0xFEE41BB9,0xF90021AD,0xFD082185,0xFE981149,0xFE5C06AE,0xFC00000E,0xCE24000C,0xC0000184,0xFEC41BE9,0xFE840F78,0xCE0009ED,0x9C001162, -0x1DF825C5,0x842208,0x842208,0x842208,0x842208,0xFE500A9D,0xFE500A9D,0xFE500A9D,0xB4400884,0xB4400884,0x82400885,0xFE180974,0xFE180974,0xFE180974,0xC4000001,0xC4000001,0x88140141,0x92000882,0x92000882,0x7800015A,0x62000882,0xC42208,0xC42208,0xC42208,0xA2000D86,0xA2000D86,0x8000092D,0x80001086,0x80001086,0x6E00071A,0x5E000C63,0x18C2208, -0x18C2208,0x5C001505,0x5000166E,0x4000220B,0xFE6814BA,0xF67C1C38,0x842208,0xFE440A76,0xFE1C025D,0xF8000000,0xDE04001E,0xAE000022,0xFE3C13E0,0xFE180932,0xB6000903,0x6E00071A,0x1182208,0xC40884,0xC40884,0xC40884,0xC40884,0xFC88000D,0xFC88000D,0xFC88000D,0xA2840001,0xA2840001,0x82840001,0x3240882,0x3240882,0x3240882,0xC0080001,0xC0080001, -0x82440001,0x17FC0882,0x17FC0882,0x7800015A,0x62000882,0x3240882,0x3240882,0x3240882,0xC0080001,0xC0080001,0x82440001,0x17FC0882,0x17FC0882,0x7800015A,0x62000882,0x17FC0882,0x17FC0882,0x7800015A,0x62000882,0x62000882,0xFEA0054A,0xF6BC0620,0xC40884,0xFE780349,0xFC3C00CA,0xF8000000,0xD21C0001,0xAE000022,0xF88805B2,0xFC5802F9,0x1A40882,0x7800015A, -0x1A40882,0x1500A69,0xFF2402D5,0xF1080000,0xCD080000,0x1F40A69,0xFEA0004A,0xCCC00000,0x7FF80A69,0xCA000014,0xA8000A69,0x1F40A69,0xFEA0004A,0xCCC00000,0x7FF80A69,0xCA000014,0xA8000A69,0x7FF80A69,0xCA000014,0xA8000A69,0xA8000A69,0x1F40A69,0xFEA0004A,0xCCC00000,0x7FF80A69,0xCA000014,0xA8000A69,0x7FF80A69,0xCA000014,0xA8000A69,0xA8000A69,0x7FF80A69, -0xCA000014,0xA8000A69,0xA8000A69,0xA8000A69,0xFB2C0884,0x1680A69,0xFF4C08C5,0xFEEC05A5,0xFE900304,0xFC00000D,0xCC340000,0xC20000DD,0xFF180845,0xFED80550,0xD64C0001,0xA8000A69,0x5DF80A69,0x400884,0x400884,0x400884,0x400884,0x400884,0x400884,0x400884,0x400884,0x400884,0x400884,0xC8000000,0xC8000000,0xC8000000,0xC8000000,0xC8000000, 
-0xC8000000,0x60000001,0x60000001,0x60000001,0x40000001,0x600882,0x600882,0x600882,0x600882,0x600882,0x600882,0x4C000335,0x4C000335,0x4C000335,0x3A00019A,0xC40882,0xC40882,0xC40882,0x2C00053D,0x20000882,0xFE2C052D,0x400884,0x400884,0xFE200200,0xFE0C0075,0xF6000000,0xF6000000,0xA2000000,0xFE100422,0xFE0001C4,0x7600003A,0x4C000335, -0x8C0882,}; -static const uint32_t g_etc1_to_bc7_m6_table205[] = { -0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x2000000,0x2000000,0x2000000,0x2000000,0x2000000,0x2000000,0x2000000,0x2000000,0x2000000,0x2000000,0x0, -0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x8000000,0x8000000,0x8000000,0x2000000,0x0,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000, -0xDC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x48000001,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0xDC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x48000001,0x1BC0000,0x1BC0000,0x1BC0000,0x48000001,0x48000001,0x69C0000,0x940000,0x940000,0xAC0000,0xB80000,0xC80000,0xC80000,0xF80000,0xAC0000,0xB80000,0x1380000,0x1BC0000, -0x1380000,0x1180000,0x1180000,0x1180000,0x1180000,0x1A00000,0x1A00000,0x1A00000,0x55F80000,0x55F80000,0x8A000001,0x1A00000,0x1A00000,0x1A00000,0x55F80000,0x55F80000,0x8A000001,0x55F80000,0x55F80000,0x8A000001,0x8A000001,0x1A00000,0x1A00000,0x1A00000,0x55F80000,0x55F80000,0x8A000001,0x55F80000,0x55F80000,0x8A000001,0x8A000001,0x55F80000, -0x55F80000,0x8A000001,0x8A000001,0x8A000001,0x3440000,0xD280000,0x1180000,0x17C0000,0x3D40000,0x27FC0000,0x3FF80000,0x63F80000,0x1600000,0x1A00000,0x27FC0000,0x8A000001,0x27FC0000,0x1A80001,0x81FC0000,0xC1FC0000,0xD4000000,0x81FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xD4000000,0xD4000000,0x81FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xD4000000, -0xD4000000,0xC1FC0000,0xD4000000,0xD4000000,0xD4000000,0x81FC0000,0xC1FC0000,0xD4000000,0xC1FC0000,0xD4000000,0xD4000000,0xC1FC0000,0xD4000000,0xD4000000,0xD4000000,0xC1FC0000,0xD4000000,0xD4000000,0xD4000000,0xD4000000,0x33FC0000,0xBC40000,0xBC40000,0x99FC0000,0xB9FC0000,0xCBF40000,0xD4000000,0xD4000000,0x63FC0000,0xA7FC0000,0xD3D80000,0xD4000000, -0xB1FC0000,0xD4604E,0xFEB03C09,0xFE9426F6,0xD49025C5,0xFE8C2D3E,0xFE641023,0xDE641046,0xF24C14D2,0xCE440B09,0xB04C14D2,0xFE68364F,0xFE2C0F61,0xE02C0D53,0xFC040B0A,0xD2040012,0xB20C0B07,0xD20025C6,0xB8000DD5,0xA60010C2,0x8E0025C7,0x33C604A,0xFE042E6B,0xD40425C6,0xE800235E,0xC6001139,0xA80017A6,0xC0003691,0xAE001B58,0x9E001B37,0x86002D9F,0x23FC604A, -0x96003EDE,0x8A0037DE,0x7E004416,0x6A00604A,0xFEA0430C,0xFECC55D2,0xFECC5866,0xFE702686,0xFE3C0E17,0xFE08006C,0xDA1400D9,0xCA0000F6,0xFE904155,0xFE50227D,0xD60009ED,0x9E001B37,0x1C8604A,0x11C25C5,0xFEF412E5,0xFED808A8,0xD4D40884,0xFED011E1,0xFEA0010E,0xD8A80104,0xE2940A69,0xC88801D5,0xB0940A69,0x3A425C5,0xFE580BA3,0xD46C0884,0xFC040B09,0xD2040011, -0xB0300A69,0x57FC25C5,0xB8000DD4,0xA60010C1,0x8E0025C6,0x3A425C5,0xFE580BA3,0xD46C0884,0xFC040B09,0xD2040011,0xB0300A69,0x57FC25C5,0xB8000DD4,0xA60010C1,0x8E0025C6,0x57FC25C5,0xB8000DD4,0xA60010C1,0x8E0025C6,0x8E0025C6,0xFEF81C2C,0xFF0C21C5,0xFF0C21E5,0xFEC01224,0xFE6807CD,0xFE18003B,0xD634000C,0xCA0000F5,0xFEDC1C41,0xFE9810B6,0xD600095D,0xA60010C1, 
-0x2BFC25C5,0x9025C5,0x9025C5,0x9025C5,0x9025C5,0xFE5C0D6A,0xFE5C0D6A,0xFE5C0D6A,0xBC4C0A6A,0xBC4C0A6A,0x8A4C0A6A,0xFE240A95,0xFE240A95,0xFE240A95,0xD204000E,0xD204000E,0x901801D6,0x9E000883,0x9E000883,0x84000113,0x6A000883,0xD425C5,0xD425C5,0xD425C5,0xAE000F45,0xAE000F45,0x86000AD6,0x8C00118B,0x8C00118B,0x7A000763,0x62000CC3,0x1B025C5, -0x1B025C5,0x66001782,0x56001853,0x460025C6,0xFE681813,0xFC881FBD,0x9025C5,0xFE4C0D02,0xFE2C03E3,0xFE08001B,0xEC08004D,0xBC00000B,0xFE5016E0,0xFE300BA2,0xC60008FD,0x7A000763,0x13025C5,0xD40884,0xD40884,0xD40884,0xD40884,0xFE98001D,0xFE98001D,0xFE98001D,0xAA940001,0xAA940001,0x8A940001,0x33C0882,0x33C0882,0x33C0882,0xC8180001,0xC8180001, -0x8A540001,0x23FC0882,0x23FC0882,0x84000112,0x6A000882,0x33C0882,0x33C0882,0x33C0882,0xC8180001,0xC8180001,0x8A540001,0x23FC0882,0x23FC0882,0x84000112,0x6A000882,0x23FC0882,0x23FC0882,0x84000112,0x6A000882,0x6A000882,0xFAB4057D,0xFECC0620,0xD40884,0xFE880374,0xFE5400F2,0xFE100001,0xDA2C0001,0xBC00000A,0xFE9405BA,0xFE680322,0x1C80882,0x84000112, -0x1C80882,0x1600A69,0xFF3C0355,0xF9180000,0xD5180000,0xFFC0A69,0xFEB80092,0xD4D00000,0x8BF80A69,0xD2000008,0xB0000A69,0xFFC0A69,0xFEB80092,0xD4D00000,0x8BF80A69,0xD2000008,0xB0000A69,0x8BF80A69,0xD2000008,0xB0000A69,0xB0000A69,0xFFC0A69,0xFEB80092,0xD4D00000,0x8BF80A69,0xD2000008,0xB0000A69,0x8BF80A69,0xD2000008,0xB0000A69,0xB0000A69,0x8BF80A69, -0xD2000008,0xB0000A69,0xB0000A69,0xB0000A69,0xFF3408B4,0x1780A69,0xF75C0908,0xFF040611,0xFEBC037A,0xFE180032,0xD4440000,0xCE0000B4,0xFF2C088A,0xFEF005C4,0xDE5C0001,0xB0000A69,0x6BFC0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0x4C0A69,0xDC040001,0xDC040001,0xDC040001,0xDC040001,0xDC040001, -0xDC040001,0x6C040001,0x6C040001,0x6C040001,0x48000002,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x580003D4,0x580003D4,0x580003D4,0x400001E1,0xDC0A69,0xDC0A69,0xDC0A69,0x32000652,0x24000A69,0xF63C06CD,0x4C0A69,0x4C0A69,0xFE200321,0xFE140115,0xFE080012,0xFE080012,0xB2040001,0xFE1005A5,0xFE0002C5,0x86000035,0x580003D4, -0x9C0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table206[] = { -0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x2180000,0x2180000,0x2180000,0x2180000,0x2180000,0x2180000,0x2180000,0x2180000,0x2180000,0x2180000,0x300000, -0x300000,0x300000,0x300000,0x8000000,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x100001,0x140000,0x140000,0x140000,0x2180000,0x240000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xA40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000, -0xF40000,0x1F00000,0x1F00000,0x1F00000,0x50000001,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0xF40000,0x1F00000,0x1F00000,0x1F00000,0x50000001,0x1F00000,0x1F00000,0x1F00000,0x50000001,0x50000001,0xEAC0000,0xA40000,0xA40000,0x6BC0000,0x2CC0000,0x2DC0000,0x2DC0000,0x1140000,0x6BC0000,0x2CC0000,0x15C0000,0x1F00000, -0x15C0000,0x1280000,0x1280000,0x1280000,0x1280000,0x1B80000,0x1B80000,0x1B80000,0x61F80000,0x61F80000,0x92000001,0x1B80000,0x1B80000,0x1B80000,0x61F80000,0x61F80000,0x92000001,0x61F80000,0x61F80000,0x92000001,0x92000001,0x1B80000,0x1B80000,0x1B80000,0x61F80000,0x61F80000,0x92000001,0x61F80000,0x61F80000,0x92000001,0x92000001,0x61F80000, 
-0x61F80000,0x92000001,0x92000001,0x92000001,0x1580000,0x13C0000,0x1280000,0x3900000,0x1F00000,0x37FC0000,0x4BFC0000,0x6DFC0000,0x1740000,0x1B80000,0x37FC0000,0x92000001,0x37FC0000,0x1B80001,0x99FC0000,0xCDFC0000,0xDC000000,0x99FC0000,0xCDFC0000,0xDC000000,0xCDFC0000,0xDC000000,0xDC000000,0x99FC0000,0xCDFC0000,0xDC000000,0xCDFC0000,0xDC000000, -0xDC000000,0xCDFC0000,0xDC000000,0xDC000000,0xDC000000,0x99FC0000,0xCDFC0000,0xDC000000,0xCDFC0000,0xDC000000,0xDC000000,0xCDFC0000,0xDC000000,0xDC000000,0xDC000000,0xCDFC0000,0xDC000000,0xDC000000,0xDC000000,0xDC000000,0x5BFC0000,0x1D80000,0x1D80000,0xADFC0000,0xC7F80000,0xD5F40000,0xDC000000,0xDC000000,0x81FC0000,0xB7FC0000,0xDBE80000,0xDC000000, -0xBFFC0000,0xE4604E,0xFEC43D66,0xFEA027D6,0xDCA025C5,0xFE982F3A,0xFE7811D1,0xE6741046,0xFA5C14D2,0xD6540B09,0xB85C14D2,0xFE7437E7,0xFE40114C,0xE83C0D53,0xFE140B5E,0xDA140012,0xBA1C0B07,0xDA1025C6,0xC4000D55,0xAC001046,0x961025C7,0x154604A,0xFE082F3D,0xDC1425C6,0xF4002116,0xD2000F41,0xB4001656,0xD20034A5,0xBA001888,0xA800188D,0x92002BD7,0x2FFC604A, -0xA0003D3D,0x960035A6,0x8400424A,0x7200604A,0xFEBC4462,0xFECC56D2,0xF8E05915,0xFE8028C4,0xFE501077,0xFE240126,0xE22400D9,0xD40800DB,0xFE9842FE,0xFE5C2552,0xE4000921,0xA800188D,0x1E8604A,0x12C25C5,0xFF0C13F5,0xFEEC090D,0xDCE40884,0xFEE812E1,0xFEB801D6,0xE0B80104,0xEAA40A69,0xD09801D5,0xB8A40A69,0x3BC25C5,0xFE700C8B,0xDC7C0884,0xFE140B5D,0xDA140011, -0xB8400A69,0x63FC25C5,0xC4000D04,0xAC000FF5,0x960025C6,0x3BC25C5,0xFE700C8B,0xDC7C0884,0xFE140B5D,0xDA140011,0xB8400A69,0x63FC25C5,0xC4000D04,0xAC000FF5,0x960025C6,0x63FC25C5,0xC4000D04,0xAC000FF5,0x960025C6,0x960025C6,0xFF001CF1,0xFB242229,0xFD282208,0xFED41351,0xFE90091D,0xFE2C00B1,0xDE44000C,0xD4000099,0xFEF81D1D,0xFEC01208,0xE4000908,0xAC000FF5, -0x3BFC25C5,0xA025C5,0xA025C5,0xA025C5,0xA025C5,0xFE680E06,0xFE680E06,0xFE680E06,0xC45C0A6A,0xC45C0A6A,0x925C0A6A,0xFE300B25,0xFE300B25,0xFE300B25,0xDA14000E,0xDA14000E,0x982801D6,0xA6100883,0xA6100883,0x8A0C0103,0x72100883,0xEC25C5,0xEC25C5,0xEC25C5,0xC0000E11,0xC0000E11,0x92000A76,0x98001033,0x98001033,0x800005C3,0x6E000B7B,0x1E425C5, -0x1E425C5,0x6C001642,0x5C001743,0x4E0025C6,0xFE841892,0xFE8C201D,0xA025C5,0xFE5C0DCE,0xFE40049B,0xFE1C004D,0xF418004D,0xC410000B,0xFE6417AD,0xFE3C0C7D,0xD6000894,0x800005C3,0x15425C5,0xE40884,0xE40884,0xE40884,0xE40884,0xFEB0003D,0xFEB0003D,0xFEB0003D,0xB2A40001,0xB2A40001,0x92A40001,0x1540882,0x1540882,0x1540882,0xD0280001,0xD0280001, -0x92640001,0x2FFC0882,0x2FFC0882,0x8A0000CA,0x72000882,0x1540882,0x1540882,0x1540882,0xD0280001,0xD0280001,0x92640001,0x2FFC0882,0x2FFC0882,0x8A0000CA,0x72000882,0x2FFC0882,0x2FFC0882,0x8A0000CA,0x72000882,0x72000882,0xFEBC05A5,0xF6DC0659,0xE40884,0xFE9803A9,0xFC6C0139,0xFE2C0008,0xE23C0001,0xC8000000,0xFAAC05E9,0xFE740371,0x1E80882,0x8A0000CA, -0x1E80882,0x1700A69,0xFF5003D0,0xFF280001,0xDD280000,0x29FC0A69,0xFED80104,0xDCE00000,0x97F80A69,0xDC000000,0xB8000A69,0x29FC0A69,0xFED80104,0xDCE00000,0x97F80A69,0xDC000000,0xB8000A69,0x97F80A69,0xDC000000,0xB8000A69,0xB8000A69,0x29FC0A69,0xFED80104,0xDCE00000,0x97F80A69,0xDC000000,0xB8000A69,0x97F80A69,0xDC000000,0xB8000A69,0xB8000A69,0x97F80A69, -0xDC000000,0xB8000A69,0xB8000A69,0xB8000A69,0xFF5008C9,0x1880A69,0xFF6C0908,0xFF200671,0xFED003FA,0xFE480074,0xDC540000,0xD4000080,0xF74C08C5,0xFF080601,0xE66C0001,0xB8000A69,0x7BFC0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0x5C0A69,0xE4140001,0xE4140001,0xE4140001,0xE4140001,0xE4140001, 
-0xE4140001,0x74140001,0x74140001,0x74140001,0x50100002,0x840A69,0x840A69,0x840A69,0x840A69,0x840A69,0x840A69,0x680002EA,0x680002EA,0x680002EA,0x4C000119,0x10C0A69,0x10C0A69,0x10C0A69,0x3E0005AA,0x2C000A69,0xFE4C06CD,0x5C0A69,0x5C0A69,0xFE3C0349,0xFE28013D,0xFE180022,0xFE180022,0xBA140001,0xFE2005E4,0xFE1402E4,0x9A000001,0x680002EA, -0xBC0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table207[] = { -0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x2300000,0x2300000,0x2300000,0x2300000,0x2300000,0x2300000,0x2300000,0x2300000,0x2300000,0x2300000,0x640000, -0x640000,0x640000,0x640000,0x10000000,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x200001,0x240000,0x240000,0x240000,0x2300000,0x480000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0xB40000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000, -0x10C0000,0xBF80000,0xBF80000,0xBF80000,0x58000001,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0xBF80000,0xBF80000,0xBF80000,0x58000001,0xBF80000,0xBF80000,0xBF80000,0x58000001,0x58000001,0xC00000,0xB40000,0xB40000,0xD00000,0x2E00000,0xF40000,0xF40000,0x32C0000,0xD00000,0x2E00000,0x17C0000,0xBF80000, -0x17C0000,0x1380000,0x1380000,0x1380000,0x1380000,0x1D00000,0x1D00000,0x1D00000,0x6DF80000,0x6DF80000,0x9A000001,0x1D00000,0x1D00000,0x1D00000,0x6DF80000,0x6DF80000,0x9A000001,0x6DF80000,0x6DF80000,0x9A000001,0x9A000001,0x1D00000,0x1D00000,0x1D00000,0x6DF80000,0x6DF80000,0x9A000001,0x6DF80000,0x6DF80000,0x9A000001,0x9A000001,0x6DF80000, -0x6DF80000,0x9A000001,0x9A000001,0x9A000001,0x7680000,0x14C0000,0x1380000,0x1A80000,0xDFC0000,0x45FC0000,0x59FC0000,0x79F80000,0x1880000,0x1D00000,0x45FC0000,0x9A000001,0x45FC0000,0x1C80001,0xB1FC0000,0xD9FC0000,0xE4000000,0xB1FC0000,0xD9FC0000,0xE4000000,0xD9FC0000,0xE4000000,0xE4000000,0xB1FC0000,0xD9FC0000,0xE4000000,0xD9FC0000,0xE4000000, -0xE4000000,0xD9FC0000,0xE4000000,0xE4000000,0xE4000000,0xB1FC0000,0xD9FC0000,0xE4000000,0xD9FC0000,0xE4000000,0xE4000000,0xD9FC0000,0xE4000000,0xE4000000,0xE4000000,0xD9FC0000,0xE4000000,0xE4000000,0xE4000000,0xE4000000,0x81FC0000,0x1E80000,0x1E80000,0xC1FC0000,0xD5F80000,0xDFF80000,0xE4000000,0xE4000000,0x9FFC0000,0xC9FC0000,0xE3F80000,0xE4000000, -0xCFFC0000,0xF4604E,0xFED03EDE,0xFEB828D6,0xE4B025C5,0xFEB03142,0xFE8813AF,0xEE841046,0xFE6C14EA,0xDE640B09,0xC06C14D2,0xFE8C397F,0xFE58135C,0xF04C0D53,0xFE2C0C2E,0xE2240012,0xC22C0B07,0xE22025C6,0xCE080D51,0xB4101046,0x9E2025C7,0x16C604A,0xFE2C30C9,0xE42425C6,0xFA001F7A,0xDE000DA9,0xBC00157A,0xE20032CB,0xC60015F8,0xAE00161D,0x98002A5B,0x3BFC604A, -0xAC003B6D,0x9C003362,0x8A0040B6,0x7A00604A,0xFED045E7,0xFEEC56D6,0xFEEC5931,0xFE982B12,0xFE6812F6,0xFE38023E,0xEA3400D9,0xDC1800DB,0xFEB4443B,0xFE802784,0xEE0008BF,0xAE00161D,0x7FC604A,0x13C25C5,0xFF1814ED,0xFEFC0994,0xE4F40884,0xFEF413E5,0xFECC02E0,0xE8C80104,0xF2B40A69,0xD8A801D5,0xC0B40A69,0x1D425C5,0xFE940D8B,0xE48C0884,0xFE380C01,0xE2240011, -0xC0500A69,0x6FFC25C5,0xD0000C54,0xB2000F59,0x9E0025C6,0x1D425C5,0xFE940D8B,0xE48C0884,0xFE380C01,0xE2240011,0xC0500A69,0x6FFC25C5,0xD0000C54,0xB2000F59,0x9E0025C6,0x6FFC25C5,0xD0000C54,0xB2000F59,0x9E0025C6,0x9E0025C6,0xFF141D5E,0xFF2C2251,0xFF2C2274,0xFEEC1476,0xFEA40A79,0xFE480164,0xE654000C,0xE0000062,0xFF081D9A,0xFEC812EA,0xEE0008BB,0xB2000F59, 
-0x49FC25C5,0xB025C5,0xB025C5,0xB025C5,0xB025C5,0xFE800E8E,0xFE800E8E,0xFE800E8E,0xCC6C0A6A,0xCC6C0A6A,0x9A6C0A6A,0xFE440BA3,0xFE440BA3,0xFE440BA3,0xE224000E,0xE224000E,0xA03801D6,0xAE200883,0xAE200883,0x921C0103,0x7A200883,0x10425C5,0x10425C5,0x10425C5,0xD2000D0D,0xD2000D0D,0x9A080A6A,0xA8000EC8,0xA8000EC8,0x8C000443,0x74000A83,0x7FC25C5, -0x7FC25C5,0x7800151A,0x66001632,0x560025C6,0xFE901926,0xFCA82036,0xB025C5,0xFE700EB6,0xFE540563,0xFE34009A,0xFC28004D,0xCC20000B,0xFE78181A,0xFE500D1D,0xE4040883,0x8C000443,0x17425C5,0xF40884,0xF40884,0xF40884,0xF40884,0xFEBC0061,0xFEBC0061,0xFEBC0061,0xBAB40001,0xBAB40001,0x9AB40001,0x16C0882,0x16C0882,0x16C0882,0xD8380001,0xD8380001, -0x9A740001,0x3BFC0882,0x3BFC0882,0x96000092,0x7A000882,0x16C0882,0x16C0882,0x16C0882,0xD8380001,0xD8380001,0x9A740001,0x3BFC0882,0x3BFC0882,0x96000092,0x7A000882,0x3BFC0882,0x3BFC0882,0x96000092,0x7A000882,0x7A000882,0xFED805B2,0xFEEC0659,0xF40884,0xFEB403F5,0xFE80016D,0xFE480020,0xEA4C0001,0xD0100000,0xFEB40611,0xFE98039D,0x7FC0882,0x96000092, -0x7FC0882,0x1800A69,0xFF5C0454,0xFF380024,0xE5380000,0x41FC0A69,0xFEF00184,0xE4F00000,0xA1FC0A69,0xE40C0000,0xC0000A69,0x41FC0A69,0xFEF00184,0xE4F00000,0xA1FC0A69,0xE40C0000,0xC0000A69,0xA1FC0A69,0xE40C0000,0xC0000A69,0xC0000A69,0x41FC0A69,0xFEF00184,0xE4F00000,0xA1FC0A69,0xE40C0000,0xC0000A69,0xA1FC0A69,0xE40C0000,0xC0000A69,0xC0000A69,0xA1FC0A69, -0xE40C0000,0xC0000A69,0xC0000A69,0xC0000A69,0xFD680908,0x19C0A69,0xF77C094D,0xFF4006CD,0xFEE80484,0xFE7000E8,0xE4640000,0xE0000061,0xFF5C08C5,0xFF200681,0xEE7C0001,0xC0000A69,0x89FC0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0x6C0A69,0xEC240001,0xEC240001,0xEC240001,0xEC240001,0xEC240001, -0xEC240001,0x7C240001,0x7C240001,0x7C240001,0x58200002,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x74000232,0x74000232,0x74000232,0x58000091,0x13C0A69,0x13C0A69,0x13C0A69,0x44000502,0x34000A69,0xF65C070A,0x6C0A69,0x6C0A69,0xFE4C0372,0xFE3C016D,0xFE2C003D,0xFE2C003D,0xC2240001,0xFE3C05E9,0xFA2C0322,0xA2100001,0x74000232, -0xE00A69,}; -static const uint32_t g_etc1_to_bc7_m6_table208[] = { -0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x4C0000,0x980000, -0x980000,0x980000,0x980000,0x18000001,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x340000,0x380000,0x380000,0x380000,0x4C0000,0x6C0000,0xC40001,0xC40001,0xC40001,0xC40001,0xC40001,0xC40001,0xC40001,0xC40001,0xC40001,0xC40001,0x3240000,0x3240000,0x3240000,0x3240000,0x3240000, -0x3240000,0x17FC0000,0x17FC0000,0x17FC0000,0x62000000,0x3240000,0x3240000,0x3240000,0x3240000,0x3240000,0x3240000,0x17FC0000,0x17FC0000,0x17FC0000,0x62000000,0x17FC0000,0x17FC0000,0x17FC0000,0x62000000,0x62000000,0xD40000,0xC40001,0xC40001,0x4E40000,0xF80000,0x10C0000,0x10C0000,0x14C0000,0x4E40000,0xF80000,0x1A40000,0x17FC0000, -0x1A40000,0x1480001,0x1480001,0x1480001,0x1480001,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0xA4000000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0xA4000000,0x79FC0000,0x79FC0000,0xA4000000,0xA4000000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0xA4000000,0x79FC0000,0x79FC0000,0xA4000000,0xA4000000,0x79FC0000, 
-0x79FC0000,0xA4000000,0xA4000000,0xA4000000,0x1800000,0x1600000,0x1480001,0x1C00000,0x23FC0000,0x57FC0000,0x69F80000,0x85F80000,0x39C0000,0x3E80000,0x57FC0000,0xA4000000,0x57FC0000,0x1DC0000,0xCDFC0000,0xE7F80000,0xEC000001,0xCDFC0000,0xE7F80000,0xEC000001,0xE7F80000,0xEC000001,0xEC000001,0xCDFC0000,0xE7F80000,0xEC000001,0xE7F80000,0xEC000001, -0xEC000001,0xE7F80000,0xEC000001,0xEC000001,0xEC000001,0xCDFC0000,0xE7F80000,0xEC000001,0xE7F80000,0xEC000001,0xEC000001,0xE7F80000,0xEC000001,0xEC000001,0xEC000001,0xE7F80000,0xEC000001,0xEC000001,0xEC000001,0xEC000001,0xADFC0000,0x1FC0000,0x1FC0000,0xD7FC0000,0xE3FC0000,0xEBF00000,0xEC000001,0xEC000001,0xC1FC0000,0xDBFC0000,0xEDEC0000,0xEC000001, -0xDFFC0000,0x108604A,0xFEE840B6,0xFECC2A5B,0xECC025C7,0xFEC43362,0xFEA0161D,0xF6941046,0xFE84157A,0xE8780B07,0xC87C14D2,0xFEA43B6D,0xFE7015F8,0xFA600D51,0xFE400DA9,0xEC380012,0xCC400B09,0xEC3425C6,0xD81C0D53,0xBC201046,0xA63425C5,0x188604A,0xFE3832CB,0xEC3825C6,0xFE081F7A,0xE8000C2E,0xC60014EA,0xE80030C9,0xD200135C,0xBA0013AF,0xA20028D6,0x49F8604A, -0xB800397F,0xA6003142,0x96003EDE,0x8200604E,0xFEE44782,0xF90057DA,0xFB0459DA,0xFEAC2DF6,0xFE7C160E,0xFE4803FF,0xF24400DB,0xE42800D9,0xFED0463D,0xFE902A56,0xF81008BB,0xBA0013AF,0x19FC604A,0x15025C6,0xFF301632,0xFF140A83,0xED080883,0xFF0C151A,0xFEE40443,0xF0D80103,0xFAC40A6A,0xE2BC01D6,0xC8C40A6A,0x1F025C5,0xFEAC0EC8,0xECA00883,0xFE580D0D,0xEC38000E, -0xC8600A6A,0x7DF825C5,0xDC000BA3,0xBE000E8E,0xA60025C5,0x1F025C5,0xFEAC0EC8,0xECA00883,0xFE580D0D,0xEC38000E,0xC8600A6A,0x7DF825C5,0xDC000BA3,0xBE000E8E,0xA60025C5,0x7DF825C5,0xDC000BA3,0xBE000E8E,0xA60025C5,0xA60025C5,0xFF341E13,0xFD4822AA,0xFF4C228E,0xFF0015CE,0xFEBC0C34,0xFE70027E,0xEE64000B,0xE800004D,0xFF181E3E,0xFEDC1481,0xFC00088B,0xBE000E8E, -0x5BFC25C5,0xC025C6,0xC025C6,0xC025C6,0xC025C6,0xFE980F59,0xFE980F59,0xFE980F59,0xD67C0A69,0xD67C0A69,0xA47C0A69,0xFE5C0C54,0xFE5C0C54,0xFE5C0C54,0xEA340011,0xEA340011,0xAA4C01D5,0xB6340884,0xB6340884,0x9A2C0104,0x82340884,0x12025C5,0x12025C5,0x12025C5,0xE2000C01,0xE2000C01,0xA4180A69,0xB4000D8B,0xB4000D8B,0x980002E0,0x80000994,0x15F825C5, -0x15F825C5,0x840013E5,0x720014ED,0x600025C5,0xFEA019B4,0xF6BC20B2,0xC025C6,0xFE880F9E,0xFE6C0669,0xFE480126,0xFE3C0062,0xD430000C,0xFE9018F2,0xFE680E59,0xEC180882,0x980002E0,0x19C25C5,0x1080882,0x1080882,0x1080882,0x1080882,0xFED00092,0xFED00092,0xFED00092,0xC4C40001,0xC4C40001,0xA4C40001,0x1880882,0x1880882,0x1880882,0xE04C0001,0xE04C0001, -0xA4840001,0x49F80882,0x49F80882,0xA0000061,0x82000884,0x1880882,0x1880882,0x1880882,0xE04C0001,0xE04C0001,0xA4840001,0x49F80882,0x49F80882,0xA0000061,0x82000884,0x49F80882,0x49F80882,0xA0000061,0x82000884,0x82000884,0xFAEC05E9,0xF9000692,0x1080882,0xFEC4042A,0xFE9801A9,0xFE64003D,0xF65C0000,0xD8240001,0xFED00628,0xFEA403E8,0x19FC0882,0xA0000061, -0x19FC0882,0x1940A69,0xFF740502,0xFF4C0091,0xED480002,0x5BFC0A69,0xFF140232,0xED040001,0xAFFC0A69,0xEC240001,0xC8000A69,0x5BFC0A69,0xFF140232,0xED040001,0xAFFC0A69,0xEC240001,0xC8000A69,0xAFFC0A69,0xEC240001,0xC8000A69,0xC8000A69,0x5BFC0A69,0xFF140232,0xED040001,0xAFFC0A69,0xEC240001,0xC8000A69,0xAFFC0A69,0xEC240001,0xC8000A69,0xC8000A69,0xAFFC0A69, -0xEC240001,0xC8000A69,0xC8000A69,0xC8000A69,0xFF780910,0x1AC0A69,0xFF8C0951,0xFF580749,0xFF10052A,0xFE98018D,0xEC780001,0xE800003D,0xFF70090A,0xFF4006E5,0xF88C0000,0xC8000A69,0x9BFC0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0x7C0A69,0xF6340000,0xF6340000,0xF6340000,0xF6340000,0xF6340000, 
-0xF6340000,0x86340000,0x86340000,0x86340000,0x62340000,0xB80A69,0xB80A69,0xB80A69,0xB80A69,0xB80A69,0xB80A69,0x86000184,0x86000184,0x86000184,0x62000024,0x1740A69,0x1740A69,0x1740A69,0x50000454,0x3E000A69,0xFE6C0710,0x7C0A69,0x7C0A69,0xFE5803B5,0xFE5001A5,0xFE3C0061,0xFE3C0061,0xCC340000,0xFA500622,0xFE3C0355,0xAC200000,0x86000184, -0x1080A69,}; -static const uint32_t g_etc1_to_bc7_m6_table209[] = { -0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0xCC0000, -0xCC0000,0xCC0000,0xCC0000,0x20000001,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x480000,0x480000,0x480000,0x640000,0x900000,0xD40001,0xD40001,0xD40001,0xD40001,0xD40001,0xD40001,0xD40001,0xD40001,0xD40001,0xD40001,0x33C0000,0x33C0000,0x33C0000,0x33C0000,0x33C0000, -0x33C0000,0x23FC0000,0x23FC0000,0x23FC0000,0x6A000000,0x33C0000,0x33C0000,0x33C0000,0x33C0000,0x33C0000,0x33C0000,0x23FC0000,0x23FC0000,0x23FC0000,0x6A000000,0x23FC0000,0x23FC0000,0x23FC0000,0x6A000000,0x6A000000,0xE40000,0xD40001,0xD40001,0xF80000,0x10C0000,0x1240000,0x1240000,0x1680000,0xF80000,0x10C0000,0x1C80000,0x23FC0000, -0x1C80000,0x1580001,0x1580001,0x1580001,0x1580001,0x7FC0000,0x7FC0000,0x7FC0000,0x85FC0000,0x85FC0000,0xAC000000,0x7FC0000,0x7FC0000,0x7FC0000,0x85FC0000,0x85FC0000,0xAC000000,0x85FC0000,0x85FC0000,0xAC000000,0xAC000000,0x7FC0000,0x7FC0000,0x7FC0000,0x85FC0000,0x85FC0000,0xAC000000,0x85FC0000,0x85FC0000,0xAC000000,0xAC000000,0x85FC0000, -0x85FC0000,0xAC000000,0xAC000000,0xAC000000,0x5900000,0x1700000,0x1580001,0x3D40000,0x37FC0000,0x65FC0000,0x75FC0000,0x91F40000,0x3B00000,0x7FC0000,0x65FC0000,0xAC000000,0x65FC0000,0x1EC0000,0xE5FC0000,0xF3F80000,0xF4000001,0xE5FC0000,0xF3F80000,0xF4000001,0xF3F80000,0xF4000001,0xF4000001,0xE5FC0000,0xF3F80000,0xF4000001,0xF3F80000,0xF4000001, -0xF4000001,0xF3F80000,0xF4000001,0xF4000001,0xF4000001,0xE5FC0000,0xF3F80000,0xF4000001,0xF3F80000,0xF4000001,0xF4000001,0xF3F80000,0xF4000001,0xF4000001,0xF4000001,0xF3F80000,0xF4000001,0xF4000001,0xF4000001,0xF4000001,0xD5FC0000,0x77FC0000,0x77FC0000,0xEBFC0000,0xF1FC0000,0xF5F00000,0xF4000001,0xF4000001,0xDFFC0000,0xEDFC0000,0xF5FC0000,0xF4000001, -0xEFFC0000,0x118604A,0xFEF4424A,0xFED82BD7,0xF4D025C7,0xFED035A6,0xFEAC188D,0xFEA41046,0xFE941656,0xF0880B07,0xD08C14D2,0xFEBC3D3D,0xFE881888,0xFE740D55,0xFE580F41,0xF4480012,0xD4500B09,0xF44425C6,0xE02C0D53,0xC4301046,0xAE4425C5,0x1A0604A,0xFE5834A5,0xF44825C6,0xFE142116,0xF4000B5E,0xD00414D2,0xFA002F3D,0xDE00114C,0xC20011D1,0xAE0027D6,0x55F8604A, -0xC40037E7,0xB2002F3A,0x9C003D66,0x8A00604E,0xFEF44912,0xFF0C57EA,0xFF0C5A1A,0xFEC4307A,0xFE901902,0xFE5C05EA,0xFA5400DB,0xEC3800D9,0xFEDC478F,0xFEB02D65,0xFE2008C1,0xC20011D1,0x27FC604A,0x16025C6,0xFF441743,0xFF200B7B,0xF5180883,0xFF241642,0xFEF005C3,0xF8E80103,0xFED80A76,0xEACC01D6,0xD0D40A6A,0xDFC25C5,0xFECC1033,0xF4B00883,0xFE7C0E11,0xF448000E, -0xD0700A6A,0x89F825C5,0xE6000B25,0xCA000E06,0xAE0025C5,0xDFC25C5,0xFECC1033,0xF4B00883,0xFE7C0E11,0xF448000E,0xD0700A6A,0x89F825C5,0xE6000B25,0xCA000E06,0xAE0025C5,0x89F825C5,0xE6000B25,0xCA000E06,0xAE0025C5,0xAE0025C5,0xFF3C1EC2,0xFF4C22FE,0xF75C2313,0xFF1416C1,0xFEDC0DD4,0xFE8403CB,0xF674000B,0xF010004D,0xFF341F0E,0xFF0415B5,0xFE1408A6,0xCA000E06, 
-0x69FC25C5,0xD025C6,0xD025C6,0xD025C6,0xD025C6,0xFEA40FF5,0xFEA40FF5,0xFEA40FF5,0xDE8C0A69,0xDE8C0A69,0xAC8C0A69,0xFE740D04,0xFE740D04,0xFE740D04,0xF2440011,0xF2440011,0xB25C01D5,0xBE440884,0xBE440884,0xA23C0104,0x8A440884,0x13825C5,0x13825C5,0x13825C5,0xF4000B5D,0xF4000B5D,0xAC280A69,0xC6000C8B,0xC6000C8B,0xA20001D6,0x8800090D,0x21F825C5, -0x21F825C5,0x8A0012E1,0x780013F5,0x680025C5,0xFEBC1A59,0xFECC20B2,0xD025C6,0xFE981069,0xFE800755,0xFE5C01A9,0xFE540099,0xDC40000C,0xFEA01966,0xFE740F25,0xF4280882,0xA20001D6,0x1BC25C5,0x1180882,0x1180882,0x1180882,0x1180882,0xFEE800CA,0xFEE800CA,0xFEE800CA,0xCCD40001,0xCCD40001,0xACD40001,0x1A00882,0x1A00882,0x1A00882,0xE85C0001,0xE85C0001, -0xAC940001,0x55F80882,0x55F80882,0xA600003D,0x8A000884,0x1A00882,0x1A00882,0x1A00882,0xE85C0001,0xE85C0001,0xAC940001,0x55F80882,0x55F80882,0xA600003D,0x8A000884,0x55F80882,0x55F80882,0xA600003D,0x8A000884,0x8A000884,0xFEF40611,0xFF0C069A,0x1180882,0xFEDC0451,0xFEAC01ED,0xFE7C0064,0xFE6C0000,0xE0340001,0xF8EC0659,0xFEC40422,0x27FC0882,0xA600003D, -0x27FC0882,0x1A40A69,0xFF8005AA,0xFF640119,0xF5580002,0x75FC0A69,0xFF2C02EA,0xF5140001,0xBBFC0A69,0xF4340001,0xD0000A69,0x75FC0A69,0xFF2C02EA,0xF5140001,0xBBFC0A69,0xF4340001,0xD0000A69,0xBBFC0A69,0xF4340001,0xD0000A69,0xD0000A69,0x75FC0A69,0xFF2C02EA,0xF5140001,0xBBFC0A69,0xF4340001,0xD0000A69,0xBBFC0A69,0xF4340001,0xD0000A69,0xD0000A69,0xBBFC0A69, -0xF4340001,0xD0000A69,0xD0000A69,0xD0000A69,0xFD90094D,0x1C00A69,0xF9A00992,0xFF6C0794,0xFF3C05C4,0xFEC80235,0xF4880001,0xF2000022,0xF98C094D,0xFF58076D,0xFEA00004,0xD0000A69,0xA9FC0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0x8C0A69,0xFE440000,0xFE440000,0xFE440000,0xFE440000,0xFE440000, -0xFE440000,0x8E440000,0x8E440000,0x8E440000,0x6A440000,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0x92000104,0x92000104,0x92000104,0x6A000001,0x1A40A69,0x1A40A69,0x1A40A69,0x560003D0,0x46000A69,0xF8800745,0x8C0A69,0x8C0A69,0xFE6803E8,0xFE6401E1,0xFE540080,0xFE540080,0xD4440000,0xFE580652,0xFE50037A,0xB4300000,0x92000104, -0x1280A69,}; -static const uint32_t g_etc1_to_bc7_m6_table210[] = { -0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0x7C0000,0xFC0000, -0xFC0000,0xFC0000,0xFC0000,0x28000001,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x540000,0x4580000,0x4580000,0x4580000,0x7C0000,0xB00000,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0xE40001,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000, -0x1540000,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x72000000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x1540000,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x72000000,0x2FFC0000,0x2FFC0000,0x2FFC0000,0x72000000,0x72000000,0x2F40000,0xE40001,0xE40001,0x10C0000,0x1200000,0x1380000,0x1380000,0x3800000,0x10C0000,0x1200000,0x1E80000,0x2FFC0000, -0x1E80000,0x1680001,0x1680001,0x1680001,0x1680001,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x91FC0000,0x91FC0000,0xB4000000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x91FC0000,0x91FC0000,0xB4000000,0x91FC0000,0x91FC0000,0xB4000000,0xB4000000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x91FC0000,0x91FC0000,0xB4000000,0x91FC0000,0x91FC0000,0xB4000000,0xB4000000,0x91FC0000, 
-0x91FC0000,0xB4000000,0xB4000000,0xB4000000,0x1A40000,0x9800000,0x1680001,0x1EC0000,0x4BFC0000,0x75FC0000,0x83FC0000,0x9BF80000,0x3C40000,0x1FFC0000,0x75FC0000,0xB4000000,0x75FC0000,0x1FC0000,0xFDFC0000,0xFFF80000,0xFC000001,0xFDFC0000,0xFFF80000,0xFC000001,0xFFF80000,0xFC000001,0xFC000001,0xFDFC0000,0xFFF80000,0xFC000001,0xFFF80000,0xFC000001, -0xFC000001,0xFFF80000,0xFC000001,0xFC000001,0xFC000001,0xFDFC0000,0xFFF80000,0xFC000001,0xFFF80000,0xFC000001,0xFC000001,0xFFF80000,0xFC000001,0xFC000001,0xFC000001,0xFFF80000,0xFC000001,0xFC000001,0xFC000001,0xFC000001,0xFDFC0000,0xF7FC0000,0xF7FC0000,0xFDFC0000,0xFFF80000,0xFFF00000,0xFC000001,0xFC000001,0xFDFC0000,0xFDFC0000,0xFFD00000,0xFC000001, -0xFFF80000,0x128604A,0xFF004416,0xFEF02D9F,0xFCE025C7,0xFEE837DE,0xFEC01B37,0xFEB010C2,0xFEAC17A6,0xF8980B07,0xD89C14D2,0xFED03EDE,0xFEA01B58,0xFE8C0DD5,0xFE701139,0xFC580012,0xDC600B09,0xFC5425C6,0xE83C0D53,0xCC401046,0xB65425C5,0x1B8604A,0xFE7C3691,0xFC5825C6,0xFE2C235E,0xFC040B0A,0xD81414D2,0xFC002E6B,0xE8000F61,0xCC001023,0xB40026F6,0x61F8604A, -0xCA00364F,0xB8002D3E,0xA6003C09,0x9200604E,0xFF004A76,0xF92058E2,0xFB245AA3,0xFED432EA,0xFEA81C14,0xFE7C0838,0xFE6800F6,0xF44800D9,0xFEF0492C,0xFEC02FD3,0xFE38092D,0xCC001023,0x37FC604A,0x17025C6,0xFF501853,0xFF380CC3,0xFD280883,0xFF301782,0xFF080763,0xFEF40113,0xFEF00AD6,0xF2DC01D6,0xD8E40A6A,0x25FC25C5,0xFEE4118B,0xFCC00883,0xFEA00F45,0xFC58000E, -0xD8800A6A,0x95F825C5,0xEC000A95,0xD0000D6A,0xB60025C5,0x25FC25C5,0xFEE4118B,0xFCC00883,0xFEA00F45,0xFC58000E,0xD8800A6A,0x95F825C5,0xEC000A95,0xD0000D6A,0xB60025C5,0x95F825C5,0xEC000A95,0xD0000D6A,0xB60025C5,0xB60025C5,0xFF501F35,0xFD68232A,0xFF6C2313,0xFF30183D,0xFEE80F6B,0xFEAC055A,0xFE84000B,0xF820004D,0xFF441F99,0xFF101711,0xFE34092B,0xD0000D6A, -0x79FC25C5,0xE025C6,0xE025C6,0xE025C6,0xE025C6,0xFEB010C1,0xFEB010C1,0xFEB010C1,0xE69C0A69,0xE69C0A69,0xB49C0A69,0xFE8C0DD4,0xFE8C0DD4,0xFE8C0DD4,0xFA540011,0xFA540011,0xBA6C01D5,0xC6540884,0xC6540884,0xAA4C0104,0x92540884,0x15025C5,0x15025C5,0x15025C5,0xFC040B09,0xFC040B09,0xB4380A69,0xD2000BA3,0xD2000BA3,0xAE00010E,0x920008A8,0x2DF825C5, -0x2DF825C5,0x960011E1,0x840012E5,0x700025C5,0xFECC1ACA,0xF6DC212D,0xE025C6,0xFEAC116D,0xFE88084E,0xFE700251,0xFE6800F5,0xE450000C,0xFEB419D9,0xFE901016,0xFC380882,0xAE00010E,0x1E025C5,0x1280882,0x1280882,0x1280882,0x1280882,0xFEF40112,0xFEF40112,0xFEF40112,0xD4E40001,0xD4E40001,0xB4E40001,0x1B80882,0x1B80882,0x1B80882,0xF06C0001,0xF06C0001, -0xB4A40001,0x61F80882,0x61F80882,0xB200001D,0x92000884,0x1B80882,0x1B80882,0x1B80882,0xF06C0001,0xF06C0001,0xB4A40001,0x61F80882,0x61F80882,0xB200001D,0x92000884,0x61F80882,0x61F80882,0xB200001D,0x92000884,0x92000884,0xFF100620,0xF92006CD,0x1280882,0xFCF404B1,0xFEC00239,0xFE9400AA,0xFE84000A,0xE8440001,0xFEF8065D,0xFEDC0451,0x37FC0882,0xB200001D, -0x37FC0882,0x1B40A69,0xFF980652,0xFF7C01E1,0xFD680002,0x8DFC0A69,0xFF4C03D4,0xFD240001,0xC7FC0A69,0xFC440001,0xD8000A69,0x8DFC0A69,0xFF4C03D4,0xFD240001,0xC7FC0A69,0xFC440001,0xD8000A69,0xC7FC0A69,0xFC440001,0xD8000A69,0xD8000A69,0x8DFC0A69,0xFF4C03D4,0xFD240001,0xC7FC0A69,0xFC440001,0xD8000A69,0xC7FC0A69,0xFC440001,0xD8000A69,0xD8000A69,0xC7FC0A69, -0xFC440001,0xD8000A69,0xD8000A69,0xD8000A69,0xFDA40992,0x1D00A69,0xFFAC099A,0xFF840812,0xFF500668,0xFEF0031D,0xFC980001,0xFA000012,0xFF980951,0xFF7007FD,0xFED00049,0xD8000A69,0xB9FC0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0x9C0A69,0xFC580008,0xFC580008,0xFC580008,0xFC580008,0xFC580008, 
-0xFC580008,0x96540000,0x96540000,0x96540000,0x72540000,0xE80A69,0xE80A69,0xE80A69,0xE80A69,0xE80A69,0xE80A69,0xA2000092,0xA2000092,0xA2000092,0x720C0000,0x1D80A69,0x1D80A69,0x1D80A69,0x60000355,0x4E000A69,0xFE8C0751,0x9C0A69,0x9C0A69,0xFC800424,0xFE6C0220,0xFE6000B4,0xFE6000B4,0xDC540000,0xFE740659,0xFE6403CA,0xBC400000,0xA2000092, -0x14C0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table211[] = { -0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x940000,0x12C0000, -0x12C0000,0x12C0000,0x12C0000,0x30000001,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0x640000,0xC680000,0xC680000,0xC680000,0x940000,0xD40000,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0xF40001,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000, -0x16C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x16C0000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x3BFC0000,0x3BFC0000,0x3BFC0000,0x7A000000,0x7A000000,0xB040000,0xF40001,0xF40001,0x51C0000,0x1340000,0x34C0000,0x34C0000,0x19C0000,0x51C0000,0x1340000,0x7FC0000,0x3BFC0000, -0x7FC0000,0x1780001,0x1780001,0x1780001,0x1780001,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0x9DFC0000,0xBC000000,0xBC000000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0x9DFC0000,0xBC000000,0xBC000000,0x9DFC0000, -0x9DFC0000,0xBC000000,0xBC000000,0xBC000000,0x1B80000,0x1940000,0x1780001,0x9FC0000,0x5DFC0000,0x83FC0000,0x91FC0000,0xA5FC0000,0x3D80000,0x37FC0000,0x83FC0000,0xBC000000,0x83FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1345AA3,0xFF18414F,0xFEFC2CFE,0xFEF025C6,0xFEF43543,0xFED41B46,0xFEC41162,0xFEB81647,0xFAA80AB6,0xDCAC12ED,0xFEDC3B5B,0xFEAC1B25,0xFE980E98,0xFE88100A,0xFE680015,0xDE700938,0xFE64220B,0xEA4C0C2A,0xD24C0DC5,0xBC642208,0x1C85AA3,0xFE88351E,0xFE7025C5,0xFE3821ED,0xFE140B45,0xDC2812ED,0xFE082A18,0xEE000CC4,0xD4000C44,0xBA00228D,0x69FC5AA3, -0xD6003266,0xBE0027E1,0xAC00360C,0x9A005AA5,0xFF0C469D,0xFF2C536D,0xFF2C555E,0xFEEC313E,0xFEBC1BA5,0xFE9008D1,0xFE7C0184,0xF85C0071,0xFEF84551,0xFED02DDA,0xFE50093C,0xD4000C44,0x41FC5AA3,0x178220B,0xFF5C166E,0xFF400C63,0xFF380882,0xFF441505,0xFF20071A,0xFF0C015A,0xFEFC092D,0xF4EC0141,0xDCF40885,0x37FC2208,0xFEFC1086,0xFED80882,0xFEB80D86,0xFE740001, -0xDE940884,0x9DFC2208,0xF2000974,0xD6000A9D,0xBC002208,0x37FC2208,0xFEFC1086,0xFED80882,0xFEB80D86,0xFE740001,0xDE940884,0x9DFC2208,0xF2000974,0xD6000A9D,0xBC002208,0x9DFC2208,0xF2000974,0xD6000A9D,0xBC002208,0xBC002208,0xFF581C81,0xFF6C1FAD,0xFF6C1FDA,0xFF401614,0xFEFC0E52,0xFEC00532,0xFEA00022,0xFC40001E,0xFD581C9A,0xFF24153E,0xFE58092B,0xD6000A9D, 
-0x83FC2208,0xF025C6,0xF025C6,0xF025C6,0xF025C6,0xFEC41162,0xFEC41162,0xFEC41162,0xEEAC0A69,0xEEAC0A69,0xBCAC0A69,0xFE980E98,0xFE980E98,0xFE980E98,0xFE680015,0xFE680015,0xC27C01D5,0xCE640884,0xCE640884,0xB25C0104,0x9A640884,0x16825C5,0x16825C5,0x16825C5,0xFE140B45,0xFE140B45,0xBC480A69,0xE2000AD4,0xE2000AD4,0xBA000086,0x9A000885,0x39F825C5, -0x39F825C5,0x9C001115,0x8A0011FD,0x780025C5,0xFED81B58,0xFEEC212D,0xF025C6,0xFEBC1244,0xFE9C0952,0xFE7C0334,0xFE7C0184,0xEC60000C,0xFEBC1A8C,0xFEA41121,0xFE500893,0xBA000086,0x3FC25C5,0x1380882,0x1380882,0x1380882,0x1380882,0xFF0C015A,0xFF0C015A,0xFF0C015A,0xDCF40001,0xDCF40001,0xBCF40001,0x1D00882,0x1D00882,0x1D00882,0xF87C0001,0xF87C0001, -0xBCB40001,0x6DF80882,0x6DF80882,0xBA00000D,0x9A000884,0x1D00882,0x1D00882,0x1D00882,0xF87C0001,0xF87C0001,0xBCB40001,0x6DF80882,0x6DF80882,0xBA00000D,0x9A000884,0x6DF80882,0x6DF80882,0xBA00000D,0x9A000884,0x9A000884,0xFF200659,0xFF2C06D9,0x1380882,0xFD0404E4,0xFED4028D,0xFEB000E1,0xFEA00022,0xF0540001,0xFD100692,0xFEE80492,0x45FC0882,0xBA00000D, -0x45FC0882,0x1BC0882,0xFFA4053D,0xFF88019A,0xFF780001,0x9BFC0882,0xFF640335,0xFF380001,0xCFF80882,0xFE6C0000,0xDC000884,0x9BFC0882,0xFF640335,0xFF380001,0xCFF80882,0xFE6C0000,0xDC000884,0xCFF80882,0xFE6C0000,0xDC000884,0xDC000884,0x9BFC0882,0xFF640335,0xFF380001,0xCFF80882,0xFE6C0000,0xDC000884,0xCFF80882,0xFE6C0000,0xDC000884,0xDC000884,0xCFF80882, -0xFE6C0000,0xDC000884,0xDC000884,0xDC000884,0xFDB007C1,0x5D80882,0xF5B80802,0xFF94069A,0xFF68053D,0xFF1802A1,0xFEB80000,0xFE0C0000,0xFDA807C1,0xFF880665,0xFEF00049,0xDC000884,0xC1FC0882,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xAC0A69,0xFE680014,0xFE680014,0xFE680014,0xFE680014,0xFE680014, -0xFE680014,0x9E640000,0x9E640000,0x9E640000,0x7A640000,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0xAE00004A,0xAE00004A,0xAE00004A,0x7A1C0000,0x5F80A69,0x5F80A69,0x5F80A69,0x6C0002D5,0x56000A69,0xF8A00784,0xAC0A69,0xAC0A69,0xFE900455,0xFE800254,0xFE7800DD,0xFE7800DD,0xE4640000,0xFA880692,0xFE7803F5,0xC4500000,0xAE00004A, -0x16C0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table212[] = { -0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0xB00000,0x1640000, -0x1640000,0x1640000,0x1640000,0x3A000000,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x740001,0x67C0000,0x67C0000,0x67C0000,0xB00000,0xF80000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1080000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000, -0x1880000,0x49F80000,0x49F80000,0x49F80000,0x82000001,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x1880000,0x49F80000,0x49F80000,0x49F80000,0x82000001,0x49F80000,0x49F80000,0x49F80000,0x82000001,0x82000001,0x5180000,0x1080000,0x1080000,0x1340000,0x14C0000,0x1680000,0x1680000,0x1BC0000,0x1340000,0x14C0000,0x19FC0000,0x49F80000, -0x19FC0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xC4000001,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xC4000001,0xABF80000,0xABF80000,0xC4000001,0xC4000001,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xC4000001,0xABF80000,0xABF80000,0xC4000001,0xC4000001,0xABF80000, 
-0xABF80000,0xC4000001,0xC4000001,0xC4000001,0x1CC0000,0xBA40000,0x18C0000,0x2BFC0000,0x73FC0000,0x95FC0000,0x9FFC0000,0xB3F40000,0x1F00000,0x53FC0000,0x95FC0000,0xC4000001,0x95FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x14053CE,0xFF243DB6,0xFF082C15,0xFF0425C5,0xFF0C31DE,0xFEE41AFD,0xFEDC124D,0xFECC14B2,0xFCC00A76,0xE2C010C2,0xFEF436F6,0xFEC01AAD,0xFEB00F93,0xFE940E7F,0xFE800056,0xE488072F,0xFE7C1DBE,0xEC680AE9,0xD8640AE2,0xC2741D9B,0x3DC53CA,0xFEA03317,0xFE8825C6,0xFE641FF2,0xFE2C0BDE,0xE23C10C2,0xFE14260B,0xF4000A9F,0xDA000853,0xC2001DB3,0x73FC53CA, -0xDC002E39,0xCA0021E2,0xB2002F23,0xA00053CA,0xFF204186,0xFF2C4DB2,0xF9404F7E,0xFF002E66,0xFED01ABC,0xFEA4092A,0xFE900243,0xFA700023,0xFF10407E,0xFEDC2BAA,0xFE700975,0xDA000853,0x4FFC53CA,0x1841D9D,0xFF681408,0xFF580BBD,0xFF480884,0xFF501235,0xFF2C068C,0xFF2401C4,0xFF140731,0xF90000AD,0xE3080659,0x49FC1D9A,0xFF140F18,0xFEF00884,0xFECC0B85,0xFE940005, -0xE2AC0659,0xA7F81D9A,0xF80008AE,0xDC000795,0xC2001D9A,0x49FC1D9A,0xFF140F18,0xFEF00884,0xFECC0B85,0xFE940005,0xE2AC0659,0xA7F81D9A,0xF80008AE,0xDC000795,0xC2001D9A,0xA7F81D9A,0xF80008AE,0xDC000795,0xC2001D9A,0xC2001D9A,0xFF7418F9,0xF9801BAD,0xFB841BC5,0xFF40137A,0xFF100CD4,0xFEDC04E3,0xFEBC0059,0xFE5C0002,0xFF5C18D2,0xFF401296,0xFE7C0912,0xDC000795, -0x8FFC1D9A,0x10425C5,0x10425C5,0x10425C5,0x10425C5,0xFEDC124D,0xFEDC124D,0xFEDC124D,0xF6C00A6A,0xF6C00A6A,0xC4C00A6A,0xFEB00F93,0xFEB00F93,0xFEB00F93,0xFE800056,0xFE800056,0xCA8C01D6,0xD8740883,0xD8740883,0xBC700103,0xA4740883,0x38025C5,0x38025C5,0x38025C5,0xFE2C0BDE,0xFE2C0BDE,0xC45C0A6A,0xEE000A0B,0xEE000A0B,0xC2000023,0xA40C0883,0x45FC25C5, -0x45FC25C5,0xAC001022,0x960010EE,0x800025C6,0xFEE81BE6,0xF6FC21AD,0x10425C5,0xFECC134A,0xFEB00A91,0xFE98043A,0xFE900243,0xF674000B,0xFED01B16,0xFEB01226,0xFE6808D8,0xC2000023,0x15FC25C5,0x1480884,0x1480884,0x1480884,0x1480884,0xFF2401C4,0xFF2401C4,0xFF2401C4,0xE5080001,0xE5080001,0xC5080001,0x3E80882,0x3E80882,0x3E80882,0xFE940005,0xFE940005, -0xC4C80001,0x79FC0882,0x79FC0882,0xC4000001,0xA4000882,0x3E80882,0x3E80882,0x3E80882,0xFE940005,0xFE940005,0xC4C80001,0x79FC0882,0x79FC0882,0xC4000001,0xA4000882,0x79FC0882,0x79FC0882,0xC4000001,0xA4000882,0xA4000882,0xFB340694,0xFB440708,0x1480884,0xFF180519,0xFEF002F2,0xFEC80151,0xFEBC0059,0xFA640000,0xF92806CD,0xFF0404EA,0x57FC0882,0xC4000001, -0x57FC0882,0x1C40659,0xFFB003E8,0xFF940131,0xFF8C0000,0xA9FC0659,0xFF700262,0xFF540000,0xD5F80659,0xFEA00000,0xE2000659,0xA9FC0659,0xFF700262,0xFF540000,0xD5F80659,0xFEA00000,0xE2000659,0xD5F80659,0xFEA00000,0xE2000659,0xE2000659,0xA9FC0659,0xFF700262,0xFF540000,0xD5F80659,0xFEA00000,0xE2000659,0xD5F80659,0xFEA00000,0xE2000659,0xE2000659,0xD5F80659, -0xFEA00000,0xE2000659,0xE2000659,0xE2000659,0xFFB405BA,0x1E40659,0xFBC405E9,0xFFA004F4,0xFF7C03E8,0xFF2C01F9,0xFEE40000,0xFE500000,0xFFAC05C4,0xFF9404C9,0xFF100032,0xE2000659,0xC9FC0659,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xC00A69,0xFE800032,0xFE800032,0xFE800032,0xFE800032,0xFE800032, 
-0xFE800032,0xA6780001,0xA6780001,0xA6780001,0x82740002,0x11C0A69,0x11C0A69,0x11C0A69,0x11C0A69,0x11C0A69,0x11C0A69,0xC0000014,0xC0000014,0xC0000014,0x82300001,0x11FC0A69,0x11FC0A69,0x11FC0A69,0x7800025D,0x5E000A69,0xFEAC07A2,0xC00A69,0xC00A69,0xFEA00492,0xFE94029A,0xFE880115,0xFE880115,0xEC780001,0xFE9006C4,0xFC900451,0xCC640001,0xC0000014, -0x1940A69,}; -static const uint32_t g_etc1_to_bc7_m6_table213[] = { -0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x2C40000,0x1940000, -0x1940000,0x1940000,0x1940000,0x42000000,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0x840001,0xE8C0000,0xE8C0000,0xE8C0000,0x2C40000,0x11C0000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1180000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000, -0x1A00000,0x55F80000,0x55F80000,0x55F80000,0x8A000001,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x55F80000,0x55F80000,0x55F80000,0x8A000001,0x55F80000,0x55F80000,0x55F80000,0x8A000001,0x8A000001,0xD280000,0x1180000,0x1180000,0x3440000,0x1600000,0x17C0000,0x17C0000,0x3D40000,0x3440000,0x1600000,0x27FC0000,0x55F80000, -0x27FC0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0xB7F80000,0xB7F80000,0xCC000001,0x6BFC0000,0x6BFC0000,0x6BFC0000,0xB7F80000,0xB7F80000,0xCC000001,0xB7F80000,0xB7F80000,0xCC000001,0xCC000001,0x6BFC0000,0x6BFC0000,0x6BFC0000,0xB7F80000,0xB7F80000,0xCC000001,0xB7F80000,0xB7F80000,0xCC000001,0xCC000001,0xB7F80000, -0xB7F80000,0xCC000001,0xCC000001,0xCC000001,0x1E00000,0x1B80000,0x19C0000,0x49FC0000,0x87FC0000,0xA3FC0000,0xADFC0000,0xBDF80000,0x11FC0000,0x6BFC0000,0xA3FC0000,0xCC000001,0xA3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x14C4E1E,0xFF303AC6,0xFF202B4D,0xFF1425C5,0xFF182F0A,0xFEF81ADB,0xFEE81331,0xFED81392,0xFED00A6A,0xE6D00F1A,0xFF003356,0xFED81A55,0xFEC41065,0xFEAC0D5F,0xFE9400C6,0xE898059E,0xFE941A6E,0xF07C0A01,0xDC7808A6,0xC8841A07,0x1F04E1A,0xFEB8315F,0xFEA025C6,0xFE7C1E4A,0xFE4C0C89,0xE6500F1A,0xFE2C231B,0xFA000993,0xE0000597,0xC8041A06,0x7DF84E1A, -0xE0002B5E,0xD0001D1E,0xB8002977,0xA6004E1A,0xFF343DC5,0xFD48484A,0xFF4C4A16,0xFF082C34,0xFEE01A1E,0xFEB809A4,0xFEA4032A,0xFE84000B,0xFF183CC9,0xFEF429C3,0xFE7C09C3,0xE0000597,0x5BFC4E1A,0x1901A05,0xFF741218,0xFF640B25,0xFF580884,0xFF5C1001,0xFF400632,0xFF300220,0xFF2005C9,0xFB100051,0xE71804B1,0x59FC1A05,0xFF200DF4,0xFF080884,0xFEE409DD,0xFEAC0025, -0xE6C004B1,0xADFC1A05,0xFE080882,0xE0000566,0xC8001A06,0x59FC1A05,0xFF200DF4,0xFF080884,0xFEE409DD,0xFEAC0025,0xE6C004B1,0xADFC1A05,0xFE080882,0xE0000566,0xC8001A06,0xADFC1A05,0xFE080882,0xE0000566,0xC8001A06,0xC8001A06,0xFF7815DA,0xFF8C182D,0xFF8C185D,0xFF5C114D,0xFF280B9E,0xFEF0049D,0xFED000A0,0xFE7C0004,0xFD74161E,0xFF481074,0xFEA008FB,0xE0000566, 
-0x99FC1A05,0x11425C5,0x11425C5,0x11425C5,0x11425C5,0xFEE81331,0xFEE81331,0xFEE81331,0xFED00A6A,0xFED00A6A,0xCCD00A6A,0xFEC41065,0xFEC41065,0xFEC41065,0xFE9400C6,0xFE9400C6,0xD29C01D6,0xE0840883,0xE0840883,0xC4800103,0xAC840883,0x39825C5,0x39825C5,0x39825C5,0xFE4C0C89,0xFE4C0C89,0xCC6C0A6A,0xFA000983,0xFA000983,0xCE000003,0xAC1C0883,0x51FC25C5, -0x51FC25C5,0xB2000F62,0x9C001026,0x880025C6,0xFD001CA9,0xFF0C21AD,0x11425C5,0xFEDC144E,0xFEC40BB5,0xFEAC053E,0xFEA4032A,0xFE84000B,0xFEE41BE5,0xFED012ED,0xFE7C0933,0xCE000003,0x23FC25C5,0x1580884,0x1580884,0x1580884,0x1580884,0xFF300220,0xFF300220,0xFF300220,0xED180001,0xED180001,0xCD180001,0x7FC0882,0x7FC0882,0x7FC0882,0xFEAC0025,0xFEAC0025, -0xCCD80001,0x85FC0882,0x85FC0882,0xCC0C0001,0xAC000882,0x7FC0882,0x7FC0882,0x7FC0882,0xFEAC0025,0xFEAC0025,0xCCD80001,0x85FC0882,0x85FC0882,0xCC0C0001,0xAC000882,0x85FC0882,0x85FC0882,0xCC0C0001,0xAC000882,0xAC000882,0xFF3C06C4,0xFF4C0728,0x1580884,0xFF24057A,0xFF040352,0xFEE4019A,0xFED000A0,0xFE7C0004,0xFF3406D1,0xFF180521,0x65FC0882,0xCC0C0001, -0x65FC0882,0x1CC04B1,0xFFBC02E4,0xFFA000E5,0xFF9C0000,0xB5FC04B1,0xFF8801BA,0xFF6C0000,0xDBF804B1,0xFED40000,0xE60004B1,0xB5FC04B1,0xFF8801BA,0xFF6C0000,0xDBF804B1,0xFED40000,0xE60004B1,0xDBF804B1,0xFED40000,0xE60004B1,0xE60004B1,0xB5FC04B1,0xFF8801BA,0xFF6C0000,0xDBF804B1,0xFED40000,0xE60004B1,0xDBF804B1,0xFED40000,0xE60004B1,0xE60004B1,0xDBF804B1, -0xFED40000,0xE60004B1,0xE60004B1,0xE60004B1,0xF9C40451,0x1EC04B1,0xFFCC0451,0xFFB003A1,0xFF9002E4,0xFF480168,0xFF0C0000,0xFE8C0000,0xF3C40451,0xFFA00392,0xFF300025,0xE60004B1,0xD1FC04B1,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xD00A69,0xFE900059,0xFE900059,0xFE900059,0xFE900059,0xFE900059, -0xFE900059,0xAE880001,0xAE880001,0xAE880001,0x8A840002,0x1340A69,0x1340A69,0x1340A69,0x1340A69,0x1340A69,0x1340A69,0xCE000002,0xCE000002,0xCE000002,0x8A400001,0x1DFC0A69,0x1DFC0A69,0x1DFC0A69,0x7E0001ED,0x66000A69,0xFAC407C1,0xD00A69,0xD00A69,0xFCB804E2,0xFEA802DA,0xFE980151,0xFE980151,0xF4880001,0xFEAC0708,0xFEA00480,0xD4740001,0xCE000002, -0x1B40A69,}; -static const uint32_t g_etc1_to_bc7_m6_table214[] = { -0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x2DC0000,0x1C40000, -0x1C40000,0x1C40000,0x1C40000,0x4A000000,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0x940001,0xA00000,0xA00000,0xA00000,0x2DC0000,0x13C0000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000, -0x1B80000,0x61F80000,0x61F80000,0x61F80000,0x92000001,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x61F80000,0x61F80000,0x61F80000,0x92000001,0x61F80000,0x61F80000,0x61F80000,0x92000001,0x92000001,0x13C0000,0x1280000,0x1280000,0x1580000,0x1740000,0x3900000,0x3900000,0x1F00000,0x1580000,0x1740000,0x37FC0000,0x61F80000, -0x37FC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x83FC0000,0x83FC0000,0x83FC0000,0xC3F80000,0xC3F80000,0xD4000001,0x83FC0000,0x83FC0000,0x83FC0000,0xC3F80000,0xC3F80000,0xD4000001,0xC3F80000,0xC3F80000,0xD4000001,0xD4000001,0x83FC0000,0x83FC0000,0x83FC0000,0xC3F80000,0xC3F80000,0xD4000001,0xC3F80000,0xC3F80000,0xD4000001,0xD4000001,0xC3F80000, 
-0xC3F80000,0xD4000001,0xD4000001,0xD4000001,0x3F00000,0x1C80000,0x1AC0000,0x67FC0000,0x9BFC0000,0xB3FC0000,0xBBFC0000,0xC7FC0000,0x37FC0000,0x83FC0000,0xB3FC0000,0xD4000001,0xB3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x15848CE,0xFF4437E9,0xFF2C2A81,0xFF2425C5,0xFF242C86,0xFF081AD5,0xFF001419,0xFEF01292,0xFEE00A8D,0xEAE00DB2,0xFF0C301E,0xFEE41A29,0xFEDC116D,0xFEB80CB3,0xFEAC016E,0xEAA8045E,0xFEA0178E,0xF4900951,0xE088069F,0xCC9416C7,0x7FC48CA,0xFECC3001,0xFEB825C6,0xFE941CE2,0xFE640D49,0xEA640DB2,0xFE3820BF,0xFE08097B,0xE6000393,0xCC1816C6,0x85FC48CA, -0xE600290A,0xD60018EA,0xBE002443,0xAC0048CA,0xFF3C39F6,0xFF4C436A,0xFF4C4586,0xFF182A0A,0xFEF419AA,0xFEC80A63,0xFEBC0422,0xFE9C0031,0xFF34391E,0xFF0027C6,0xFE940A51,0xE6000393,0x65FC48CA,0x19C16C5,0xFF801058,0xFF700AA5,0xFF680884,0xFF740E01,0xFF5405F4,0xFF44027D,0xFF380491,0xFD240018,0xEB280349,0x6BFC16C5,0xFF380CE4,0xFF200884,0xFEFC0875,0xFECC0061, -0xEAD40349,0xB7F816C5,0xFE380882,0xE6000392,0xCC0016C6,0x6BFC16C5,0xFF380CE4,0xFF200884,0xFEFC0875,0xFECC0061,0xEAD40349,0xB7F816C5,0xFE380882,0xE6000392,0xCC0016C6,0xB7F816C5,0xFE380882,0xE6000392,0xCC0016C6,0xCC0016C6,0xFF80136D,0xFF8C154D,0xF59815A4,0xFF6C0F72,0xFF480A86,0xFF040496,0xFEE800FA,0xFEA00022,0xFF781352,0xFF5C0EC6,0xFED008E5,0xE6000392, -0xA3FC16C5,0x12425C5,0x12425C5,0x12425C5,0x12425C5,0xFF001419,0xFF001419,0xFF001419,0xFEE00A8D,0xFEE00A8D,0xD4E00A6A,0xFEDC116D,0xFEDC116D,0xFEDC116D,0xFEAC016E,0xFEAC016E,0xDAAC01D6,0xE8940883,0xE8940883,0xCC900103,0xB4940883,0x3B025C5,0x3B025C5,0x3B025C5,0xFE640D49,0xFE640D49,0xD47C0A6A,0xFE08097B,0xFE08097B,0xD6100003,0xB42C0883,0x5DFC25C5, -0x5DFC25C5,0xBE000E9A,0xA6000F63,0x900025C6,0xFF101D2B,0xF920222A,0x12425C5,0xFEF81556,0xFED80CE9,0xFEBC0666,0xFEBC0422,0xFE9C0031,0xFEF81C56,0xFEDC1421,0xFE9409D8,0xD6100003,0x33FC25C5,0x1680884,0x1680884,0x1680884,0x1680884,0xFF44027D,0xFF44027D,0xFF44027D,0xF5280001,0xF5280001,0xD5280001,0x1FFC0882,0x1FFC0882,0x1FFC0882,0xFECC0061,0xFECC0061, -0xD4E80001,0x91FC0882,0x91FC0882,0xD41C0001,0xB4000882,0x1FFC0882,0x1FFC0882,0x1FFC0882,0xFECC0061,0xFECC0061,0xD4E80001,0x91FC0882,0x91FC0882,0xD41C0001,0xB4000882,0x91FC0882,0x91FC0882,0xD41C0001,0xB4000882,0xB4000882,0xFF5806CD,0xFB640745,0x1680884,0xFF4405B2,0xFF1803BA,0xFEFC01F9,0xFEE800FA,0xFEA00022,0xFD4C0708,0xFF300581,0x75FC0882,0xD41C0001, -0x75FC0882,0x1D40349,0xFFC401F9,0xFFB8009D,0xFFAC0000,0xC1FC0349,0xFFA00132,0xFF840000,0xE1F80349,0xFF040000,0xEA000349,0xC1FC0349,0xFFA00132,0xFF840000,0xE1F80349,0xFF040000,0xEA000349,0xE1F80349,0xFF040000,0xEA000349,0xEA000349,0xC1FC0349,0xFFA00132,0xFF840000,0xE1F80349,0xFF040000,0xEA000349,0xE1F80349,0xFF040000,0xEA000349,0xEA000349,0xE1F80349, -0xFF040000,0xEA000349,0xEA000349,0xEA000349,0xFDCC02F9,0x1F40349,0xFFCC0311,0xFFC00288,0xFFA80202,0xFF700104,0xFF340000,0xFEC80000,0xF7CC02F9,0xFDBC0288,0xFF500019,0xEA000349,0xD9FC0349,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xE00A69,0xFEA40082,0xFEA40082,0xFEA40082,0xFEA40082,0xFEA40082, 
-0xFEA40082,0xB6980001,0xB6980001,0xB6980001,0x92940002,0x1480A69,0x1480A69,0x1480A69,0x1480A69,0x1480A69,0x1480A69,0xD80C0000,0xD80C0000,0xD80C0000,0x92500001,0x29FC0A69,0x29FC0A69,0x29FC0A69,0x8A000195,0x6E000A69,0xFECC07E9,0xE00A69,0xE00A69,0xFCC80515,0xFEB40321,0xFEAC019A,0xFEAC019A,0xFC980001,0xFEBC070A,0xFEB404B1,0xDC840001,0xD80C0000, -0x1D80A69,}; -static const uint32_t g_etc1_to_bc7_m6_table215[] = { -0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x2F40000,0x1F40000, -0x1F40000,0x1F40000,0x1F40000,0x52000000,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xA40001,0xB00000,0xB00000,0xB00000,0x2F40000,0x1600000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1380000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000, -0x1D00000,0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x1D00000,0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x6DF80000,0x6DF80000,0x6DF80000,0x9A000001,0x9A000001,0x14C0000,0x1380000,0x1380000,0x7680000,0x1880000,0x1A80000,0x1A80000,0xDFC0000,0x7680000,0x1880000,0x45FC0000,0x6DF80000, -0x45FC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xDC000001,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xDC000001,0xCFF80000,0xCFF80000,0xDC000001,0xDC000001,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xDC000001,0xCFF80000,0xCFF80000,0xDC000001,0xDC000001,0xCFF80000, -0xCFF80000,0xDC000001,0xDC000001,0xDC000001,0x17FC0000,0x5D80000,0x1BC0000,0x85FC0000,0xAFFC0000,0xC1FC0000,0xC9F80000,0xD3F80000,0x5FFC0000,0x9BFC0000,0xC1FC0000,0xDC000001,0xC1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x16443DE,0xFF503551,0xFF3829D5,0xFF3425C5,0xFF302A52,0xFF181AE5,0xFF0C150D,0xFEFC11F2,0xFEF00AE2,0xEEF00C8A,0xFF242D36,0xFEFC1A11,0xFEE81275,0xFED80C52,0xFEC00252,0xEEBC035E,0xFEB8151E,0xF8A408D9,0xE69404F3,0xD2A413DB,0x19FC43DA,0xFEE42E81,0xFED025C5,0xFEAC1BBA,0xFE7C0E29,0xEE780C8A,0xFE641E91,0xFE200A1B,0xEC0C023F,0xD22813DA,0x8FF843DA, -0xF0002759,0xDC001546,0xC4001F87,0xB20043DA,0xFF4C36D3,0xF9603F4E,0xFB6440F5,0xFF2C286E,0xFF04195C,0xFEDC0B1C,0xFED00559,0xFEB00092,0xFF3435BE,0xFF102642,0xFEB00AE7,0xEC0C023F,0x71FC43DA,0x1A413DD,0xFF8C0EC8,0xFF7C0A3D,0xFF780884,0xFF800C45,0xFF6405BE,0xFF5C02FD,0xFF4C03BD,0xFF380001,0xEF380221,0x7BFC13DA,0xFF4C0C13,0xFF380884,0xFF14074D,0xFEE400B9, -0xEEE80221,0xBFF813DA,0xFE6C0882,0xEC00022E,0xD20013DA,0x7BFC13DA,0xFF4C0C13,0xFF380884,0xFF14074D,0xFEE400B9,0xEEE80221,0xBFF813DA,0xFE6C0882,0xEC00022E,0xD20013DA,0xBFF813DA,0xFE6C0882,0xEC00022E,0xD20013DA,0xD20013DA,0xFF9410EE,0xF9A01299,0xFBA412D8,0xFF780DD1,0xFF5C09A6,0xFF240491,0xFF0C0164,0xFEB80068,0xFF8810FD,0xFF680D52,0xFEE408D3,0xEC00022E, 
-0xADFC13DA,0x13425C5,0x13425C5,0x13425C5,0x13425C5,0xFF0C150D,0xFF0C150D,0xFF0C150D,0xFEF00AE2,0xFEF00AE2,0xDCF00A6A,0xFEE81275,0xFEE81275,0xFEE81275,0xFEC00252,0xFEC00252,0xE2BC01D6,0xF0A40883,0xF0A40883,0xD4A00103,0xBCA40883,0x1C825C5,0x1C825C5,0x1C825C5,0xFE7C0E29,0xFE7C0E29,0xDC8C0A6A,0xFE200A1B,0xFE200A1B,0xDE200003,0xBC3C0883,0x69FC25C5, -0x69FC25C5,0xCA000DF2,0xB2000EA3,0x980025C6,0xFF201DA6,0xFF2C222E,0x13425C5,0xFF081643,0xFEEC0E2D,0xFED407CE,0xFED00559,0xFEB00092,0xFF141D18,0xFEF41512,0xFEB00A83,0xDE200003,0x41FC25C5,0x1780884,0x1780884,0x1780884,0x1780884,0xFF5C02FD,0xFF5C02FD,0xFF5C02FD,0xFD380001,0xFD380001,0xDD380001,0x37FC0882,0x37FC0882,0x37FC0882,0xFEE400B9,0xFEE400B9, -0xDCF80001,0x9DFC0882,0x9DFC0882,0xDC2C0001,0xBC000882,0x37FC0882,0x37FC0882,0x37FC0882,0xFEE400B9,0xFEE400B9,0xDCF80001,0x9DFC0882,0x9DFC0882,0xDC2C0001,0xBC000882,0x9DFC0882,0x9DFC0882,0xDC2C0001,0xBC000882,0xBC000882,0xFB6C0708,0xFF6C076D,0x1780884,0xFD5805E9,0xFF400422,0xFF20028A,0xFF0C0164,0xFEB80068,0xF5640745,0xFF4805B4,0x83FC0882,0xDC2C0001, -0x83FC0882,0x1DC0221,0xFFD00145,0xFFC00068,0xFFBC0000,0xCDFC0221,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xFF340000,0xEE000221,0xCDFC0221,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xFF340000,0xEE000221,0xE7F80221,0xFF340000,0xEE000221,0xEE000221,0xCDFC0221,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xFF340000,0xEE000221,0xE7F80221,0xFF340000,0xEE000221,0xEE000221,0xE7F80221, -0xFF340000,0xEE000221,0xEE000221,0xEE000221,0xFFD001ED,0x1FC0221,0xF7DC0200,0xFFC001A8,0xFFA80152,0xFF8400A0,0xFF5C0000,0xFF040000,0xFBD401E1,0xFFC00190,0xFF700010,0xEE000221,0xDFFC0221,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xF00A69,0xFEB400C1,0xFEB400C1,0xFEB400C1,0xFEB400C1,0xFEB400C1, -0xFEB400C1,0xBEA80001,0xBEA80001,0xBEA80001,0x9AA40002,0x1600A69,0x1600A69,0x1600A69,0x1600A69,0x1600A69,0x1600A69,0xE01C0000,0xE01C0000,0xE01C0000,0x9A600001,0x35FC0A69,0x35FC0A69,0x35FC0A69,0x9000013D,0x76000A69,0xFAE40802,0xF00A69,0xF00A69,0xFED8054A,0xFED00372,0xFEC401E1,0xFEC401E1,0xFCA8000A,0xFAD00745,0xFEBC0502,0xE4940001,0xE01C0000, -0x1F80A69,}; -static const uint32_t g_etc1_to_bc7_m6_table216[] = { -0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0x1100000,0xDFC0000, -0xDFC0000,0xDFC0000,0xDFC0000,0x5A000001,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xB80000,0xC40000,0xC40000,0xC40000,0x1100000,0x1880000,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x1480001,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000, -0x3E80000,0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x3E80000,0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0x79FC0000,0x79FC0000,0x79FC0000,0xA4000000,0xA4000000,0x1600000,0x1480001,0x1480001,0x1800000,0x39C0000,0x1C00000,0x1C00000,0x23FC0000,0x1800000,0x39C0000,0x57FC0000,0x79FC0000, -0x57FC0000,0x1CC0001,0x1CC0001,0x1CC0001,0x1CC0001,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xE6000000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xE6000000,0xDDF40000,0xDDF40000,0xE6000000,0xE6000000,0xB7FC0000,0xB7FC0000,0xB7FC0000,0xDDF40000,0xDDF40000,0xE6000000,0xDDF40000,0xDDF40000,0xE6000000,0xE6000000,0xDDF40000, 
-0xDDF40000,0xE6000000,0xE6000000,0xE6000000,0x57FC0000,0x1EC0000,0x1CC0001,0xA7FC0000,0xC5FC0000,0xD3FC0000,0xD7FC0000,0xDFF80000,0x8BFC0000,0xB7FC0000,0xD3FC0000,0xE6000000,0xD3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1743EBF,0xFF5C32AA,0xFF4C292F,0xFF4425C6,0xFF4427D5,0xFF301B26,0xFF24162E,0xFF141185,0xFF080B81,0xF3000B89,0xFF302A4B,0xFF101A25,0xFF0013B2,0xFEE40C25,0xFED80391,0xF6CC0285,0xFECC130C,0xFCB8088E,0xEAA80370,0xD8B810F8,0x2FFC3EBF,0xFEFC2CFE,0xFEEC25C5,0xFEB81ACB,0xFEA00F3A,0xF2900B89,0xFE881C84,0xFE380B28,0xF0200130,0xD83C10F9,0x99FC3EBF, -0xF8002621,0xE0001205,0xD0001AB8,0xB8003EC1,0xFF583335,0xFF6C3A65,0xFF6C3C36,0xFF4026D4,0xFF181935,0xFEF00C2B,0xFEE806D0,0xFEC00164,0xFF50326B,0xFF3024FE,0xFEBC0BFE,0xF0200130,0x7FF83EBF,0x1B010FB,0xFFA40D26,0xFF9409C3,0xFF8C0882,0xFF8C0AA9,0xFF7405AE,0xFF680385,0xFF580313,0xFF4C001A,0xF3480121,0x8DFC10F8,0xFF640B25,0xFF540882,0xFF2C0651,0xFF080131, -0xF2FC0121,0xC7FC10F8,0xFEA00882,0xF2080120,0xD80010F8,0x8DFC10F8,0xFF640B25,0xFF540882,0xFF2C0651,0xFF080131,0xF2FC0121,0xC7FC10F8,0xFEA00882,0xF2080120,0xD80010F8,0xC7FC10F8,0xFEA00882,0xF2080120,0xD80010F8,0xD80010F8,0xFFA00E9D,0xFFAC0FC9,0xFFAC1026,0xFF880C23,0xFF6808C1,0xFF380495,0xFF2801E2,0xFEE400CD,0xFF980EBE,0xFF840BB2,0xFF1008B5,0xF2080120, -0xB9FC10F8,0x14425C6,0x14425C6,0x14425C6,0x14425C6,0xFF24162E,0xFF24162E,0xFF24162E,0xFF080B81,0xFF080B81,0xE7000A69,0xFF0013B2,0xFF0013B2,0xFF0013B2,0xFED80391,0xFED80391,0xECD001D5,0xF8B80884,0xF8B80884,0xDCB00104,0xC4B80884,0x1E425C5,0x1E425C5,0x1E425C5,0xFEA00F3A,0xFEA00F3A,0xE69C0A69,0xFE380B28,0xFE380B28,0xE6340002,0xC4500884,0x77F825C5, -0x77F825C5,0xD6000D49,0xB8000DD4,0xA20025C5,0xFF2C1E68,0xF94022AA,0x14425C6,0xFF181761,0xFF000FB2,0xFEF00952,0xFEE806D0,0xFEC00164,0xFF201DB2,0xFF101682,0xFEBC0B9A,0xE6340002,0x53FC25C5,0x18C0882,0x18C0882,0x18C0882,0x18C0882,0xFF680385,0xFF680385,0xFF680385,0xFF4C001A,0xFF4C001A,0xE7480001,0x53FC0882,0x53FC0882,0x53FC0882,0xFF080131,0xFF080131, -0xE7080001,0xABF80882,0xABF80882,0xE63C0000,0xC4000884,0x53FC0882,0x53FC0882,0x53FC0882,0xFF080131,0xFF080131,0xE7080001,0xABF80882,0xABF80882,0xE63C0000,0xC4000884,0xABF80882,0xABF80882,0xE63C0000,0xC4000884,0xC4000884,0xFF74073A,0xFD880782,0x18C0882,0xFB700659,0xFF540488,0xFF300304,0xFF2801E2,0xFEE400CD,0xFD740745,0xFF5C05F5,0x95FC0882,0xE63C0000, -0x95FC0882,0x1E40122,0xFFDC00AA,0xFFD4003A,0xFFCC0001,0xDBFC0120,0xFFC0006D,0xFFB80000,0xEDFC0120,0xFF6C0000,0xF2000120,0xDBFC0120,0xFFC0006D,0xFFB80000,0xEDFC0120,0xFF6C0000,0xF2000120,0xEDFC0120,0xFF6C0000,0xF2000120,0xF2000120,0xDBFC0120,0xFFC0006D,0xFFB80000,0xEDFC0120,0xFF6C0000,0xF2000120,0xEDFC0120,0xFF6C0000,0xF2000120,0xF2000120,0xEDFC0120, -0xFF6C0000,0xF2000120,0xF2000120,0xF2000120,0xF5E40109,0x47FC0120,0xFBE40109,0xFFD800DD,0xFFCC00B5,0xFFAC0059,0xFF880000,0xFF480000,0xFFDC00F4,0xFFCC00DA,0xFF980009,0xF2000120,0xE9FC0120,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0x1000A69,0xFEC80104,0xFEC80104,0xFEC80104,0xFEC80104,0xFEC80104, 
-0xFEC80104,0xC8B80000,0xC8B80000,0xC8B80000,0xA4B80000,0x17C0A69,0x17C0A69,0x17C0A69,0x17C0A69,0x17C0A69,0x17C0A69,0xEA2C0000,0xEA2C0000,0xEA2C0000,0xA4700000,0x43F80A69,0x43F80A69,0x43F80A69,0x9C0000E9,0x80000A69,0xF4F80841,0x1000A69,0x1000A69,0xFEE80589,0xFEE403C8,0xFED4022D,0xFED4022D,0xFEC00020,0xFED80781,0xFED0053D,0xEEA40000,0xEA2C0000, -0x11FC0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table217[] = { -0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x1280000,0x19FC0000, -0x19FC0000,0x19FC0000,0x19FC0000,0x62000001,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0x2D40000,0x2D40000,0x2D40000,0x1280000,0x1A80000,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x1580001,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000, -0x7FC0000,0x85FC0000,0x85FC0000,0x85FC0000,0xAC000000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x7FC0000,0x85FC0000,0x85FC0000,0x85FC0000,0xAC000000,0x85FC0000,0x85FC0000,0x85FC0000,0xAC000000,0xAC000000,0x1700000,0x1580001,0x1580001,0x5900000,0x3B00000,0x3D40000,0x3D40000,0x37FC0000,0x5900000,0x3B00000,0x65FC0000,0x85FC0000, -0x65FC0000,0x1DC0001,0x1DC0001,0x1DC0001,0x1DC0001,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xE7FC0000,0xE7FC0000,0xEE000000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xE7FC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xE7FC0000,0xEE000000,0xEE000000,0xCFFC0000,0xCFFC0000,0xCFFC0000,0xE7FC0000,0xE7FC0000,0xEE000000,0xE7FC0000,0xE7FC0000,0xEE000000,0xEE000000,0xE7FC0000, -0xE7FC0000,0xEE000000,0xEE000000,0xEE000000,0x91FC0000,0x7FC0000,0x1DC0001,0xC5FC0000,0xD9FC0000,0xE1FC0000,0xE5FC0000,0xE9FC0000,0xB3FC0000,0xCFFC0000,0xE1FC0000,0xEE000000,0xE1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1803A9B,0xFF68308E,0xFF5828A3,0xFF5425C6,0xFF502631,0xFF3C1B66,0xFF301742,0xFF201179,0xFF140C51,0xF7100AE9,0xFF4427DC,0xFF241A56,0xFF1814E2,0xFEFC0C45,0xFEE404F1,0xF8DC020D,0xFEE4119C,0xFECC0888,0xEEBC0268,0xDEC80EC4,0x41FC3A9B,0xFF142BBE,0xFF0425C5,0xFED81A61,0xFEB81052,0xF6A40AE9,0xFEA01B0C,0xFE640C2B,0xF6340082,0xDE500EC4,0xA1FC3A9B, -0xFE0025C5,0xE6000F95,0xD60016F4,0xBE003A9D,0xFF643075,0xFF6C3735,0xF77C38AF,0xFF442569,0xFF281942,0xFEFC0D81,0xFEF8085D,0xFED8026A,0xFF542FDE,0xFF4023EC,0xFEDC0D0E,0xF6340082,0x89FC3A9B,0x1BC0EC3,0xFFB00BEE,0xFFA0096B,0xFF9C0882,0xFF980989,0xFF8405BA,0xFF80040D,0xFF7002BB,0xFF64006A,0xF7580081,0x9BFC0EC3,0xFF7C0A6D,0xFF6C0882,0xFF4C05D6,0xFF2C01BD, -0xF7100081,0xCFF80EC3,0xFED40882,0xF6280080,0xDE000EC4,0x9BFC0EC3,0xFF7C0A6D,0xFF6C0882,0xFF4C05D6,0xFF2C01BD,0xF7100081,0xCFF80EC3,0xFED40882,0xF6280080,0xDE000EC4,0xCFF80EC3,0xFED40882,0xF6280080,0xDE000EC4,0xDE000EC4,0xFFB40CEA,0xF5B80E03,0xF7BC0E43,0xFF980AE1,0xFF7C0839,0xFF5804EB,0xFF3C028A,0xFEFC015D,0xFFA40D0A,0xFF900AA6,0xFF3008A8,0xF6280080, 
-0xC1FC0EC3,0x15425C6,0x15425C6,0x15425C6,0x15425C6,0xFF301742,0xFF301742,0xFF301742,0xFF140C51,0xFF140C51,0xEF100A69,0xFF1814E2,0xFF1814E2,0xFF1814E2,0xFEE404F1,0xFEE404F1,0xF4E001D5,0xFECC0888,0xFECC0888,0xE4C00104,0xCCC80884,0x1FC25C5,0x1FC25C5,0x1FC25C5,0xFEB81052,0xFEB81052,0xEEAC0A69,0xFE640C2B,0xFE640C2B,0xEE440002,0xCC600884,0x83F825C5, -0x83F825C5,0xDC000CB5,0xC4000D24,0xAA0025C5,0xFF3C1EFE,0xFF4C22BE,0x15425C6,0xFF2C188D,0xFF141116,0xFEFC0ADD,0xFEF8085D,0xFED8026A,0xFF341E29,0xFF18178E,0xFEDC0CBD,0xEE440002,0x61FC25C5,0x19C0882,0x19C0882,0x19C0882,0x19C0882,0xFF80040D,0xFF80040D,0xFF80040D,0xFF64006A,0xFF64006A,0xEF580001,0x6BFC0882,0x6BFC0882,0x6BFC0882,0xFF2C01BD,0xFF2C01BD, -0xEF180001,0xB7F80882,0xB7F80882,0xEE4C0000,0xCC000884,0x6BFC0882,0x6BFC0882,0x6BFC0882,0xFF2C01BD,0xFF2C01BD,0xEF180001,0xB7F80882,0xB7F80882,0xEE4C0000,0xCC000884,0xB7F80882,0xB7F80882,0xEE4C0000,0xCC000884,0xCC000884,0xFF900745,0xF59807C1,0x19C0882,0xFF780681,0xFF700515,0xFF5003B5,0xFF3C028A,0xFEFC015D,0xFD880782,0xFF74065D,0xA3FC0882,0xEE4C0000, -0xA3FC0882,0x1EC0082,0xFFE8004A,0xFFE40019,0xFFDC0001,0xE9FC0080,0xFFD8002D,0xFFCC0001,0xF3FC0080,0xFF9C0000,0xF6000080,0xE9FC0080,0xFFD8002D,0xFFCC0001,0xF3FC0080,0xFF9C0000,0xF6000080,0xF3FC0080,0xFF9C0000,0xF6000080,0xF6000080,0xE9FC0080,0xFFD8002D,0xFFCC0001,0xF3FC0080,0xFF9C0000,0xF6000080,0xF3FC0080,0xFF9C0000,0xF6000080,0xF6000080,0xF3FC0080, -0xFF9C0000,0xF6000080,0xF6000080,0xF6000080,0xF9EC0071,0x87FC0080,0xFFEC0071,0xFDE80062,0xFDE00055,0xFFC80022,0xFFB00000,0xFF840000,0xFDEC0071,0xFFE40062,0xFFB80004,0xF6000080,0xF1FC0080,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0x1100A69,0xFEDC0145,0xFEDC0145,0xFEDC0145,0xFEDC0145,0xFEDC0145, -0xFEDC0145,0xD0C80000,0xD0C80000,0xD0C80000,0xACC80000,0x1940A69,0x1940A69,0x1940A69,0x1940A69,0x1940A69,0x1940A69,0xF23C0000,0xF23C0000,0xF23C0000,0xAC800000,0x4FF80A69,0x4FF80A69,0x4FF80A69,0xA60000B4,0x88000A69,0xFD080841,0x1100A69,0x1100A69,0xFEF405E4,0xFEEC041D,0xFEE80290,0xFEE80290,0xFED40048,0xFEF40784,0xFEE40595,0xF6B40000,0xF23C0000, -0x1FFC0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table218[] = { -0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x1400000,0x25F80000, -0x25F80000,0x25F80000,0x25F80000,0x6A000001,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xD80000,0xAE40000,0xAE40000,0xAE40000,0x1400000,0x1CC0000,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1680001,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000, -0x1FFC0000,0x91FC0000,0x91FC0000,0x91FC0000,0xB4000000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x1FFC0000,0x91FC0000,0x91FC0000,0x91FC0000,0xB4000000,0x91FC0000,0x91FC0000,0x91FC0000,0xB4000000,0xB4000000,0x9800000,0x1680001,0x1680001,0x1A40000,0x3C40000,0x1EC0000,0x1EC0000,0x4BFC0000,0x1A40000,0x3C40000,0x75FC0000,0x91FC0000, -0x75FC0000,0x1EC0001,0x1EC0001,0x1EC0001,0x1EC0001,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xF3FC0000,0xF3FC0000,0xF6000000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xF3FC0000,0xF3FC0000,0xF6000000,0xF3FC0000,0xF3FC0000,0xF6000000,0xF6000000,0xE9FC0000,0xE9FC0000,0xE9FC0000,0xF3FC0000,0xF3FC0000,0xF6000000,0xF3FC0000,0xF3FC0000,0xF6000000,0xF6000000,0xF3FC0000, 
-0xF3FC0000,0xF6000000,0xF6000000,0xF6000000,0xC9FC0000,0x87FC0000,0x1EC0001,0xE3FC0000,0xEDFC0000,0xF1FC0000,0xF3F80000,0xF5F80000,0xDBFC0000,0xE9FC0000,0xF1FC0000,0xF6000000,0xF1FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x18C36D7,0xFF742EAA,0xFF6C2826,0xFF6425C6,0xFF5C24DD,0xFF501BC6,0xFF441831,0xFF3811B1,0xFF2C0D49,0xFB200A89,0xFF5025F8,0xFF301AB6,0xFF241626,0xFF140CC5,0xFEFC0671,0xFCF001D9,0xFEFC10AC,0xFEE408D8,0xF4C801A8,0xE2D80CE4,0x53FC36D7,0xFF2C2A9E,0xFF1C25C5,0xFEF01A01,0xFED811B1,0xFAB80A89,0xFEB819F4,0xFE880D5B,0xFA4C0020,0xE2640CE4,0xABF836D7, -0xFE2C25C5,0xEC000DB5,0xDC0013A8,0xC40036D9,0xFF742E0F,0xFD883393,0xFD88351B,0xFF54242D,0xFF381982,0xFF180EB9,0xFF1009F9,0xFEEC03A5,0xFF702D9A,0xFF5022A2,0xFEF40E18,0xFA4C0020,0x95FC36D7,0x1C80CE3,0xFFBC0AE6,0xFFAC092B,0xFFAC0882,0xFFA408B1,0xFF9805D6,0xFF8C04A5,0xFF7C02B3,0xFF7000EA,0xFB680021,0xAFFC0CE3,0xFF9409D5,0xFF840882,0xFF640576,0xFF380271, -0xFB240021,0xD7FC0CE3,0xFF040882,0xFA4C0020,0xE2000CE4,0xAFFC0CE3,0xFF9409D5,0xFF840882,0xFF640576,0xFF380271,0xFB240021,0xD7FC0CE3,0xFF040882,0xFA4C0020,0xE2000CE4,0xD7FC0CE3,0xFF040882,0xFA4C0020,0xE2000CE4,0xE2000CE4,0xFFBC0B6D,0xFBC40C2B,0xFBC40C7B,0xFFB00A0A,0xFF9007D9,0xFF6C0515,0xFF640332,0xFF2801F4,0xFFB40B76,0xFF9809BE,0xFF50089B,0xFA4C0020, -0xCDFC0CE3,0x16425C6,0x16425C6,0x16425C6,0x16425C6,0xFF441831,0xFF441831,0xFF441831,0xFF2C0D49,0xFF2C0D49,0xF7200A69,0xFF241626,0xFF241626,0xFF241626,0xFEFC0671,0xFEFC0671,0xFCF001D5,0xFEE408D8,0xFEE408D8,0xECD00104,0xD4D80884,0x19FC25C5,0x19FC25C5,0x19FC25C5,0xFED811B1,0xFED811B1,0xF6BC0A69,0xFE880D5B,0xFE880D5B,0xF6540002,0xD4700884,0x8FF825C5, -0x8FF825C5,0xE6000C45,0xCA000C84,0xB20025C5,0xFF4C1F85,0xF960232D,0x16425C6,0xFF3C1996,0xFF2C12AC,0xFF180CA8,0xFF1009F9,0xFEEC03A5,0xFD4C1F0B,0xFF30188D,0xFEF40DD8,0xF6540002,0x71FC25C5,0x1AC0882,0x1AC0882,0x1AC0882,0x1AC0882,0xFF8C04A5,0xFF8C04A5,0xFF8C04A5,0xFF7000EA,0xFF7000EA,0xF7680001,0x83FC0882,0x83FC0882,0x83FC0882,0xFF380271,0xFF380271, -0xF7280001,0xC3F80882,0xC3F80882,0xF65C0000,0xD4000884,0x83FC0882,0x83FC0882,0x83FC0882,0xFF380271,0xFF380271,0xF7280001,0xC3F80882,0xC3F80882,0xF65C0000,0xD4000884,0xC3F80882,0xC3F80882,0xF65C0000,0xD4000884,0xD4000884,0xFFA00784,0xFDA807C1,0x1AC0882,0xFF9806CD,0xFF840581,0xFF640431,0xFF640332,0xFF2801F4,0xFF940794,0xFF8406B2,0xB3FC0882,0xF65C0000, -0xB3FC0882,0x1F40022,0xFFF40012,0xFFF00005,0xFFEC0001,0xF5FC0020,0xFFEC000D,0xFFE40001,0xF9FC0020,0xFFCC0000,0xFA000020,0xF5FC0020,0xFFEC000D,0xFFE40001,0xF9FC0020,0xFFCC0000,0xFA000020,0xF9FC0020,0xFFCC0000,0xFA000020,0xFA000020,0xF5FC0020,0xFFEC000D,0xFFE40001,0xF9FC0020,0xFFCC0000,0xFA000020,0xF9FC0020,0xFFCC0000,0xFA000020,0xFA000020,0xF9FC0020, -0xFFCC0000,0xFA000020,0xFA000020,0xFA000020,0xFDF40019,0xC7FC0020,0xF3F40022,0xFFF00019,0xFFE80011,0xFFDC0009,0xFFD80000,0xFFC00000,0xFFF0001D,0xFFF00019,0xFFDC0001,0xFA000020,0xF9FC0020,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0x1200A69,0xFEF401A5,0xFEF401A5,0xFEF401A5,0xFEF401A5,0xFEF401A5, 
-0xFEF401A5,0xD8D80000,0xD8D80000,0xD8D80000,0xB4D80000,0x1AC0A69,0x1AC0A69,0x1AC0A69,0x1AC0A69,0x1AC0A69,0x1AC0A69,0xFA4C0000,0xFA4C0000,0xFA4C0000,0xB4900000,0x5BF80A69,0x5BF80A69,0x5BF80A69,0xAC000080,0x90000A69,0xF5180884,0x1200A69,0x1200A69,0xFF100620,0xFF000469,0xFEF802E4,0xFEF802E4,0xFEE40088,0xFB0807C1,0xFEF805C4,0xFEC40000,0xFA4C0000, -0x2FFC0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table219[] = { -0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x1580000,0x31F80000, -0x31F80000,0x31F80000,0x31F80000,0x72000001,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xE80000,0xF80000,0xF80000,0xF80000,0x1580000,0x1EC0000,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x1780001,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000, -0x37FC0000,0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x37FC0000,0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0x9DFC0000,0x9DFC0000,0x9DFC0000,0xBC000000,0xBC000000,0x1940000,0x1780001,0x1780001,0x1B80000,0x3D80000,0x9FC0000,0x9FC0000,0x5DFC0000,0x1B80000,0x3D80000,0x83FC0000,0x9DFC0000, -0x83FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1983373,0xFF8C2CEA,0xFF7C279F,0xFF7425C6,0xFF7423B5,0xFF5C1C42,0xFF5C1969,0xFF40125B,0xFF380E81,0xFF300A69,0xFF5C247C,0xFF441B19,0xFF441758,0xFF200D81,0xFF140831,0xFF0001F9,0xFF081024,0xFEF00984,0xF8DC0130,0xE8E80B58,0x65FC3373,0xFF4029BE,0xFF3425C5,0xFF0819E1,0xFEF01301,0xFECC0A69,0xFED81984,0xFEA00EBB,0xFE640002,0xE8740B59,0xB3FC3373, -0xFE6025C5,0xF6000C55,0xE00010E1,0xCA003375,0xFF802BE9,0xFF8C3087,0xFF8C323F,0xFF70239D,0xFF4C19E6,0xFF2C1017,0xFF240BD8,0xFF040545,0xFF782B2A,0xFF5C221F,0xFF100FAE,0xFE640002,0x9FFC3373,0x1D00B5B,0xFFC40A0B,0xFFC008EE,0xFFBC0882,0xFFBC0809,0xFFA4061E,0xFFA4053D,0xFF9402EB,0xFF88019A,0xFF780001,0xBDFC0B58,0xFFAC095D,0xFF9C0882,0xFF7C0556,0xFF640335, -0xFF380001,0xDFF80B58,0xFF340882,0xFE6C0000,0xE8000B58,0xBDFC0B58,0xFFAC095D,0xFF9C0882,0xFF7C0556,0xFF640335,0xFF380001,0xDFF80B58,0xFF340882,0xFE6C0000,0xE8000B58,0xDFF80B58,0xFF340882,0xFE6C0000,0xE8000B58,0xE8000B58,0xFDCC0A58,0xFFCC0AB3,0xFFCC0B13,0xFFB40941,0xFFA407A1,0xFF7C0571,0xFF7C03E8,0xFF4802D0,0xFFC40A5B,0xFFB00926,0xFF700892,0xFE6C0000, 
-0xD7FC0B58,0x17425C6,0x17425C6,0x17425C6,0x17425C6,0xFF5C1969,0xFF5C1969,0xFF5C1969,0xFF380E81,0xFF380E81,0xFF300A69,0xFF441758,0xFF441758,0xFF441758,0xFF140831,0xFF140831,0xFF0001F9,0xFEF00984,0xFEF00984,0xF4E00104,0xDCE80884,0x31FC25C5,0x31FC25C5,0x31FC25C5,0xFEF01301,0xFEF01301,0xFECC0A69,0xFEA00EBB,0xFEA00EBB,0xFE640002,0xDC800884,0x9BF825C5, -0x9BF825C5,0xF2000BD5,0xD6000BF4,0xBA0025C5,0xFF682010,0xFF6C2345,0x17425C6,0xFF541A99,0xFF401434,0xFF2C0E5E,0xFF240BD8,0xFF040545,0xFF501F9B,0xFF4419D4,0xFEFC0F78,0xFE640002,0x7FFC25C5,0x1BC0882,0x1BC0882,0x1BC0882,0x1BC0882,0xFFA4053D,0xFFA4053D,0xFFA4053D,0xFF88019A,0xFF88019A,0xFF780001,0x9BFC0882,0x9BFC0882,0x9BFC0882,0xFF640335,0xFF640335, -0xFF380001,0xCFF80882,0xCFF80882,0xFE6C0000,0xDC000884,0x9BFC0882,0x9BFC0882,0x9BFC0882,0xFF640335,0xFF640335,0xFF380001,0xCFF80882,0xCFF80882,0xFE6C0000,0xDC000884,0xCFF80882,0xCFF80882,0xFE6C0000,0xDC000884,0xDC000884,0xFBB407C1,0xF5B80802,0x1BC0882,0xFFA4071A,0xFF9805F5,0xFF7C04C8,0xFF7C03E8,0xFF4802D0,0xFDB007C1,0xFF9806F5,0xC1FC0882,0xFE6C0000, -0xC1FC0882,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0x1300A69,0xFF0001F9,0xFF0001F9,0xFF0001F9,0xFF0001F9,0xFF0001F9, -0xFF0001F9,0xE0E80000,0xE0E80000,0xE0E80000,0xBCE80000,0x1C40A69,0x1C40A69,0x1C40A69,0x1C40A69,0x1C40A69,0x1C40A69,0xFE640002,0xFE640002,0xFE640002,0xBCA00000,0x67F80A69,0x67F80A69,0x67F80A69,0xB8000050,0x98000A69,0xFD280884,0x1300A69,0x1300A69,0xFF200659,0xFF1404BD,0xFF080340,0xFF080340,0xFEF400CD,0xFF1007E9,0xFD100622,0xFED8000A,0xFE640002, -0x3FF80A69,}; -static const uint32_t g_etc1_to_bc7_m6_table220[] = { -0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0x1740000,0x1740000,0x1740000,0x1740000,0x1740000,0x1740000,0x1740000,0x1740000,0x1740000,0x1740000,0x3FF80000, -0x3FF80000,0x3FF80000,0x3FF80000,0x7C000000,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xF80001,0xD080000,0xD080000,0xD080000,0x1740000,0xBFC0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000, -0x53FC0000,0xABF80000,0xABF80000,0xABF80000,0xC4000001,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0x53FC0000,0xABF80000,0xABF80000,0xABF80000,0xC4000001,0xABF80000,0xABF80000,0xABF80000,0xC4000001,0xC4000001,0xBA40000,0x18C0000,0x18C0000,0x1CC0000,0x1F00000,0x2BFC0000,0x2BFC0000,0x73FC0000,0x1CC0000,0x1F00000,0x95FC0000,0xABF80000, -0x95FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1A02C8F,0xFF8C2726,0xFF8822B7,0xFF80212E,0xFF801F75,0xFF68194A,0xFF6816D9,0xFF5810F7,0xFF4C0DDA,0xFF440A69,0xFF681F8A,0xFF5017A7,0xFF50145E,0xFF380C25,0xFF2C07B9,0xFF180271,0xFF200D42,0xFF080792,0xFAF8009A,0xEAF8087A,0x71FC2C8F,0xFF58247E,0xFF44212D,0xFF201735,0xFF081185,0xFEE80A69,0xFEF015BA,0xFEB80C99,0xFE880020,0xEA900879,0xB9FC2C8F, -0xFE84212D,0xF8000AED,0xE6000C45,0xD0002C91,0xFF8C262B,0xF79C2A69,0xF9A02B7E,0xFF701F1D,0xFF5816D7,0xFF3C0E7E,0xFF3C0AF6,0xFF1404FA,0xFF8025C2,0xFF681E1D,0xFF1C0D7A,0xFE880020,0xA7FC2C8F,0x1D80876,0xFFD00776,0xFFC806AE,0xFFC40659,0xFFC405E2,0xFFB00491,0xFFB003E8,0xFFA00226,0xFF940131,0xFF8C0000,0xC7FC0876,0xFFB806FA,0xFFA80659,0xFF8803F3,0xFF700262, -0xFF540000,0xE3FC0876,0xFF4C0659,0xFEA00000,0xEA000879,0xC7FC0876,0xFFB806FA,0xFFA80659,0xFF8803F3,0xFF700262,0xFF540000,0xE3FC0876,0xFF4C0659,0xFEA00000,0xEA000879,0xE3FC0876,0xFF4C0659,0xFEA00000,0xEA000879,0xEA000879,0xFFD0079D,0xFFCC0822,0xF5D80851,0xFFC006D9,0xFFAC05A1,0xFFA0042C,0xFF8C02F2,0xFF640209,0xFFD007BE,0xFFC006B9,0xFF840669,0xFEA00000, -0xDDF80876,0x180212E,0x180212E,0x180212E,0x180212E,0xFF6816D9,0xFF6816D9,0xFF6816D9,0xFF4C0DDA,0xFF4C0DDA,0xFF440A69,0xFF50145E,0xFF50145E,0xFF50145E,0xFF2C07B9,0xFF2C07B9,0xFF180271,0xFF080792,0xFF080792,0xF8F4007E,0xE2F8065A,0x43FC212D,0x43FC212D,0x43FC212D,0xFF081185,0xFF081185,0xFEE80A69,0xFEB80C99,0xFEB80C99,0xFE880020,0xE294065A,0xA3FC212D, -0xA3FC212D,0xF8000AC9,0xDC0008A6,0xC000212D,0xFF741C62,0xF9801F46,0x180212E,0xFF5C17A3,0xFF541226,0xFF3C0CEE,0xFF3C0AF6,0xFF1404FA,0xFF641BFE,0xFF5016B4,0xFF1C0D49,0xFE880020,0x8BFC212D,0x1C40659,0x1C40659,0x1C40659,0x1C40659,0xFFB003E8,0xFFB003E8,0xFFB003E8,0xFF940131,0xFF940131,0xFF8C0000,0xA9FC0659,0xA9FC0659,0xA9FC0659,0xFF700262,0xFF700262, -0xFF540000,0xD5F80659,0xD5F80659,0xFEA00000,0xE2000659,0xA9FC0659,0xA9FC0659,0xA9FC0659,0xFF700262,0xFF700262,0xFF540000,0xD5F80659,0xD5F80659,0xFEA00000,0xE2000659,0xD5F80659,0xD5F80659,0xFEA00000,0xE2000659,0xE2000659,0xFFBC05B4,0xFBC405E9,0x1C40659,0xFFB4054A,0xFFAC0480,0xFF9403B5,0xFF8C02F2,0xFF640209,0xFFB405BA,0xFFA4053D,0xC9FC0659,0xFEA00000, -0xC9FC0659,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0x1440A69,0xFF180271,0xFF180271,0xFF180271,0xFF180271,0xFF180271, 
-0xFF180271,0xE8FC0001,0xE8FC0001,0xE8FC0001,0xC4F80002,0x1E00A69,0x1E00A69,0x1E00A69,0x1E00A69,0x1E00A69,0x1E00A69,0xFE880020,0xFE880020,0xFE880020,0xC4B40001,0x73FC0A69,0x73FC0A69,0x73FC0A69,0xC200002D,0xA0000A69,0xF73C08C5,0x1440A69,0x1440A69,0xFF2C06B2,0xFF280521,0xFF1803BA,0xFF1803BA,0xFF080131,0xF9300841,0xFF200665,0xFEF0002D,0xFE880020, -0x4FFC0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table221[] = { -0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x18C0000,0x4BF80000, -0x4BF80000,0x4BF80000,0x4BF80000,0x84000000,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x1080001,0x11C0000,0x11C0000,0x11C0000,0x18C0000,0x1BFC0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x19C0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000, -0x6BFC0000,0xB7F80000,0xB7F80000,0xB7F80000,0xCC000001,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0x6BFC0000,0xB7F80000,0xB7F80000,0xB7F80000,0xCC000001,0xB7F80000,0xB7F80000,0xB7F80000,0xCC000001,0xCC000001,0x1B80000,0x19C0000,0x19C0000,0x1E00000,0x11FC0000,0x49FC0000,0x49FC0000,0x87FC0000,0x1E00000,0x11FC0000,0xA3FC0000,0xB7F80000, -0xA3FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1A826F7,0xFF982262,0xFF901EC2,0xFF8C1D72,0xFF8C1C09,0xFF7416DE,0xFF7414CD,0xFF640FDB,0xFF580D4E,0xFF540A69,0xFF741B8A,0xFF6814D7,0xFF5C120E,0xFF400B3C,0xFF380755,0xFF2402E9,0xFF2C0B0A,0xFF200632,0xFD08003A,0xEF080642,0x7DFC26F7,0xFF64202E,0xFF541D72,0xFF2C1515,0xFF201055,0xFF000A69,0xFEFC12C2,0xFED80B12,0xFEA00050,0xEEA00642,0xBFFC26F7, -0xFEA01D72,0xFE000A6D,0xE60008D5,0xD40026F9,0xFFA021AB,0xFBA42501,0xFDA825FE,0xFF801B4F,0xFF6C146F,0xFF540D63,0xFF3C0A46,0xFF2804D2,0xFF9020F8,0xFF741A72,0xFF300B96,0xFEA00050,0xAFFC26F7,0x1DC0642,0xFFD4058D,0xFFCC04F1,0xFFCC04B1,0xFFC40462,0xFFBC035D,0xFFBC02E4,0xFFAC0192,0xFFA000E5,0xFF9C0000,0xCFFC0641,0xFFB8052A,0xFFB404B1,0xFFA002E3,0xFF8801BA, -0xFF6C0000,0xE7FC0641,0xFF6404B1,0xFED40000,0xEE000641,0xCFFC0641,0xFFB8052A,0xFFB404B1,0xFFA002E3,0xFF8801BA,0xFF6C0000,0xE7FC0641,0xFF6404B1,0xFED40000,0xEE000641,0xE7FC0641,0xFF6404B1,0xFED40000,0xEE000641,0xEE000641,0xFFD805A2,0xF7DC0600,0xF7DC0621,0xFFC4051A,0xFFC0042D,0xFFA80302,0xFFA0022D,0xFF740184,0xFFD005AE,0xFFC804E1,0xFF9404BA,0xFED40000, 
-0xE1FC0641,0x18C1D72,0x18C1D72,0x18C1D72,0x18C1D72,0xFF7414CD,0xFF7414CD,0xFF7414CD,0xFF580D4E,0xFF580D4E,0xFF540A69,0xFF5C120E,0xFF5C120E,0xFF5C120E,0xFF380755,0xFF380755,0xFF2402E9,0xFF200632,0xFF200632,0xFB040032,0xE70804B2,0x53FC1D72,0x53FC1D72,0x53FC1D72,0xFF201055,0xFF201055,0xFF000A69,0xFED80B12,0xFED80B12,0xFEA00050,0xE6A804B2,0xABF81D72, -0xABF81D72,0xFC000A6D,0xE0000631,0xC4001D75,0xFF841959,0xFD881BB6,0x18C1D72,0xFF701549,0xFF581073,0xFF4C0C25,0xFF3C0A46,0xFF2804D2,0xFF7818E3,0xFF5C14A8,0xFF300B72,0xFEA00050,0x95FC1D72,0x1CC04B1,0x1CC04B1,0x1CC04B1,0x1CC04B1,0xFFBC02E4,0xFFBC02E4,0xFFBC02E4,0xFFA000E5,0xFFA000E5,0xFF9C0000,0xB5FC04B1,0xB5FC04B1,0xB5FC04B1,0xFF8801BA,0xFF8801BA, -0xFF6C0000,0xDBF804B1,0xDBF804B1,0xFED40000,0xE60004B1,0xB5FC04B1,0xB5FC04B1,0xB5FC04B1,0xFF8801BA,0xFF8801BA,0xFF6C0000,0xDBF804B1,0xDBF804B1,0xFED40000,0xE60004B1,0xDBF804B1,0xDBF804B1,0xFED40000,0xE60004B1,0xE60004B1,0xF7C80451,0xFFCC0451,0x1CC04B1,0xFDC003F5,0xFFAC0340,0xFFA002A8,0xFFA0022D,0xFF740184,0xF9C40451,0xFFB003E8,0xD1FC04B1,0xFED40000, -0xD1FC04B1,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0x1540A69,0xFF2402E9,0xFF2402E9,0xFF2402E9,0xFF2402E9,0xFF2402E9, -0xFF2402E9,0xF10C0001,0xF10C0001,0xF10C0001,0xCD080002,0x1F80A69,0x1F80A69,0x1F80A69,0x1F80A69,0x1F80A69,0x1F80A69,0xFEA00050,0xFEA00050,0xFEA00050,0xCCC40001,0x7FFC0A69,0x7FFC0A69,0x7FFC0A69,0xCA000012,0xA8000A69,0xFF4C08C5,0x1540A69,0x1540A69,0xFF3C06F5,0xFF34057A,0xFF34042A,0xFF34042A,0xFF1C0195,0xFF3C0845,0xFF34069A,0xFEFC007A,0xFEA00050, -0x5FF80A69,}; -static const uint32_t g_etc1_to_bc7_m6_table222[] = { -0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x1A40000,0x57F80000, -0x57F80000,0x57F80000,0x57F80000,0x8C000000,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x1180001,0x12C0000,0x12C0000,0x12C0000,0x1A40000,0x29FC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x1AC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000, -0x83FC0000,0xC3F80000,0xC3F80000,0xC3F80000,0xD4000001,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0x83FC0000,0xC3F80000,0xC3F80000,0xC3F80000,0xD4000001,0xC3F80000,0xC3F80000,0xC3F80000,0xD4000001,0xD4000001,0x1C80000,0x1AC0000,0x1AC0000,0x3F00000,0x37FC0000,0x67FC0000,0x67FC0000,0x9BFC0000,0x3F00000,0x37FC0000,0xB3FC0000,0xC3F80000, -0xB3FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1B021DF,0xFFA41E16,0xFF9C1B22,0xFF981A0E,0xFF9818ED,0xFF8014AA,0xFF8012F1,0xFF700EDF,0xFF700CCE,0xFF640A69,0xFF8017F2,0xFF741257,0xFF681006,0xFF580A54,0xFF4C0723,0xFF3C0361,0xFF380932,0xFF2C0506,0xFF18000A,0xF1180462,0x89FC21DF,0xFF701C46,0xFF641A0D,0xFF381345,0xFF2C0F41,0xFF180A69,0xFF14101A,0xFEF009AA,0xFEB800A0,0xF0B80461,0xC5FC21DF, -0xFEC01A0D,0xFE200A69,0xEC0005DD,0xD80021E1,0xFFA01D2B,0xFFAC2019,0xFFAC2106,0xFF881837,0xFF78124A,0xFF5C0C29,0xFF5009B3,0xFF4004BB,0xFF941CEE,0xFF801747,0xFF3C0A2E,0xFEB800A0,0xB7FC21DF,0x1E40462,0xFFDC03DA,0xFFD80371,0xFFD40349,0xFFD00306,0xFFC80259,0xFFC401F9,0xFFB80116,0xFFB8009D,0xFFAC0000,0xD5FC0461,0xFFCC03A1,0xFFC00349,0xFFAC01FB,0xFFA00132, -0xFF840000,0xEBF80461,0xFF7C0349,0xFF040000,0xF0000461,0xD5FC0461,0xFFCC03A1,0xFFC00349,0xFFAC01FB,0xFFA00132,0xFF840000,0xEBF80461,0xFF7C0349,0xFF040000,0xF0000461,0xEBF80461,0xFF7C0349,0xFF040000,0xF0000461,0xF0000461,0xFFD803F2,0xF9E0042C,0xF9E00449,0xFFD40382,0xFFC802E2,0xFFB80212,0xFFA80172,0xFF8C0112,0xFFDC03F8,0xFFC80371,0xFFA40352,0xFF040000, -0xE5FC0461,0x1981A0E,0x1981A0E,0x1981A0E,0x1981A0E,0xFF8012F1,0xFF8012F1,0xFF8012F1,0xFF700CCE,0xFF700CCE,0xFF640A69,0xFF681006,0xFF681006,0xFF681006,0xFF4C0723,0xFF4C0723,0xFF3C0361,0xFF2C0506,0xFF2C0506,0xFD180009,0xEB18034A,0x63FC1A0D,0x63FC1A0D,0x63FC1A0D,0xFF2C0F41,0xFF2C0F41,0xFF180A69,0xFEF009AA,0xFEF009AA,0xFEB800A0,0xEABC034A,0xB3F81A0D, -0xB3F81A0D,0xFE200A69,0xE6000425,0xCA001A0D,0xFF841689,0xFF8C18AA,0x1981A0E,0xFF801316,0xFF6C0EEB,0xFF5C0B29,0xFF5009B3,0xFF4004BB,0xFF80163E,0xFF741269,0xFF3C0A0A,0xFEB800A0,0x9FF81A0D,0x1D40349,0x1D40349,0x1D40349,0x1D40349,0xFFC401F9,0xFFC401F9,0xFFC401F9,0xFFB8009D,0xFFB8009D,0xFFAC0000,0xC1FC0349,0xC1FC0349,0xC1FC0349,0xFFA00132,0xFFA00132, -0xFF840000,0xE1F80349,0xE1F80349,0xFF040000,0xEA000349,0xC1FC0349,0xC1FC0349,0xC1FC0349,0xFFA00132,0xFFA00132,0xFF840000,0xE1F80349,0xE1F80349,0xFF040000,0xEA000349,0xE1F80349,0xE1F80349,0xFF040000,0xEA000349,0xEA000349,0xFBD002F9,0xFFCC0311,0x1D40349,0xFFC402B9,0xFFC00244,0xFFB001D4,0xFFA80172,0xFF8C0112,0xFDCC02F9,0xFFC402AD,0xD9FC0349,0xFF040000, -0xD9FC0349,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0x1640A69,0xFF3C0361,0xFF3C0361,0xFF3C0361,0xFF3C0361,0xFF3C0361, 
-0xFF3C0361,0xF91C0001,0xF91C0001,0xF91C0001,0xD5180002,0x13FC0A69,0x13FC0A69,0x13FC0A69,0x13FC0A69,0x13FC0A69,0x13FC0A69,0xFEB800A0,0xFEB800A0,0xFEB800A0,0xD4D40001,0x8BFC0A69,0x8BFC0A69,0x8BFC0A69,0xD4000005,0xB0000A69,0xF75C090A,0x1640A69,0x1640A69,0xFF4C073A,0xFF5005E9,0xFF440492,0xFF440492,0xFF300209,0xFB500884,0xFD4C0708,0xFF1800CD,0xFEB800A0, -0x6DFC0A69,}; -static const uint32_t g_etc1_to_bc7_m6_table223[] = { -0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x61FC0000, -0x61FC0000,0x61FC0000,0x61FC0000,0x94000000,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x1280001,0x73C0000,0x73C0000,0x73C0000,0x1BC0000,0x39FC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x1BC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000, -0x9BFC0000,0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0x9BFC0000,0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0xCFF80000,0xCFF80000,0xCFF80000,0xDC000001,0xDC000001,0x5D80000,0x1BC0000,0x1BC0000,0x17FC0000,0x5FFC0000,0x85FC0000,0x85FC0000,0xAFFC0000,0x17FC0000,0x5FFC0000,0xC1FC0000,0xCFF80000, -0xC1FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1B81D47,0xFFB01A42,0xFFA017DF,0xFFA016FE,0xFF98161D,0xFF9012AA,0xFF8C1145,0xFF7C0E03,0xFF7C0C4A,0xFF740A69,0xFF8C14C2,0xFF801027,0xFF740E46,0xFF640994,0xFF5806FB,0xFF5003DA,0xFF4C07C3,0xFF380432,0xFF2C0002,0xF32802DA,0x95FC1D47,0xFF7C18C6,0xFF7416FD,0xFF581187,0xFF400E66,0xFF300A69,0xFF2C0DD2,0xFF080882,0xFED80112,0xF4CC02D9,0xCBFC1D47, -0xFEE416FD,0xFE540A69,0xF000039A,0xDC001D49,0xFFA8198F,0xFFAC1C09,0xF5B81CBF,0xFF981527,0xFF801057,0xFF700B37,0xFF680929,0xFF4804DA,0xFFA01957,0xFF90147F,0xFF50092A,0xFED80112,0xBFF81D47,0x1E802D6,0xFFE00289,0xFFE0023E,0xFFDC0221,0xFFDC01F2,0xFFD00185,0xFFD00145,0xFFCC00C0,0xFFC00068,0xFFBC0000,0xDFFC02D6,0xFFD8025D,0xFFCC0221,0xFFB80143,0xFFAC00C2, -0xFF9C0000,0xEFFC02D6,0xFF940221,0xFF340000,0xF20002D9,0xDFFC02D6,0xFFD8025D,0xFFCC0221,0xFFB80143,0xFFAC00C2,0xFF9C0000,0xEFFC02D6,0xFF940221,0xFF340000,0xF20002D9,0xEFFC02D6,0xFF940221,0xFF340000,0xF20002D9,0xF20002D9,0xFFE40296,0xFDE802AC,0xFDE802C1,0xFFDC0238,0xFFD401E1,0xFFB80162,0xFFB800F5,0xFFA000A9,0xFFDC0298,0xFFDC0233,0xFFB40225,0xFF340000, 
-0xEBFC02D6,0x1A016FE,0x1A016FE,0x1A016FE,0x1A016FE,0xFF8C1145,0xFF8C1145,0xFF8C1145,0xFF7C0C4A,0xFF7C0C4A,0xFF740A69,0xFF740E46,0xFF740E46,0xFF740E46,0xFF5806FB,0xFF5806FB,0xFF5003DA,0xFF380432,0xFF380432,0xFF2C0002,0xEF280222,0x75FC16FD,0x75FC16FD,0x75FC16FD,0xFF400E66,0xFF400E66,0xFF300A69,0xFF080882,0xFF080882,0xFED80112,0xEED00222,0xBBFC16FD, -0xBBFC16FD,0xFE540A69,0xEC000289,0xD00016FD,0xFF901433,0xF9A015ED,0x1A016FE,0xFF88111E,0xFF800DB3,0xFF680A6E,0xFF680929,0xFF4804DA,0xFF9013B8,0xFF8010A6,0xFF500911,0xFED80112,0xA9FC16FD,0x1DC0221,0x1DC0221,0x1DC0221,0x1DC0221,0xFFD00145,0xFFD00145,0xFFD00145,0xFFC00068,0xFFC00068,0xFFBC0000,0xCDFC0221,0xCDFC0221,0xCDFC0221,0xFFAC00C2,0xFFAC00C2, -0xFF9C0000,0xE7F80221,0xE7F80221,0xFF340000,0xEE000221,0xCDFC0221,0xCDFC0221,0xCDFC0221,0xFFAC00C2,0xFFAC00C2,0xFF9C0000,0xE7F80221,0xE7F80221,0xFF340000,0xEE000221,0xE7F80221,0xE7F80221,0xFF340000,0xEE000221,0xEE000221,0xFFD801E1,0xF7DC0200,0x1DC0221,0xFFD401C2,0xFFC80179,0xFFB80131,0xFFB800F5,0xFFA000A9,0xFFD001ED,0xFFC801BD,0xDFFC0221,0xFF340000, -0xDFFC0221,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0x1740A69,0xFF5003DA,0xFF5003DA,0xFF5003DA,0xFF5003DA,0xFF5003DA, -0xFF5003DA,0xFF2C0002,0xFF2C0002,0xFF2C0002,0xDD280002,0x2BFC0A69,0x2BFC0A69,0x2BFC0A69,0x2BFC0A69,0x2BFC0A69,0x2BFC0A69,0xFED80112,0xFED80112,0xFED80112,0xDCE40001,0x97FC0A69,0x97FC0A69,0x97FC0A69,0xDC040001,0xB8000A69,0xFF6C090A,0x1740A69,0x1740A69,0xFF680782,0xFF580652,0xFF540502,0xFF540502,0xFF40028A,0xFF5808B4,0xFF500750,0xFF300132,0xFED80112, -0x7DF80A69,}; -static const uint32_t g_etc1_to_bc7_m6_table224[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x5C0001,0x5C0001,0x5C0001,0x5C0001,0x8C0000,0x8C0000,0x8C0000,0x1180000,0x1180000,0x2E000000,0x8C0000,0x8C0000,0x8C0000,0x1180000,0x1180000,0x2E000000,0x1180000,0x1180000,0x2E000000,0x2E000000,0x8C0000,0x8C0000,0x8C0000,0x1180000,0x1180000,0x2E000000,0x1180000,0x1180000,0x2E000000,0x2E000000,0x1180000, -0x1180000,0x2E000000,0x2E000000,0x2E000000,0x6C0000,0x640000,0x5C0001,0x800000,0x9C0000,0xC80000,0xE40000,0x15C0000,0x2740000,0x8C0000,0xC80000,0x2E000000,0xC80000,0x16C0001,0x25FC0000,0x95F80000,0xB6000000,0x25FC0000,0x95F80000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0x25FC0000,0x95F80000,0xB6000000,0x95F80000,0xB6000000, -0xB6000000,0x95F80000,0xB6000000,0xB6000000,0xB6000000,0x25FC0000,0x95F80000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0xB6000000,0x95F80000,0xB6000000,0xB6000000,0xB6000000,0xB6000000,0x1CC0000,0xB840000,0xB840000,0x4FFC0000,0x87F80000,0xA5F80000,0xB6000000,0xB6000000,0x3F00000,0x67FC0000,0xB3D80000,0xB6000000, 
-0x79FC0000,0x7457CA,0xFE442041,0xFE2005E5,0xB62005C1,0xFE182962,0xF4000585,0xB4000094,0xAA002420,0x96000F64,0x72002420,0xEA004691,0xCC001B02,0xA2000CAD,0x98002F04,0x88001816,0x6E002950,0x72004691,0x72002CBD,0x5C0036EC,0x4C004691,0xA857CA,0xAE002A47,0x92001711,0x860037C0,0x80001F6C,0x68002DF0,0x68004B75,0x6C0031E9,0x56003AAD,0x480048F9,0x15857CA, -0x5C003E8D,0x500043F0,0x44004F85,0x380057CA,0xFE3438A1,0xFA644CC8,0xFE6C4B89,0xFE001F2A,0xE200192D,0xAC001816,0x92001141,0x82001EEA,0xFE1837C5,0xFC002039,0x86002331,0x56003AAD,0xF057CA,0x984692,0xFE5C1A55,0xFE300461,0xB6300451,0xFE302822,0xF4000585,0xB4000094,0xAA002420,0x96000F64,0x72002420,0xE44691,0xCC001B02,0xA2000CAD,0x98002F04,0x88001816, -0x6E002950,0x1D04691,0x72002CBD,0x5C0036EC,0x4C004691,0xE44691,0xCC001B02,0xA2000CAD,0x98002F04,0x88001816,0x6E002950,0x1D04691,0x72002CBD,0x5C0036EC,0x4C004691,0x1D04691,0x72002CBD,0x5C0036EC,0x4C004691,0x4C004691,0xFE503353,0xFC883FC2,0xFE8C3D0E,0xFE001F2A,0xE200192D,0xAC001816,0x92001141,0x82001EEA,0xFE3433C8,0xFC001F39,0x860022F1,0x5C0036EC, -0x1484691,0x2005C1,0x2005C1,0x2005C1,0x2005C1,0x8E000000,0x8E000000,0x8E000000,0x44000001,0x44000001,0x2E000000,0x44000451,0x44000451,0x44000451,0x3400019A,0x3400019A,0x280000CD,0x22000451,0x22000451,0x200002A8,0x16000451,0x3005C1,0x3005C1,0x3005C1,0x2E0002A3,0x2E0002A3,0x28000176,0x1E0004C1,0x1E0004C1,0x1C000311,0x1400048E,0x5C05C1, -0x5C05C1,0x16000402,0x1000051E,0xE0005C2,0xF6000171,0xF21402AC,0x2005C1,0x900001D0,0x5A0001A8,0x440001A8,0x40000161,0x2E0001F9,0x9600030A,0x66000286,0x2A00045A,0x1C000311,0x4005C1,0x300451,0x300451,0x300451,0x300451,0x8E000000,0x8E000000,0x8E000000,0x44000001,0x44000001,0x2E000000,0x440451,0x440451,0x440451,0x3400019A,0x3400019A, -0x280000CD,0x880451,0x880451,0x200002A8,0x16000451,0x440451,0x440451,0x440451,0x3400019A,0x3400019A,0x280000CD,0x880451,0x880451,0x200002A8,0x16000451,0x880451,0x880451,0x200002A8,0x16000451,0x16000451,0xF6000171,0xF8200200,0x300451,0x900001D0,0x5A0001A8,0x440001A8,0x40000161,0x2E0001F9,0x960002B9,0x6C00025D,0x600451,0x200002A8, -0x600451,0xE42422,0xFE980C49,0xF8600001,0xB65C0001,0x1542420,0xF4000585,0xB4000094,0x2FFC2420,0x96000F64,0x72002420,0x1542420,0xF4000585,0xB4000094,0x2FFC2420,0x96000F64,0x72002420,0x2FFC2420,0x96000F64,0x72002420,0x72002420,0x1542420,0xF4000585,0xB4000094,0x2FFC2420,0x96000F64,0x72002420,0x2FFC2420,0x96000F64,0x72002420,0x72002420,0x2FFC2420, -0x96000F64,0x72002420,0x72002420,0x72002420,0xFAAC1E85,0x2F42420,0xF6DC1F81,0xFE4015A0,0xF6001009,0xBC000EF9,0x9E000AE1,0x9000133D,0xFE901E08,0xFE14145D,0xA6000988,0x72002420,0x1E82420,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table225[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x6C0001,0x6C0001,0x6C0001,0x6C0001,0xA40000,0xA40000,0xA40000,0x14C0000,0x14C0000,0x36000000,0xA40000,0xA40000,0xA40000,0x14C0000,0x14C0000,0x36000000,0x14C0000,0x14C0000,0x36000000,0x36000000,0xA40000,0xA40000,0xA40000,0x14C0000,0x14C0000,0x36000000,0x14C0000,0x14C0000,0x36000000,0x36000000,0x14C0000, 
-0x14C0000,0x36000000,0x36000000,0x36000000,0x800000,0x2740000,0x6C0001,0x940000,0xB80000,0xE80000,0x10C0000,0x1980000,0x2880000,0xA40000,0xE80000,0x36000000,0xE80000,0x17C0001,0x3DFC0000,0xA1F80000,0xBE000000,0x3DFC0000,0xA1F80000,0xBE000000,0xA1F80000,0xBE000000,0xBE000000,0x3DFC0000,0xA1F80000,0xBE000000,0xA1F80000,0xBE000000, -0xBE000000,0xA1F80000,0xBE000000,0xBE000000,0xBE000000,0x3DFC0000,0xA1F80000,0xBE000000,0xA1F80000,0xBE000000,0xBE000000,0xA1F80000,0xBE000000,0xBE000000,0xBE000000,0xA1F80000,0xBE000000,0xBE000000,0xBE000000,0xBE000000,0x1E00000,0x1980000,0x1980000,0x63FC0000,0x95F80000,0xAFF80000,0xBE000000,0xBE000000,0xFFC0000,0x77FC0000,0xBBE80000,0xBE000000, -0x87FC0000,0x7C5F3A,0xFE442651,0xFE280892,0xBE2407E1,0xFE242C5A,0xFA0004BD,0xBA000034,0xB6002420,0x9C000E48,0x7A002420,0xF8004B86,0xE2001C71,0xA8000DB5,0xA200306B,0x92001820,0x74002A0C,0x78004B89,0x78002F69,0x660039D9,0x50004B89,0xB45F3A,0xBA002DB7,0xA2001906,0x92003AA0,0x86002070,0x6E002F60,0x6E005129,0x72003541,0x60003DE9,0x4C004E42,0x1705F3A, -0x5C0043BD,0x56004888,0x440055C5,0x3C005F3A,0xFE503ED3,0xFE6C5368,0xFE6C52D9,0xFE142325,0xF600193D,0xBC0017ED,0x9E0010D5,0x8A001F71,0xFE183E05,0xFC002279,0x9400251B,0x60003DE9,0x1005F3A,0xA44B86,0xFE681ED9,0xFE38064D,0xBE3805E9,0xFE3C2A76,0xFA0004BD,0xBA000034,0xB6002420,0x9C000E48,0x7A002420,0xF44B86,0xE2001C71,0xA8000DB5,0xA200306B,0x92001820, -0x74002A0C,0x1F04B86,0x78002F69,0x660039D9,0x50004B89,0xF44B86,0xE2001C71,0xA8000DB5,0xA200306B,0x92001820,0x74002A0C,0x1F04B86,0x78002F69,0x660039D9,0x50004B89,0x1F04B86,0x78002F69,0x660039D9,0x50004B89,0x50004B89,0xFE64380A,0xFE8C4466,0xFE8C426E,0xFE1422C1,0xF600193D,0xBC0017ED,0x9E0010D5,0x8A001F71,0xFE443845,0xFC002179,0x940024CA,0x660039D9, -0x15C4B86,0x2407E1,0x2407E1,0x2407E1,0x2407E1,0xA6000000,0xA6000000,0xA6000000,0x50000001,0x50000001,0x36000000,0x500005E9,0x500005E9,0x500005E9,0x40000232,0x40000232,0x2E000121,0x280005E9,0x280005E9,0x260003A4,0x1A0005E9,0x3407E1,0x3407E1,0x3407E1,0x340003AB,0x340003AB,0x2E000202,0x2200067A,0x2200067A,0x20000441,0x18000635,0x6807E1, -0x6807E1,0x1C000576,0x160006F2,0x120007E2,0xF8040269,0xF61C042C,0x2407E1,0xA400028A,0x68000249,0x4C000254,0x4C0001E1,0x380002BA,0xA400042E,0x72000363,0x320005F2,0x20000441,0x4C07E1,0x3805E9,0x3805E9,0x3805E9,0x3805E9,0xA6000000,0xA6000000,0xA6000000,0x50000001,0x50000001,0x36000000,0x5005E9,0x5005E9,0x5005E9,0x40000232,0x40000232, -0x2E000121,0xA005E9,0xA005E9,0x260003A4,0x1A0005E9,0x5005E9,0x5005E9,0x5005E9,0x40000232,0x40000232,0x2E000121,0xA005E9,0xA005E9,0x260003A4,0x1A0005E9,0xA005E9,0xA005E9,0x260003A4,0x1A0005E9,0x1A0005E9,0xF8040265,0xFC280320,0x3805E9,0xA400028A,0x68000249,0x4C000254,0x4C0001E1,0x380002BA,0xB40003B5,0x72000332,0x7005E9,0x260003A4, -0x7005E9,0xF42422,0xFEB00D41,0xFE700002,0xBE6C0001,0x16C2420,0xFA0004BD,0xBA000034,0x3BFC2420,0x9C000E48,0x7A002420,0x16C2420,0xFA0004BD,0xBA000034,0x3BFC2420,0x9C000E48,0x7A002420,0x3BFC2420,0x9C000E48,0x7A002420,0x7A002420,0x16C2420,0xFA0004BD,0xBA000034,0x3BFC2420,0x9C000E48,0x7A002420,0x3BFC2420,0x9C000E48,0x7A002420,0x7A002420,0x3BFC2420, -0x9C000E48,0x7A002420,0x7A002420,0x7A002420,0xFEB41EAD,0xB042420,0xFEEC1F81,0xFE6C1661,0xF6000F79,0xBC000E29,0xA80009CD,0x94001248,0xFE981E3A,0xFE401528,0xB400084A,0x7A002420,0x7FC2420,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table226[] = { 
-0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x7C0001,0x7C0001,0x7C0001,0x7C0001,0xBC0000,0xBC0000,0xBC0000,0x17C0000,0x17C0000,0x3E000000,0xBC0000,0xBC0000,0xBC0000,0x17C0000,0x17C0000,0x3E000000,0x17C0000,0x17C0000,0x3E000000,0x3E000000,0xBC0000,0xBC0000,0xBC0000,0x17C0000,0x17C0000,0x3E000000,0x17C0000,0x17C0000,0x3E000000,0x3E000000,0x17C0000, -0x17C0000,0x3E000000,0x3E000000,0x3E000000,0x4900000,0xA840000,0x7C0001,0xAC0000,0xD40000,0x10C0000,0x1340000,0x1D40000,0x29C0000,0xBC0000,0x10C0000,0x3E000000,0x10C0000,0x18C0001,0x55FC0000,0xADF80000,0xC6000000,0x55FC0000,0xADF80000,0xC6000000,0xADF80000,0xC6000000,0xC6000000,0x55FC0000,0xADF80000,0xC6000000,0xADF80000,0xC6000000, -0xC6000000,0xADF80000,0xC6000000,0xC6000000,0xC6000000,0x55FC0000,0xADF80000,0xC6000000,0xADF80000,0xC6000000,0xC6000000,0xADF80000,0xC6000000,0xC6000000,0xC6000000,0xADF80000,0xC6000000,0xC6000000,0xC6000000,0xC6000000,0x1F40000,0x1A80000,0x1A80000,0x77FC0000,0xA1FC0000,0xB9F80000,0xC6000000,0xC6000000,0x2FFC0000,0x89FC0000,0xC3F80000,0xC6000000, -0x97FC0000,0x84672A,0xFE502CD1,0xFE2C0BE9,0xC6280A59,0xFE303022,0xFE0404C5,0xC6000004,0xC2002420,0xA6000D61,0x82002420,0xFE0450E5,0xE8001E1D,0xB4000EED,0xAE003223,0x98001848,0x7A002AE0,0x800050D1,0x7E00325D,0x6C003CB9,0x560050D1,0xC0672A,0xC6003187,0xA2001B66,0xA2003D84,0x920021A0,0x740030F0,0x74005745,0x780038E9,0x6600415D,0x520053E2,0x188672A, -0x66004956,0x5C004D70,0x4A005C5D,0x4000672A,0xFE504513,0xFE6C5AE8,0xF67C5B12,0xFE142775,0xF60019AD,0xBC00181D,0xA60010A0,0x90002036,0xFE3444C8,0xFE0025AA,0x96002723,0x6600415D,0x114672A,0xB050D2,0xFE7423CD,0xFE4008E2,0xC64007C1,0xFE442D24,0xFE0404C1,0xC6000004,0xC2002420,0xA6000D61,0x82002420,0x30050D1,0xE8001E1D,0xB4000EED,0xAE003223,0x98001848, -0x7A002AE0,0x5FC50D1,0x7E00325D,0x6C003CB9,0x560050D1,0x30050D1,0xE8001E1D,0xB4000EED,0xAE003223,0x98001848,0x7A002AE0,0x5FC50D1,0x7E00325D,0x6C003CB9,0x560050D1,0x5FC50D1,0x7E00325D,0x6C003CB9,0x560050D1,0x560050D1,0xFE783CC9,0xF8A049A0,0xFCA84789,0xFE142711,0xF60019AD,0xBC00181D,0xA60010A0,0x90002036,0xFC583D2B,0xFE0424A1,0x960026D2,0x6C003CB9, -0x17050D1,0x280A59,0x280A59,0x280A59,0x280A59,0xBE000000,0xBE000000,0xBE000000,0x5C000001,0x5C000001,0x3E000000,0x5C0007C1,0x5C0007C1,0x5C0007C1,0x4C0002EA,0x4C0002EA,0x3A000179,0x2E0007C1,0x2E0007C1,0x2C0004C8,0x1E0007C1,0x23C0A56,0x23C0A56,0x23C0A56,0x400004DB,0x400004DB,0x340002A6,0x28000882,0x28000882,0x2600058D,0x1E000825,0x7C0A56, -0x7C0A56,0x2000074C,0x1A00092D,0x14000A56,0xFA0803B9,0xF8200600,0x280A59,0xBA00034D,0x7C000301,0x62000308,0x5600028A,0x40000385,0xC200057A,0x9000047B,0x3A0007D1,0x2600058D,0x580A56,0x4007C1,0x4007C1,0x4007C1,0x4007C1,0xBE000000,0xBE000000,0xBE000000,0x5C000001,0x5C000001,0x3E000000,0x5C07C1,0x5C07C1,0x5C07C1,0x4C0002EA,0x4C0002EA, -0x3A000179,0xB807C1,0xB807C1,0x2C0004C8,0x1E0007C1,0x5C07C1,0x5C07C1,0x5C07C1,0x4C0002EA,0x4C0002EA,0x3A000179,0xB807C1,0xB807C1,0x2C0004C8,0x1E0007C1,0xB807C1,0xB807C1,0x2C0004C8,0x1E0007C1,0x1E0007C1,0xFC0C039D,0xFE2C0488,0x4007C1,0xBA00034D,0x7C000301,0x62000308,0x5600028A,0x40000385,0xC20004EA,0x9000042A,0x8007C1,0x2C0004C8, 
-0x8007C1,0x1042422,0xFEC40E2A,0xFE84002D,0xC67C0001,0x1842420,0xFE0804B5,0xC6000004,0x47FC2420,0xA6000D61,0x82002420,0x1842420,0xFE0804B5,0xC6000004,0x47FC2420,0xA6000D61,0x82002420,0x47FC2420,0xA6000D61,0x82002420,0x82002420,0x1842420,0xFE0804B5,0xC6000004,0x47FC2420,0xA6000D61,0x82002420,0x47FC2420,0xA6000D61,0x82002420,0x82002420,0x47FC2420, -0xA6000D61,0x82002420,0x82002420,0x82002420,0xFED01F04,0x1182420,0xF6FC2002,0xFE6C1711,0xFE0C0F79,0xD0000D00,0xB00008C8,0x9C001174,0xFEAC1EC1,0xFE4C15DD,0xBE000734,0x82002420,0x17FC2420,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table227[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0x8C0001,0x8C0001,0x8C0001,0x8C0001,0x2D00000,0x2D00000,0x2D00000,0x1AC0000,0x1AC0000,0x46000000,0x2D00000,0x2D00000,0x2D00000,0x1AC0000,0x1AC0000,0x46000000,0x1AC0000,0x1AC0000,0x46000000,0x46000000,0x2D00000,0x2D00000,0x2D00000,0x1AC0000,0x1AC0000,0x46000000,0x1AC0000,0x1AC0000,0x46000000,0x46000000,0x1AC0000, -0x1AC0000,0x46000000,0x46000000,0x46000000,0xA40000,0x980000,0x8C0001,0xC00000,0x2EC0000,0x12C0000,0x15C0000,0x5F80000,0x2B00000,0x2D00000,0x12C0000,0x46000000,0x12C0000,0x19C0001,0x6FFC0000,0xB9F80000,0xCE000000,0x6FFC0000,0xB9F80000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0x6FFC0000,0xB9F80000,0xCE000000,0xB9F80000,0xCE000000, -0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0x6FFC0000,0xB9F80000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0xB9F80000,0xCE000000,0xCE000000,0xCE000000,0xCE000000,0x15FC0000,0x5B80000,0x5B80000,0x8BFC0000,0xAFFC0000,0xC3F80000,0xCE000000,0xCE000000,0x4DFC0000,0x99FC0000,0xCDCC0000,0xCE000000, -0xA5FC0000,0x8C6F9A,0xFE5C3409,0xFE380FFD,0xCE300D21,0xFE303482,0xFE080585,0xD0000008,0xCE002420,0xAC000C6D,0x8A002420,0xFE0C56D6,0xF4002035,0xBA00107D,0xB40033FB,0xA200187A,0x80002BCC,0x8A005671,0x84003599,0x72003FC9,0x5C005671,0xCC6F9A,0xD20035B7,0xAE001E26,0xA20040A4,0x980022EC,0x7A0032A0,0x80005DB5,0x7E003CE1,0x6C004509,0x580059E2,0x1A06F9A, -0x6C004F2A,0x5C0052B0,0x5000636D,0x44006F9A,0xFE504C53,0xF8806322,0xFA846322,0xFE142CC5,0xFC001ADA,0xCC001898,0xAE001085,0x980020F9,0xFE344BA8,0xFE0029CA,0x9E0029C1,0x6C004509,0x1246F9A,0xB85672,0xFE802931,0xFE4C0BF6,0xCE4809D9,0xFE503080,0xFE080575,0xCE040008,0xCE002420,0xAC000C6D,0x8A002420,0x1145671,0xF4002035,0xBA00107D,0xB40033FB,0xA200187A, -0x80002BCC,0xFF85671,0x84003599,0x72003FC9,0x5C005671,0x1145671,0xF4002035,0xBA00107D,0xB40033FB,0xA200187A,0x80002BCC,0xFF85671,0x84003599,0x72003FC9,0x5C005671,0xFF85671,0x84003599,0x72003FC9,0x5C005671,0x5C005671,0xFE7841F9,0xFEAC4ECC,0xFEAC4D09,0xFE302BF5,0xFC001ADA,0xCC001898,0xAE001085,0x980020F9,0xFE5C422B,0xFE0428A1,0x9E00295D,0x72003FC9, 
-0x18C5671,0x300D21,0x300D21,0x300D21,0x300D21,0xD6000000,0xD6000000,0xD6000000,0x68000000,0x68000000,0x46000000,0x6A0009D9,0x6A0009D9,0x6A0009D9,0x520003BA,0x520003BA,0x400001DD,0x340009D9,0x340009D9,0x32000614,0x220009D9,0x2440D21,0x2440D21,0x2440D21,0x46000633,0x46000633,0x3A000362,0x2E000AD2,0x2E000AD2,0x2C000709,0x22000A52,0x8C0D21, -0x8C0D21,0x26000948,0x1C000B96,0x16000D22,0xFC0C0561,0xFA24082C,0x300D21,0xD000042A,0x860003D9,0x6A0003D4,0x62000334,0x4A000484,0xE4000704,0x900005AB,0x400009E9,0x2C000709,0x640D21,0x4809D9,0x4809D9,0x4809D9,0x4809D9,0xD6000000,0xD6000000,0xD6000000,0x68000000,0x68000000,0x46000000,0x6809D9,0x6809D9,0x6809D9,0x520003BA,0x520003BA, -0x400001DD,0xD009D9,0xD009D9,0x32000614,0x220009D9,0x6809D9,0x6809D9,0x6809D9,0x520003BA,0x520003BA,0x400001DD,0xD009D9,0xD009D9,0x32000614,0x220009D9,0xD009D9,0xD009D9,0x32000614,0x220009D9,0x220009D9,0xFE100521,0xF4380659,0x4809D9,0xD000042A,0x860003D9,0x6A0003D4,0x62000334,0x4A000484,0xF400063D,0x9A000551,0x9409D9,0x32000614, -0x9409D9,0x1142422,0xFED00F3A,0xFE940082,0xCE8C0001,0x19C2420,0xFE080565,0xCE080000,0x53FC2420,0xAC000C6D,0x8A002420,0x19C2420,0xFE080565,0xCE080000,0x53FC2420,0xAC000C6D,0x8A002420,0x53FC2420,0xAC000C6D,0x8A002420,0x8A002420,0x19C2420,0xFE080565,0xCE080000,0x53FC2420,0xAC000C6D,0x8A002420,0x53FC2420,0xAC000C6D,0x8A002420,0x8A002420,0x53FC2420, -0xAC000C6D,0x8A002420,0x8A002420,0x8A002420,0xF6E81F81,0x1282420,0xFF0C2002,0xFE9417C2,0xFE100FE9,0xDA000BE9,0xB40007B4,0xA800109D,0xFCD01F02,0xFE6416CD,0xC6000659,0x8A002420,0x25FC2420,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table228[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0xA00000,0xA00000,0xA00000,0xA00000,0xEC0000,0xEC0000,0xEC0000,0x1E40000,0x1E40000,0x4E000001,0xEC0000,0xEC0000,0xEC0000,0x1E40000,0x1E40000,0x4E000001,0x1E40000,0x1E40000,0x4E000001,0x4E000001,0xEC0000,0xEC0000,0xEC0000,0x1E40000,0x1E40000,0x4E000001,0x1E40000,0x1E40000,0x4E000001,0x4E000001,0x1E40000, -0x1E40000,0x4E000001,0x4E000001,0x4E000001,0x4B80000,0xCA80000,0xA00000,0xD80000,0x10C0000,0x1540000,0x1880000,0x11FC0000,0xC80000,0xEC0000,0x1540000,0x4E000001,0x1540000,0x1B00000,0x89FC0000,0xC5FC0000,0xD6000001,0x89FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0x89FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xD6000001, -0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0x89FC0000,0xC5FC0000,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0xC5FC0000,0xD6000001,0xD6000001,0xD6000001,0xD6000001,0x41FC0000,0x1CC0000,0x1CC0000,0xA1FC0000,0xBFF80000,0xCFF40000,0xD6000001,0xD6000001,0x6FFC0000,0xADFC0000,0xD5FC0000,0xD6000001, -0xB7FC0000,0x9479B0,0xFE5C3CE5,0xFE381579,0xD63410AD,0xFE3C3A48,0xFE08077D,0xD804003A,0xDC002420,0xB8000B51,0x92002422,0xFE185E40,0xFA0022E9,0xC6001283,0xC0003629,0xAE0018E6,0x8C002CC6,0x92005D2B,0x90003993,0x78004373,0x62005D2B,0xDC79B0,0xE2003A6D,0xB40021B4,0xAE00448A,0xA200246F,0x86003492,0x8600656B,0x840041BB,0x7200496F,0x5E006114,0x1BC79B0, 
-0x72005634,0x66005924,0x56006BE7,0x480079B4,0xFE645556,0xFC886C60,0xFE8C6CD0,0xFE1833EE,0xFC001D3E,0xDA0018C5,0xBC00109D,0xA00021F6,0xFE345496,0xFE042F97,0xA6002D3F,0x7200496F,0x13879B0,0xC45D2C,0xFE8C2FC1,0xFE581024,0xD6500C81,0xFE5C34E6,0xFE140759,0xDA080032,0xDC002420,0xB8000B51,0x92002422,0x3245D2B,0xFA0022E9,0xC6001283,0xC0003629,0xAE0018E6, -0x8C002CC6,0x17FC5D2B,0x90003993,0x78004373,0x62005D2B,0x3245D2B,0xFA0022E9,0xC6001283,0xC0003629,0xAE0018E6,0x8C002CC6,0x17FC5D2B,0x90003993,0x78004373,0x62005D2B,0x17FC5D2B,0x90003993,0x78004373,0x62005D2B,0x62005D2B,0xFE9448B3,0xFEAC5556,0xF8C05444,0xFE4031C3,0xFC001D3E,0xDA0018C5,0xBC00109D,0xA00021F6,0xFE5C48FD,0xFE042E53,0xA6002CDB,0x78004373, -0x1A45D2B,0x3410AC,0x3410AC,0x3410AC,0x3410AC,0xF2000000,0xF2000000,0xF2000000,0x76000000,0x76000000,0x4E000001,0x78000C80,0x78000C80,0x78000C80,0x620004A9,0x620004A9,0x46000262,0x3A000C80,0x3A000C80,0x380007B5,0x26000C82,0x5010AB,0x5010AB,0x5010AB,0x4C0007F2,0x4C0007F2,0x40000453,0x34000DC1,0x34000DC1,0x320008EE,0x24000D22,0xA010AB, -0xA010AB,0x2C000BD1,0x20000EC6,0x1A0010AB,0xFE1007AA,0xFE2C0B01,0x3410AC,0xEC000562,0x9E0004E1,0x800004FD,0x6E00040D,0x560005C9,0xF40008E9,0xB2000742,0x46000C98,0x320008EE,0x7010AB,0x500C80,0x500C80,0x500C80,0x500C80,0xF2000000,0xF2000000,0xF2000000,0x76000000,0x76000000,0x4E000001,0x2740C80,0x2740C80,0x2740C80,0x620004A9,0x620004A9, -0x46000262,0xF00C80,0xF00C80,0x380007B5,0x26000C82,0x2740C80,0x2740C80,0x2740C80,0x620004A9,0x620004A9,0x46000262,0xF00C80,0xF00C80,0x380007B5,0x26000C82,0xF00C80,0xF00C80,0x380007B5,0x26000C82,0x26000C82,0xFE200745,0xF8400884,0x500C80,0xEC000562,0x9E0004E1,0x800004FD,0x6E00040D,0x560005C9,0xF4000808,0xB20006C9,0xA80C80,0x380007B5, -0xA80C80,0x1282420,0xFEE81074,0xFEAC0124,0xD6A00001,0x1B82420,0xFE2C069D,0xD61C0001,0x61F82420,0xB8000B51,0x92002422,0x1B82420,0xFE2C069D,0xD61C0001,0x61F82420,0xB8000B51,0x92002422,0x61F82420,0xB8000B51,0x92002422,0x92002422,0x1B82420,0xFE2C069D,0xD61C0001,0x61F82420,0xB8000B51,0x92002422,0x61F82420,0xB8000B51,0x92002422,0x92002422,0x61F82420, -0xB8000B51,0x92002422,0x92002422,0x92002422,0xFEF81F85,0x13C2420,0xF9202081,0xFEB018A0,0xFE281109,0xE8000B14,0xBE0006B2,0xB2000F82,0xFEDC1F22,0xFE881771,0xD4000541,0x92002422,0x37FC2420,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table229[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x1,0xB00000,0xB00000,0xB00000,0xB00000,0x1040000,0x1040000,0x1040000,0x7FC0000,0x7FC0000,0x56000001,0x1040000,0x1040000,0x1040000,0x7FC0000,0x7FC0000,0x56000001,0x7FC0000,0x7FC0000,0x56000001,0x56000001,0x1040000,0x1040000,0x1040000,0x7FC0000,0x7FC0000,0x56000001,0x7FC0000,0x7FC0000,0x56000001,0x56000001,0x7FC0000, -0x7FC0000,0x56000001,0x56000001,0x56000001,0xCC0000,0xBC0000,0xB00000,0xF00000,0x1280000,0x1740000,0x1AC0000,0x1DF40000,0xDC0000,0x1040000,0x1740000,0x56000001,0x1740000,0x1C00000,0xA3FC0000,0xD1FC0000,0xDE000001,0xA3FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xDE000001,0xDE000001,0xA3FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xDE000001, 
-0xDE000001,0xD1FC0000,0xDE000001,0xDE000001,0xDE000001,0xA3FC0000,0xD1FC0000,0xDE000001,0xD1FC0000,0xDE000001,0xDE000001,0xD1FC0000,0xDE000001,0xDE000001,0xDE000001,0xD1FC0000,0xDE000001,0xDE000001,0xDE000001,0xDE000001,0x69FC0000,0x7DC0000,0x7DC0000,0xB3FC0000,0xCBFC0000,0xD9F40000,0xDE000001,0xDE000001,0x8DFC0000,0xBDFC0000,0xDFD00000,0xDE000001, -0xC5FC0000,0x9C8330,0xFE684541,0xFE401B24,0xDE3C142D,0xFE443FEA,0xFE140A2D,0xE008009A,0xE6002422,0xC4000A69,0x9A002422,0xFE246584,0xFA002639,0xCC0014AB,0xCC003831,0xB4001956,0x92002DC2,0x9A006380,0x96003D53,0x7E0046EB,0x66006383,0xE88330,0xE8003F39,0xC0002534,0xBA00483A,0xA8002617,0x8C00366A,0x8C006CB7,0x8A00465F,0x78004D93,0x620067CB,0x1D88330, -0x78005CE4,0x6C005F04,0x5C0073EF,0x4C008334,0xFE645DA6,0xFE8C7530,0xFE8C7650,0xFE303A6D,0xFE04205E,0xDE001961,0xC40010B2,0xAA002336,0xFE445CDB,0xFE0435B7,0xB600301B,0x78004D93,0x14C8330,0xD06380,0xFE983611,0xFE641478,0xDE580F21,0xFE683962,0xFE2009C5,0xE20C007E,0xE6002422,0xC4000A69,0x9A002422,0x1346380,0xFA002639,0xCC0014AB,0xCC003831,0xB4001956, -0x92002DC2,0x1FF86380,0x96003D53,0x7E0046EB,0x66006383,0x1346380,0xFA002639,0xCC0014AB,0xCC003831,0xB4001956,0x92002DC2,0x1FF86380,0x96003D53,0x7E0046EB,0x66006383,0x1FF86380,0x96003D53,0x7E0046EB,0x66006383,0x66006383,0xFE944E93,0xFAC45BA4,0xFCC85A64,0xFE403763,0xFE04205A,0xDE001961,0xC40010B2,0xAA002336,0xFE784F21,0xFE18341D,0xB6002FA2,0x7E0046EB, -0x1B86380,0x3C142C,0x3C142C,0x3C142C,0x3C142C,0xFE000010,0xFE000010,0xFE000010,0x82000000,0x82000000,0x56000001,0x84000F20,0x84000F20,0x84000F20,0x680005A5,0x680005A5,0x4C0002EA,0x40000F20,0x40000F20,0x3E000955,0x2A000F22,0x54142B,0x54142B,0x54142B,0x580009A2,0x580009A2,0x46000543,0x3A0010A9,0x3A0010A9,0x38000ACE,0x28000FDB,0xAC142B, -0xAC142B,0x2C000E41,0x240011F3,0x1C00142B,0xFE100A3A,0xFE2C0DF1,0x3C142C,0xFA000682,0xA80005ED,0x840005EA,0x800004FD,0x56000709,0xF4000B09,0xB20008D2,0x52000F39,0x38000ACE,0x78142B,0x580F20,0x580F20,0x580F20,0x580F20,0xFE04000D,0xFE04000D,0xFE04000D,0x82000000,0x82000000,0x56000001,0x2800F20,0x2800F20,0x2800F20,0x680005A5,0x680005A5, -0x4C0002EA,0x1080F20,0x1080F20,0x3E000955,0x2A000F22,0x2800F20,0x2800F20,0x2800F20,0x680005A5,0x680005A5,0x4C0002EA,0x1080F20,0x1080F20,0x3E000955,0x2A000F22,0x1080F20,0x1080F20,0x3E000955,0x2A000F22,0x2A000F22,0xFE200965,0xFC480AB4,0x580F20,0xFA000682,0xA80005ED,0x840005EA,0x800004FD,0x56000709,0xF6040A20,0xBC000848,0xB80F20,0x3E000955, -0xB80F20,0x1382420,0xFF0011A4,0xFEB801F4,0xDEB00001,0x1D02420,0xFE4C07F9,0xDE2C0001,0x6DF82420,0xC4000A69,0x9A002422,0x1D02420,0xFE4C07F9,0xDE2C0001,0x6DF82420,0xC4000A69,0x9A002422,0x6DF82420,0xC4000A69,0x9A002422,0x9A002422,0x1D02420,0xFE4C07F9,0xDE2C0001,0x6DF82420,0xC4000A69,0x9A002422,0x6DF82420,0xC4000A69,0x9A002422,0x9A002422,0x6DF82420, -0xC4000A69,0x9A002422,0x9A002422,0x9A002422,0xFD102000,0x14C2420,0xFF2C208D,0xFEC01945,0xFE501231,0xF6000A12,0xC80005D2,0xBA000EBA,0xFEF01FA9,0xFEA01865,0xDE000465,0x9A002422,0x45FC2420,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, -0x0,0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table230[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000, 
-0x40000,0x80000,0x80000,0x80000,0x1,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x80000,0x80000,0x80000,0x1,0x80000,0x80000,0x80000,0x1,0x1,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x40000,0x80000,0x80000, -0x80000,0xC00000,0xC00000,0xC00000,0xC00000,0x11C0000,0x11C0000,0x11C0000,0x13FC0000,0x13FC0000,0x5E000001,0x11C0000,0x11C0000,0x11C0000,0x13FC0000,0x13FC0000,0x5E000001,0x13FC0000,0x13FC0000,0x5E000001,0x5E000001,0x11C0000,0x11C0000,0x11C0000,0x13FC0000,0x13FC0000,0x5E000001,0x13FC0000,0x13FC0000,0x5E000001,0x5E000001,0x13FC0000, -0x13FC0000,0x5E000001,0x5E000001,0x5E000001,0xE00000,0xCC0000,0xC00000,0x1040000,0x3400000,0x1980000,0x1D40000,0x27F80000,0xF00000,0x11C0000,0x1980000,0x5E000001,0x1980000,0x1D00000,0xBBFC0000,0xDDFC0000,0xE6000001,0xBBFC0000,0xDDFC0000,0xE6000001,0xDDFC0000,0xE6000001,0xE6000001,0xBBFC0000,0xDDFC0000,0xE6000001,0xDDFC0000,0xE6000001, -0xE6000001,0xDDFC0000,0xE6000001,0xE6000001,0xE6000001,0xBBFC0000,0xDDFC0000,0xE6000001,0xDDFC0000,0xE6000001,0xE6000001,0xDDFC0000,0xE6000001,0xE6000001,0xE6000001,0xDDFC0000,0xE6000001,0xE6000001,0xE6000001,0xE6000001,0x91FC0000,0xFEC0000,0xFEC0000,0xC7FC0000,0xD9FC0000,0xE3F40000,0xE6000001,0xE6000001,0xABFC0000,0xCFFC0000,0xE7E00000,0xE6000001, -0xD5FC0000,0xA48BEC,0xFE744D71,0xFE4C20D8,0xE6401785,0xFE5045F6,0xFE140D79,0xEC0C0111,0xF2042424,0xCA00098D,0xA2002426,0xFE306BE4,0xFA002945,0xD80015EB,0xD8003941,0xC00018FE,0x98002E12,0xA2006878,0x9C003FBB,0x84004933,0x6C00687B,0xF48BEC,0xEE004349,0xC6002804,0xC6004B2A,0xAE002717,0x920037A6,0x980072CF,0x960049C7,0x7E00509F,0x68006D47,0x1F08BEC, -0x7E0062AC,0x72006404,0x5C007ADF,0x52008BEC,0xFE786595,0xFE8C7DDC,0xF8A07F6D,0xFE3040E1,0xFE04237E,0xEE00192E,0xC8000FFB,0xB6002345,0xFE546487,0xFE043C07,0xB60031CB,0x7E00509F,0x15C8BEC,0xDC6878,0xFEA43B8D,0xFE701888,0xE6601145,0xFE743D82,0xFE2C0C8D,0xEA1400D9,0xF2042420,0xCA00098D,0xA2042422,0x3446878,0xFA002945,0xD80015EB,0xD8003941,0xC00018FE, -0x98002E12,0x27FC6878,0x9C003FBB,0x84004933,0x6C00687B,0x3446878,0xFA002945,0xD80015EB,0xD8003941,0xC00018FE,0x98002E12,0x27FC6878,0x9C003FBB,0x84004933,0x6C00687B,0x27FC6878,0x9C003FBB,0x84004933,0x6C00687B,0x6C00687B,0xFEA053D2,0xFECC6050,0xFECC5F74,0xFE543C9E,0xFE04237A,0xEE00192E,0xC8000FFB,0xB6002345,0xFE885438,0xFE2C3966,0xB6003152,0x84004933, -0x1D46878,0x401784,0x401784,0x401784,0x401784,0xFE0C0074,0xFE0C0074,0xFE0C0074,0x8E000004,0x8E000004,0x5E000005,0x94001142,0x94001142,0x94001142,0x74000631,0x74000631,0x5800030A,0x48001142,0x48001142,0x44000A69,0x30001142,0x601783,0x601783,0x601783,0x62000ACD,0x62000ACD,0x4C0005EB,0x4000132D,0x4000132D,0x3E000C3A,0x2E00122B,0xC41783, -0xC41783,0x3200106D,0x260014AE,0x20001783,0xFE200C8D,0xFE2C110D,0x401784,0xFA00078A,0xBC000679,0x8C000682,0x8C000559,0x6A0007B2,0xFE000D14,0xD00009C2,0x5C001166,0x3E000C3A,0x8C1783,0x601144,0x601144,0x601144,0x601144,0xFE0C0050,0xFE0C0050,0xFE0C0050,0x8C040001,0x8C040001,0x5E040001,0x901142,0x901142,0x901142,0x74000631,0x74000631, -0x5800030A,0x1241142,0x1241142,0x44000A69,0x30001142,0x901142,0x901142,0x901142,0x74000631,0x74000631,0x5800030A,0x1241142,0x1241142,0x44000A69,0x30001142,0x1241142,0x1241142,0x44000A69,0x30001142,0x30001142,0xFA340B48,0xFE4C0CA0,0x601144,0xFA00078A,0xBC000679,0x8C000682,0x8C000559,0x6A0007B2,0xFE0C0BE2,0xD0000919,0xD01142,0x44000A69, 
-0xD01142,0x1482420,0xFF0C12C8,0xFECC02FD,0xE6C00001,0x1E82420,0xFE640949,0xE63C0001,0x79F82420,0xCA000989,0xA2002422,0x1E82420,0xFE640949,0xE63C0001,0x79F82420,0xCA000989,0xA2002422,0x79F82420,0xCA000989,0xA2002422,0xA2002422,0x1E82420,0xFE640949,0xE63C0001,0x79F82420,0xCA000989,0xA2002422,0x79F82420,0xCA000989,0xA2002422,0xA2002422,0x79F82420, -0xCA000989,0xA2002422,0xA2002422,0xA2002422,0xFF142048,0x75C2420,0xF9402104,0xFED81A29,0xFE68133D,0xFC000928,0xD40004ED,0xC2000DFA,0xFF102000,0xFEC01919,0xE60003BA,0xA2002422,0x55FC2420,0x4,0x4,0x4,0x4,0x4,0x4,0x4,0x4,0x4,0x4,0x4000000,0x4000000,0x4000000,0x4000000,0x4000000, -0x4000000,0x2000000,0x2000000,0x2000000,0x1,0x2000002,0x2000002,0x2000002,0x2000002,0x2000002,0x2000002,0x1,0x1,0x1,0x1,0x2,0x2,0x2,0x2,0x2,0x18000000,0x4,0x4,0xA000000,0x8000000,0x6000000,0x6000000,0x4000000,0x4000001,0x2000001,0x2000000,0x1, -0x2,}; -static const uint32_t g_etc1_to_bc7_m6_table231[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x140000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000, -0x1C0000,0x380000,0x380000,0x380000,0x8000001,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x1C0000,0x380000,0x380000,0x380000,0x8000001,0x380000,0x380000,0x380000,0x8000001,0x8000001,0x2140000,0x140000,0x140000,0x180000,0x180000,0x2180000,0x2180000,0x200000,0x180000,0x180000,0x280000,0x380000, -0x280000,0xD00000,0xD00000,0xD00000,0xD00000,0x1340000,0x1340000,0x1340000,0x1FF80000,0x1FF80000,0x66000001,0x1340000,0x1340000,0x1340000,0x1FF80000,0x1FF80000,0x66000001,0x1FF80000,0x1FF80000,0x66000001,0x66000001,0x1340000,0x1340000,0x1340000,0x1FF80000,0x1FF80000,0x66000001,0x1FF80000,0x1FF80000,0x66000001,0x66000001,0x1FF80000, -0x1FF80000,0x66000001,0x66000001,0x66000001,0x2F00000,0x6DC0000,0xD00000,0x3180000,0x15C0000,0x1B80000,0x1FC0000,0x33F40000,0x1040000,0x1340000,0x1B80000,0x66000001,0x1B80000,0x1E00000,0xD3FC0000,0xE9FC0000,0xEE000001,0xD3FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xD3FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xEE000001, -0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0xD3FC0000,0xE9FC0000,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0xE9FC0000,0xEE000001,0xEE000001,0xEE000001,0xEE000001,0xB7FC0000,0x17FC0000,0x17FC0000,0xDBFC0000,0xE7F80000,0xEDF40000,0xEE000001,0xEE000001,0xC9FC0000,0xE1FC0000,0xEFF00000,0xEE000001, -0xE3FC0000,0xB09144,0xFE805385,0xFE582588,0xEE4C19B9,0xFE5C4AD6,0xFE201109,0xF6140195,0xFC0C2454,0xD6000915,0xAA082456,0xFE3C6E48,0xFE08296D,0xE200142D,0xE80036E6,0xC60015E6,0xA2002BFB,0xAE006878,0xA6003DB5,0x90004763,0x7400687B,0x1049144,0xFA004419,0xD2002844,0xD2004B5A,0xBA00258F,0x9E0036E6,0xA2007431,0x9C0048BB,0x8A004FCF,0x6E006E1B,0x7FC9144, -0x840064EC,0x780065C4,0x66007DB4,0x58009144,0xFE786AC5,0xFAA48366,0xFEAC8499,0xFE40451B,0xFE04257E,0xFC0015D9,0xD2000D46,0xB6002035,0xFE5C697E,0xFE184056,0xBE002F7B,0x8A004FCF,0x1749144,0xEC6878,0xFEB03D85,0xFE7C1A74,0xEE701145,0xFE8C3F52,0xFE380EE1,0xF22400D9,0xFA142420,0xD6000915,0xAA142422,0x35C6878,0xFE08295D,0xE200142D,0xE80036E6,0xC60015E6, 
-0xA2002BFB,0x33FC6878,0xA6003DB5,0x90004763,0x7400687B,0x35C6878,0xFE08295D,0xE200142D,0xE80036E6,0xC60015E6,0xA2002BFB,0x33FC6878,0xA6003DB5,0x90004763,0x7400687B,0x33FC6878,0xA6003DB5,0x90004763,0x7400687B,0x7400687B,0xFEB45485,0xF8E06116,0xFAE4601D,0xFE6C3E89,0xFE102549,0xFC0015D9,0xD2000D46,0xB6002035,0xFE985529,0xFE403AFB,0xBE002EEB,0x90004763, -0x1F46878,0x4C19B8,0x4C19B8,0x4C19B8,0x4C19B8,0xFE180124,0xFE180124,0xFE180124,0x98080034,0x98080034,0x66080035,0xAC001142,0xAC001142,0xAC001142,0x80000521,0x80000521,0x620001FD,0x54001142,0x54001142,0x4A000989,0x38001142,0x7019B8,0x7019B8,0x7019B8,0x6E000AFD,0x6E000AFD,0x58000593,0x4C0013E5,0x4C0013E5,0x44000BEA,0x34001283,0xE419B8, -0xE419B8,0x38001155,0x2C0015E6,0x240019BB,0xFE200E1D,0xFA441304,0x4C19B8,0xFE08079A,0xD4000562,0xA2000562,0x96000448,0x6E0006AD,0xFE000E34,0xF2000912,0x6600116E,0x44000BEA,0xA019B8,0x701144,0x701144,0x701144,0x701144,0xFE240080,0xFE240080,0xFE240080,0x94140001,0x94140001,0x66140001,0xA81142,0xA81142,0xA81142,0x80000521,0x80000521, -0x620001FD,0x1581142,0x1581142,0x4A000989,0x38001142,0xA81142,0xA81142,0xA81142,0x80000521,0x80000521,0x620001FD,0x1581142,0x1581142,0x4A000989,0x38001142,0x1581142,0x1581142,0x4A000989,0x38001142,0x38001142,0xFE3C0B68,0xFA640CD1,0x701144,0xFE08078A,0xD4000562,0xA2000562,0x96000448,0x6E0006AD,0xFE140C14,0xF2000831,0xF01142,0x4A000989, -0xF01142,0x1582420,0xFF241408,0xFEE40425,0xEED00001,0x3FC2420,0xFE880AB5,0xEE4C0001,0x85F82420,0xD60008B1,0xAA002422,0x3FC2420,0xFE880AB5,0xEE4C0001,0x85F82420,0xD60008B1,0xAA002422,0x85F82420,0xD60008B1,0xAA002422,0xAA002422,0x3FC2420,0xFE880AB5,0xEE4C0001,0x85F82420,0xD60008B1,0xAA002422,0x85F82420,0xD60008B1,0xAA002422,0xAA002422,0x85F82420, -0xD60008B1,0xAA002422,0xAA002422,0xAA002422,0xFF342081,0xF6C2420,0xFF4C2114,0xFEEC1AC8,0xFE90147D,0xFE040910,0xDE000431,0xD0000D22,0xFF182032,0xFECC1A04,0xF2000301,0xAA002422,0x63FC2420,0x80034,0x80034,0x80034,0x80034,0x80034,0x80034,0x80034,0x80034,0x80034,0x80034,0x1C000000,0x1C000000,0x1C000000,0x1C000000,0x1C000000, -0x1C000000,0xE000000,0xE000000,0xE000000,0x8000001,0x20C0032,0x20C0032,0x20C0032,0x20C0032,0x20C0032,0x20C0032,0xC000011,0xC000011,0xC000011,0x800000A,0x180032,0x180032,0x180032,0x4000022,0x4000032,0x98000000,0x80034,0x80034,0x44000000,0x2E000000,0x24000000,0x24000000,0x18000000,0x3600000D,0x24000008,0x12000001,0xC000011, -0x140032,}; -static const uint32_t g_etc1_to_bc7_m6_table232[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x240001,0x380000,0x380000,0x380000,0x380000,0x380000, -0x380000,0x700000,0x700000,0x700000,0x12000000,0x380000,0x380000,0x380000,0x380000,0x380000,0x380000,0x700000,0x700000,0x700000,0x12000000,0x700000,0x700000,0x700000,0x12000000,0x12000000,0x280000,0x240001,0x240001,0x2C0000,0x300000,0x340000,0x340000,0x400000,0x2C0000,0x300000,0x500000,0x700000, -0x500000,0xE00001,0xE00001,0xE00001,0xE00001,0x1500000,0x1500000,0x1500000,0x2DF80000,0x2DF80000,0x70000000,0x1500000,0x1500000,0x1500000,0x2DF80000,0x2DF80000,0x70000000,0x2DF80000,0x2DF80000,0x70000000,0x70000000,0x1500000,0x1500000,0x1500000,0x2DF80000,0x2DF80000,0x70000000,0x2DF80000,0x2DF80000,0x70000000,0x70000000,0x2DF80000, 
-0x2DF80000,0x70000000,0x70000000,0x70000000,0x7040000,0xF00000,0xE00001,0x1340000,0x17C0000,0x1E00000,0x11FC0000,0x3FF40000,0x11C0000,0x1500000,0x1E00000,0x70000000,0x1E00000,0x1F00001,0xEFFC0000,0xF7F80000,0xF8000000,0xEFFC0000,0xF7F80000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xEFFC0000,0xF7F80000,0xF8000000,0xF7F80000,0xF8000000, -0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xEFFC0000,0xF7F80000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xF7F80000,0xF8000000,0xF8000000,0xF8000000,0xF8000000,0xE3FC0000,0xA7FC0000,0xA7FC0000,0xF1FC0000,0xF5FC0000,0xF9F00000,0xF8000000,0xF8000000,0xEBFC0000,0xF3FC0000,0xF9E40000,0xF8000000, -0xF5FC0000,0xC097BB,0xFE8C5AEA,0xFE642B8F,0xF8581C9A,0xFE6850F9,0xFE3815EA,0xFC200282,0xFE14251D,0xDE040922,0xB41424D5,0xFE4471AC,0xFE082B02,0xEE0012C6,0xF40034A9,0xD20012B1,0xAE002A0C,0xBC006878,0xB2003B2E,0x9600456C,0x7E006878,0x31897B9,0xFA004686,0xE200290A,0xE2004B75,0xC6002442,0xA8003621,0xAE00761C,0xA60047DF,0x90004F04,0x7A006EE8,0x11FC97B9, -0x900067C5,0x7E006825,0x6C008115,0x5E0097B9,0xFE8C716F,0xFEAC89AD,0xFEAC8B86,0xFE544AE1,0xFE1028CE,0xFC00135A,0xDE000A4D,0xCA001CD5,0xFE707042,0xFE244643,0xD6002C8C,0x90004F04,0x19497B9,0xFC687B,0xFEC43F8B,0xFE941CBB,0xF8841142,0xFE984159,0xFE5811F2,0xFC3800DA,0xFE2C2431,0xDE10090E,0xB4242421,0x1786878,0xFE082AF2,0xEE0012C6,0xF40034A9,0xD20012B1, -0xAE002A0C,0x41FC6878,0xB2003B2E,0x9600456C,0x7E006878,0x1786878,0xFE082AF2,0xEE0012C6,0xF40034A9,0xD20012B1,0xAE002A0C,0x41FC6878,0xB2003B2E,0x9600456C,0x7E006878,0x41FC6878,0xB2003B2E,0x9600456C,0x7E006878,0x7E006878,0xFED055D8,0xFEEC6131,0xFEEC6086,0xFE8040FD,0xFE242846,0xFC00135A,0xDE000A4D,0xCA001CD5,0xFEB456A6,0xFE5C3DAE,0xD6002BE3,0x9600456C, -0xFFC6878,0x581C9A,0x581C9A,0x581C9A,0x581C9A,0xFE240266,0xFE240266,0xFE240266,0xA21400B5,0xA21400B5,0x701400B5,0xC8001142,0xC8001142,0xC8001142,0x9200040D,0x9200040D,0x68000121,0x60001144,0x60001144,0x56000895,0x40001144,0x2801C9A,0x2801C9A,0x2801C9A,0x7A000B7D,0x7A000B7D,0x6200056D,0x580014D3,0x580014D3,0x50000BA2,0x400012FD,0x1081C9A, -0x1081C9A,0x440012AD,0x32001788,0x2A001C9D,0xFC38101A,0xFE4C1596,0x581C9A,0xFE100871,0xF2000448,0xC0000464,0xAA00031D,0x80000585,0xFE140FDB,0xFC000884,0x7C001182,0x50000BA2,0xB81C9A,0x841142,0x841142,0x841142,0x841142,0xFE3400C1,0xFE3400C1,0xFE3400C1,0x9E240001,0x9E240001,0x70240001,0xC41142,0xC41142,0xC41142,0x9200040D,0x9200040D, -0x68000121,0x18C1142,0x18C1142,0x56000895,0x40001144,0xC41142,0xC41142,0xC41142,0x9200040D,0x9200040D,0x68000121,0x18C1142,0x18C1142,0x56000895,0x40001144,0x18C1142,0x18C1142,0x56000895,0x40001144,0x40001144,0xFE580B95,0xFE6C0D0D,0x841142,0xFE1807E9,0xF2000448,0xC0000464,0xAA00031D,0x80000585,0xFE340C31,0xFC000784,0x1181142,0x56000895, -0x1181142,0x1682422,0xFF301572,0xFEFC05B9,0xF8E00001,0x1FFC2420,0xFEA00C69,0xF85C0000,0x91FC2420,0xDC0007D9,0xB4002420,0x1FFC2420,0xFEA00C69,0xF85C0000,0x91FC2420,0xDC0007D9,0xB4002420,0x91FC2420,0xDC0007D9,0xB4002420,0xB4002420,0x1FFC2420,0xFEA00C69,0xF85C0000,0x91FC2420,0xDC0007D9,0xB4002420,0x91FC2420,0xDC0007D9,0xB4002420,0xB4002420,0x91FC2420, -0xDC0007D9,0xB4002420,0xB4002420,0xB4002420,0xFD4C2102,0x9802420,0xFB642185,0xFF141BC1,0xFEA815C1,0xFE0409FA,0xE8000371,0xD8000C44,0xFF2C20D5,0xFEF01B2D,0xFE000248,0xB4002420,0x75FC2420,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x1400B5,0x38000000,0x38000000,0x38000000,0x38000000,0x38000000, 
-0x38000000,0x1C000000,0x1C000000,0x1C000000,0x12000000,0x1C00B5,0x1C00B5,0x1C00B5,0x1C00B5,0x1C00B5,0x1C00B5,0x18000044,0x18000044,0x18000044,0x12000024,0x3000B5,0x3000B5,0x3000B5,0xA000071,0x80000B5,0xFA040005,0x1400B5,0x1400B5,0x84000000,0x5C000000,0x46000000,0x46000000,0x2E000000,0x76000035,0x5600001A,0x22000004,0x18000044, -0x2400B5,}; -static const uint32_t g_etc1_to_bc7_m6_table233[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x340001,0x340001,0x340001,0x340001,0x340001,0x340001,0x340001,0x340001,0x340001,0x340001,0x500000,0x500000,0x500000,0x500000,0x500000, -0x500000,0xA00000,0xA00000,0xA00000,0x1A000000,0x500000,0x500000,0x500000,0x500000,0x500000,0x500000,0xA00000,0xA00000,0xA00000,0x1A000000,0xA00000,0xA00000,0xA00000,0x1A000000,0x1A000000,0x4380000,0x340001,0x340001,0x63C0000,0x440000,0x480000,0x480000,0x580000,0x63C0000,0x440000,0x700000,0xA00000, -0x700000,0xF00001,0xF00001,0xF00001,0xF00001,0x1680000,0x1680000,0x1680000,0x39F80000,0x39F80000,0x78000000,0x1680000,0x1680000,0x1680000,0x39F80000,0x39F80000,0x78000000,0x39F80000,0x39F80000,0x78000000,0x78000000,0x1680000,0x1680000,0x1680000,0x39F80000,0x39F80000,0x78000000,0x39F80000,0x39F80000,0x78000000,0x78000000,0x39F80000, -0x39F80000,0x78000000,0x78000000,0x78000000,0x3180000,0x9000000,0xF00001,0x1480000,0x3940000,0x3FC0000,0x1FF80000,0x49F80000,0x1300000,0x1680000,0x3FC0000,0x78000000,0x3FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0xC89B73,0xFE98601A,0xFE70308B,0xFE641F86,0xFE7454F9,0xFE3819DE,0xFE280401,0xFE20257D,0xE4100916,0xBA1C2481,0xFE507334,0xFE142CA6,0xFA0011D2,0xFA0031B5,0xDE000F8D,0xB4002774,0xC6006693,0xB80037EE,0x9C004258,0x86006694,0x12C9B71,0xFA0048EA,0xE20029B6,0xE8004A85,0xCC0022AE,0xAE003499,0xBA0075EC,0xB20045F3,0x9A004CE6,0x80006DCC,0x1BF89B71, -0x9600690D,0x8A0068A1,0x7200821D,0x64009B71,0xFE94756C,0xF8C08E33,0xFAC48FAF,0xFE544EB5,0xFE242B66,0xFC0011E2,0xEA0007C2,0xD0001968,0xFE7873EA,0xFE40496C,0xD6002964,0x9A004CE6,0x1AC9B71,0x10C6693,0xFED04023,0xFEAC1E33,0xFE941142,0xFEB0416D,0xFE6413AA,0xFE4400F5,0xFE402373,0xE41C08A6,0xBA342315,0x38C6693,0xFE202C26,0xFA0011D2,0xFA0031B5,0xDE000F8D, -0xB4002774,0x4BFC6693,0xB80037EE,0x9C004258,0x86006694,0x38C6693,0xFE202C26,0xFA0011D2,0xFA0031B5,0xDE000F8D,0xB4002774,0x4BFC6693,0xB80037EE,0x9C004258,0x86006694,0x4BFC6693,0xB80037EE,0x9C004258,0x86006694,0x86006694,0xFED05534,0xF900600B,0xFD085F33,0xFE9440FA,0xFE3829B2,0xFC0011E2,0xEA0007C2,0xD0001968,0xFEC455C3,0xFE843E6A,0xDE0028A6,0x9C004258, -0x1DF86693,0x641F86,0x641F86,0x641F86,0x641F86,0xFE280401,0xFE280401,0xFE280401,0xAC1C016D,0xAC1C016D,0x781C016D,0xE0001142,0xE0001142,0xE0001142,0xA200031D,0xA200031D,0x74000089,0x6C001144,0x6C001144,0x5C0007D9,0x48001144,0x901F85,0x901F85,0x901F85,0x86000C35,0x86000C35,0x680005B5,0x620015A4,0x620015A4,0x58000B8A,0x46001365,0x1241F85, 
-0x1241F85,0x4A001419,0x3E001918,0x30001F85,0xFE3C123E,0xF45818B1,0x641F86,0xFE180989,0xFC000384,0xC8000368,0xB800025D,0x8C0004A5,0xFE201212,0xFE0008CA,0x8600118A,0x58000B8A,0xD01F85,0x941142,0x941142,0x941142,0x941142,0xFE4400F5,0xFE4400F5,0xFE4400F5,0xA6340001,0xA6340001,0x78340001,0xDC1142,0xDC1142,0xDC1142,0xA200031D,0xA200031D, -0x74000089,0x1BC1142,0x1BC1142,0x5C0007D9,0x48001144,0xDC1142,0xDC1142,0xDC1142,0xA200031D,0xA200031D,0x74000089,0x1BC1142,0x1BC1142,0x5C0007D9,0x48001144,0x1BC1142,0x1BC1142,0x5C0007D9,0x48001144,0x48001144,0xFA6C0BE2,0xFC880D22,0x941142,0xFE340845,0xFC000384,0xC8000368,0xB800025D,0x8C0004A5,0xFA480C82,0xFE0407B4,0x1381142,0x5C0007D9, -0x1381142,0x1782312,0xFF441595,0xFF0806B9,0xFEF00001,0x35FC2312,0xFEB80D39,0xFE700000,0x9DF42312,0xE60006C4,0xBA002314,0x35FC2312,0xFEB80D39,0xFE700000,0x9DF42312,0xE60006C4,0xBA002314,0x9DF42312,0xE60006C4,0xBA002314,0xBA002314,0x35FC2312,0xFEB80D39,0xFE700000,0x9DF42312,0xE60006C4,0xBA002314,0x9DF42312,0xE60006C4,0xBA002314,0xBA002314,0x9DF42312, -0xE60006C4,0xBA002314,0xBA002314,0xBA002314,0xFF502032,0x1902312,0xFF6C2099,0xFF201B94,0xFED01619,0xFE180AF1,0xF20002A1,0xE0000B14,0xFD502000,0xFF081AAA,0xFE0001C4,0xBA002314,0x81FC2312,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x1C016D,0x50000000,0x50000000,0x50000000,0x50000000,0x50000000, -0x50000000,0x26000001,0x26000001,0x26000001,0x1A000000,0x28016D,0x28016D,0x28016D,0x28016D,0x28016D,0x28016D,0x22000082,0x22000082,0x22000082,0x18000044,0x4C016D,0x4C016D,0x4C016D,0x100000DD,0xC00016D,0xFE0C003D,0x1C016D,0x1C016D,0xBC000000,0x82000000,0x64000000,0x64000000,0x42000000,0xA0000074,0x7400003A,0x32000009,0x22000082, -0x34016D,}; -static const uint32_t g_etc1_to_bc7_m6_table234[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x440001,0x440001,0x440001,0x440001,0x440001,0x440001,0x440001,0x440001,0x440001,0x440001,0x680000,0x680000,0x680000,0x680000,0x680000, -0x680000,0xD00000,0xD00000,0xD00000,0x22000000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0xD00000,0xD00000,0xD00000,0x22000000,0xD00000,0xD00000,0xD00000,0x22000000,0x22000000,0xC480000,0x440001,0x440001,0x500000,0x580000,0x25C0000,0x25C0000,0x740000,0x500000,0x580000,0x940000,0xD00000, -0x940000,0x1000001,0x1000001,0x1000001,0x1000001,0x1800000,0x1800000,0x1800000,0x45F80000,0x45F80000,0x80000000,0x1800000,0x1800000,0x1800000,0x45F80000,0x45F80000,0x80000000,0x45F80000,0x45F80000,0x80000000,0x80000000,0x1800000,0x1800000,0x1800000,0x45F80000,0x45F80000,0x80000000,0x45F80000,0x45F80000,0x80000000,0x80000000,0x45F80000, -0x45F80000,0x80000000,0x80000000,0x80000000,0x12C0000,0x1140000,0x1000001,0x35C0000,0x1B00000,0x13FC0000,0x2BFC0000,0x55F40000,0x1440000,0x1800000,0x13FC0000,0x80000000,0x13FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0xD0985B,0xFEA46016,0xFE7C32E7,0xFE6C22C6,0xFE80534D,0xFE4C1AEE,0xFE300602,0xFE2C22E1,0xE8180846,0xBE242169,0xFE5C6E98,0xFE142B26,0xFC001148,0xFA002BC5,0xE2000B41,0xBA0021D8,0xD0005F33,0xBE0031CA,0xA6003B0D,0x8A005F34,0x1389859,0xFC004866,0xE80029FE,0xEE004581,0xD2001F56,0xB4002FCD,0xC0006FCC,0xB20040B3,0x9C004648,0x86006704,0x21F89859, -0x9C0065F9,0x8A0063E1,0x78007CE9,0x68009859,0xFEA072F9,0xFCC88B1B,0xFECC8CE7,0xFE6C4D3A,0xFE2829D6,0xFE040EE2,0xEE0004B9,0xD6001419,0xFE7871FA,0xFE4047CC,0xDE00246A,0x9C004648,0x1BC9859,0x1185F33,0xFEE83C13,0xFEB81D03,0xFEA41142,0xFEC43C4B,0xFE7C126A,0xFE5C013D,0xFE4C1F8B,0xE8300726,0xBE441F05,0x1A05F33,0xFE3829C6,0xFC001148,0xFA002BC5,0xE2000B41, -0xBA0021D8,0x55F85F33,0xBE0031CA,0xA6003B0D,0x8A005F34,0x1A05F33,0xFE3829C6,0xFC001148,0xFA002BC5,0xE2000B41,0xBA0021D8,0x55F85F33,0xBE0031CA,0xA6003B0D,0x8A005F34,0x55F85F33,0xBE0031CA,0xA6003B0D,0x8A005F34,0x8A005F34,0xFEE44F59,0xFF0C58D3,0xFF0C5843,0xFEB03CE2,0xFE4826EA,0xFE040EDE,0xEE0004B9,0xD6001419,0xFEDC4F89,0xFE8439AA,0xDE0023A6,0xA6003B0D, -0x27FC5F33,0x6C22C6,0x6C22C6,0x6C22C6,0x6C22C6,0xFE300602,0xFE300602,0xFE300602,0xB6240265,0xB6240265,0x80240265,0xF8001142,0xF8001142,0xF8001142,0xAE00025D,0xAE00025D,0x80000031,0x78001144,0x78001144,0x66000728,0x50001144,0xA422C5,0xA422C5,0xA422C5,0x92000D2D,0x92000D2D,0x7400063D,0x6E0016B4,0x6E0016B4,0x60000B84,0x4C0013E5,0x14C22C5, -0x14C22C5,0x500015CD,0x44001AD8,0x360022C5,0xFE4C14D5,0xFA641BA9,0x6C22C6,0xFE240B44,0xFE040388,0xE20002B1,0xCC00019A,0x960003E8,0xFC301481,0xFE000A0A,0x9600119B,0x60000B84,0xE822C5,0xA41142,0xA41142,0xA41142,0xA41142,0xFE5C013D,0xFE5C013D,0xFE5C013D,0xAE440001,0xAE440001,0x80440001,0xF41142,0xF41142,0xF41142,0xAE00025D,0xAE00025D, -0x80000031,0x1F01142,0x1F01142,0x66000728,0x50001144,0xF41142,0xF41142,0xF41142,0xAE00025D,0xAE00025D,0x80000031,0x1F01142,0x1F01142,0x66000728,0x50001144,0x1F01142,0x1F01142,0x66000728,0x50001144,0x50001144,0xFE740C02,0xFE8C0D6A,0xA41142,0xFE440894,0xFE040384,0xE20002B1,0xCC00019A,0x960003E8,0xFE500CB2,0xFE1807FD,0x15C1142,0x66000728, -0x15C1142,0x1801F02,0xFF501315,0xFF2005F1,0xFF000001,0x41FC1F02,0xFECC0BD5,0xFE880000,0xA1FC1F02,0xEC0004C8,0xBE001F04,0x41FC1F02,0xFECC0BD5,0xFE880000,0xA1FC1F02,0xEC0004C8,0xBE001F04,0xA1FC1F02,0xEC0004C8,0xBE001F04,0xBE001F04,0x41FC1F02,0xFECC0BD5,0xFE880000,0xA1FC1F02,0xEC0004C8,0xBE001F04,0xA1FC1F02,0xEC0004C8,0xBE001F04,0xBE001F04,0xA1FC1F02, -0xEC0004C8,0xBE001F04,0xBE001F04,0xBE001F04,0xF7681C99,0x5981F02,0xFF6C1D09,0xFF30184D,0xFEE81379,0xFE4809A1,0xF4000164,0xE0000894,0xFD581C22,0xFF1417A5,0xFE000124,0xBE001F04,0x89FC1F02,0x240265,0x240265,0x240265,0x240265,0x240265,0x240265,0x240265,0x240265,0x240265,0x240265,0x6A000000,0x6A000000,0x6A000000,0x6A000000,0x6A000000, -0x6A000000,0x32000001,0x32000001,0x32000001,0x22000000,0x340265,0x340265,0x340265,0x340265,0x340265,0x340265,0x280000DA,0x280000DA,0x280000DA,0x1E000074,0x640265,0x640265,0x640265,0x16000171,0x10000265,0xF21400C8,0x240265,0x240265,0xF6000000,0xAA000000,0x82000000,0x82000000,0x54000000,0xC40000C1,0x96000061,0x40000010,0x280000DA, -0x480265,}; -static const uint32_t g_etc1_to_bc7_m6_table235[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x540001,0x540001,0x540001,0x540001,0x540001,0x540001,0x540001,0x540001,0x540001,0x540001,0x800000,0x800000,0x800000,0x800000,0x800000, 
-0x800000,0x1000000,0x1000000,0x1000000,0x2A000000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x1000000,0x1000000,0x1000000,0x2A000000,0x1000000,0x1000000,0x1000000,0x2A000000,0x2A000000,0x5C0000,0x540001,0x540001,0x640000,0x6C0000,0x740000,0x740000,0x900000,0x640000,0x6C0000,0xB40000,0x1000000, -0xB40000,0x1100001,0x1100001,0x1100001,0x1100001,0x1980000,0x1980000,0x1980000,0x51F80000,0x51F80000,0x88000000,0x1980000,0x1980000,0x1980000,0x51F80000,0x51F80000,0x88000000,0x51F80000,0x51F80000,0x88000000,0x88000000,0x1980000,0x1980000,0x1980000,0x51F80000,0x51F80000,0x88000000,0x51F80000,0x51F80000,0x88000000,0x88000000,0x51F80000, -0x51F80000,0x88000000,0x88000000,0x88000000,0x73C0000,0x1240000,0x1100001,0x1740000,0x1CC0000,0x21FC0000,0x39FC0000,0x5FF80000,0x1580000,0x1980000,0x21FC0000,0x88000000,0x21FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0xD895C3,0xFEB0608A,0xFE8835A3,0xFE78265A,0xFE80521D,0xFE581C6A,0xFE440859,0xFE3820E5,0xEC2007F6,0xC22C1ED1,0xFE686AA4,0xFE202A26,0xFE0C116E,0xFE04269A,0xE80007B5,0xC0001CD4,0xD600582B,0xC4002C2E,0xAC003425,0x90005828,0x14495C1,0xFE08488A,0xEE002B26,0xF400410D,0xD8001CB6,0xB4002B8D,0xC6006A14,0xB8003BF7,0xA4004028,0x8C00609C,0x27F895C1, -0x9C006399,0x90005FA9,0x7E00782D,0x6C0095C1,0xFEA07179,0xFECC888F,0xFECC8AE7,0xFE6C4C2A,0xFE382902,0xFE040CD2,0xF4000288,0xD8000F71,0xFE906FEE,0xFE5046F2,0xE600202C,0xA4004028,0x1D095C1,0x120582B,0xFEF43833,0xFEC01C1A,0xFEB41142,0xFED03783,0xFE88113A,0xFE680195,0xFE641BE3,0xEA4005CA,0xC2541B35,0x1B05828,0xFE4C27C5,0xFE141142,0xFE08268D,0xE80007B5, -0xC0001CD4,0x5DF45828,0xC4002C2E,0xAC003425,0x90005828,0x1B05828,0xFE4C27C5,0xFE141142,0xFE08268D,0xE80007B5,0xC0001CD4,0x5DF45828,0xC4002C2E,0xAC003425,0x90005828,0x5DF45828,0xC4002C2E,0xAC003425,0x90005828,0x90005828,0xFEF8494A,0xFF0C5253,0xFF0C5243,0xFEB03872,0xFE682411,0xFE040CCE,0xF4000288,0xD8000F71,0xFEDC4979,0xFE9035DE,0xE6001F68,0xAC003425, -0x31FC5828,0x78265A,0x78265A,0x78265A,0x78265A,0xFE440859,0xFE440859,0xFE440859,0xC02C039D,0xC02C039D,0x882C039D,0xFE0C116E,0xFE0C116E,0xFE0C116E,0xBA0001BD,0xBA0001BD,0x88000004,0x84001144,0x84001144,0x72000668,0x58001144,0x2B0265A,0x2B0265A,0x2B0265A,0xA2000E36,0xA2000E36,0x7A00070D,0x740017D8,0x740017D8,0x68000B94,0x5200147D,0x168265A, -0x168265A,0x560017C9,0x4A001CC8,0x3A00265D,0xFE5817D5,0xFE6C1F01,0x78265A,0xFE340D49,0xFE14045E,0xEE0001F4,0xDA000112,0xAA00031D,0xFE341721,0xFE100BF2,0xA60011A8,0x68000B94,0xFC265A,0xB41142,0xB41142,0xB41142,0xB41142,0xFE680195,0xFE680195,0xFE680195,0xB6540001,0xB6540001,0x88540001,0x10C1142,0x10C1142,0x10C1142,0xBA0001BD,0xBA0001BD, -0x88000004,0xBF81142,0xBF81142,0x72000668,0x58001144,0x10C1142,0x10C1142,0x10C1142,0xBA0001BD,0xBA0001BD,0x88000004,0xBF81142,0xBF81142,0x72000668,0x58001144,0xBF81142,0xBF81142,0x72000668,0x58001144,0x58001144,0xFE900C31,0xFCA80D75,0xB41142,0xFE5C08C9,0xFE1803E8,0xEE0001F4,0xDA000112,0xAA00031D,0xFA700CD1,0xFE300869,0x17C1142,0x72000668, 
-0x17C1142,0x1881B32,0xFF5C10BD,0xFF2C052D,0xFF100001,0x4DFC1B32,0xFEE40A55,0xFEA00000,0xA7FC1B32,0xEC000328,0xC2001B34,0x4DFC1B32,0xFEE40A55,0xFEA00000,0xA7FC1B32,0xEC000328,0xC2001B34,0xA7FC1B32,0xEC000328,0xC2001B34,0xC2001B34,0x4DFC1B32,0xFEE40A55,0xFEA00000,0xA7FC1B32,0xEC000328,0xC2001B34,0xA7FC1B32,0xEC000328,0xC2001B34,0xC2001B34,0xA7FC1B32, -0xEC000328,0xC2001B34,0xC2001B34,0xC2001B34,0xFB7018F1,0x9A01B32,0xFB841962,0xFF401540,0xFEE81109,0xFE480871,0xF800009D,0xE4000665,0xFF5C1892,0xFF2014D2,0xFE1000E5,0xC2001B34,0x91FC1B32,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x2C039D,0x82000000,0x82000000,0x82000000,0x82000000,0x82000000, -0x82000000,0x3E000001,0x3E000001,0x3E000001,0x2A000000,0x40039D,0x40039D,0x40039D,0x40039D,0x40039D,0x40039D,0x34000152,0x34000152,0x34000152,0x280000AD,0x7C039D,0x7C039D,0x7C039D,0x1C00022D,0x1400039D,0xF61C0188,0x2C039D,0x2C039D,0xFE04001D,0xD2000000,0xA0000000,0xA0000000,0x68000000,0xF6000121,0xC2000099,0x50000019,0x34000152, -0x58039D,}; -static const uint32_t g_etc1_to_bc7_m6_table236[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x680000,0x2980000,0x2980000,0x2980000,0x2980000,0x2980000, -0x2980000,0x1380000,0x1380000,0x1380000,0x32000001,0x2980000,0x2980000,0x2980000,0x2980000,0x2980000,0x2980000,0x1380000,0x1380000,0x1380000,0x32000001,0x1380000,0x1380000,0x1380000,0x32000001,0x32000001,0xE6C0000,0x680000,0x680000,0x780000,0x2800000,0x8C0000,0x8C0000,0x2AC0000,0x780000,0x2800000,0xDC0000,0x1380000, -0xDC0000,0x1240000,0x1240000,0x1240000,0x1240000,0x3B00000,0x3B00000,0x3B00000,0x5DFC0000,0x5DFC0000,0x90000001,0x3B00000,0x3B00000,0x3B00000,0x5DFC0000,0x5DFC0000,0x90000001,0x5DFC0000,0x5DFC0000,0x90000001,0x90000001,0x3B00000,0x3B00000,0x3B00000,0x5DFC0000,0x5DFC0000,0x90000001,0x5DFC0000,0x5DFC0000,0x90000001,0x90000001,0x5DFC0000, -0x5DFC0000,0x90000001,0x90000001,0x90000001,0x1540000,0x1380000,0x1240000,0x18C0000,0x1EC0000,0x33FC0000,0x49F80000,0x6BF80000,0x36C0000,0x3B00000,0x33FC0000,0x90000001,0x33FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0xE49371,0xFEB06174,0xFE943921,0xFE842AC8,0xFE8C5159,0xFE641E9A,0xFE500B99,0xFE401F71,0xEE300834,0xC8341C7D,0xFE7466F6,0xFE2C29B8,0xFE181234,0xFE0821B1,0xEE000485,0xC60017E2,0xE00050A2,0xCA002684,0xB2002CE1,0x960050A2,0x150936F,0xFE0849BC,0xEE002D78,0xFA003CBD,0xE2001A89,0xBA00277B,0xCC006426,0xC0003749,0xAA0039C2,0x920059D2,0x2DF8936F, -0xA60061D4,0x96005B93,0x8400735F,0x7000936F,0xFEB46FEE,0xFECC86CD,0xF6DC89B8,0xFE804B93,0xFE3C288E,0xFE040BB2,0xF80000EC,0xE0000ADE,0xFE986E7F,0xFE544677,0xF0001BFE,0xAA0039C2,0x1E0936F,0x12C50A5,0xFF00340D,0xFED81AE4,0xFEC41144,0xFEDC3275,0xFEA01012,0xFE8001F9,0xFE701821,0xEA580474,0xC8681735,0x1C050A2,0xFE642553,0xFE2C1144,0xFE0821A1,0xEE000485, 
-0xC60017E2,0x65F850A2,0xCA002684,0xB2002CE1,0x960050A2,0x1C050A2,0xFE642553,0xFE2C1144,0xFE0821A1,0xEE000485,0xC60017E2,0x65F850A2,0xCA002684,0xB2002CE1,0x960050A2,0x65F850A2,0xCA002684,0xB2002CE1,0x960050A2,0x960050A2,0xFEF84350,0xFB244B59,0xFD284B14,0xFEC033C8,0xFE7C2149,0xFE040BAE,0xF80000EC,0xE0000ADE,0xFEF843BD,0xFEA431C2,0xF0001B1D,0xB2002CE1, -0x3DF850A2,0x842AC8,0x842AC8,0x842AC8,0x842AC8,0xFE500B99,0xFE500B99,0xFE500B99,0xCC340548,0xCC340548,0x90340549,0xFE181234,0xFE181234,0xFE181234,0xCC000121,0xCC000121,0x92040009,0x92001142,0x92001142,0x7E0005AA,0x62001142,0xC42AC8,0xC42AC8,0xC42AC8,0xA8000FDA,0xA8000FDA,0x86000849,0x80001946,0x80001946,0x74000BC6,0x5E001523,0x18C2AC8, -0x18C2AC8,0x5C001A61,0x50001F2E,0x40002ACB,0xFE681BBA,0xF67C23A8,0x842AC8,0xFE441026,0xFE1805AA,0xFC000171,0xEA000088,0xB6000274,0xFE3C1B18,0xFE180E8E,0xB60011C3,0x74000BC6,0x1182AC8,0xC41144,0xC41144,0xC41144,0xC41144,0xFE8001F9,0xFE8001F9,0xFE8001F9,0xBE680001,0xBE680001,0x90680001,0x3241142,0x3241142,0x3241142,0xCC000121,0xCC000121, -0x900C0001,0x17FC1142,0x17FC1142,0x7E0005AA,0x62001142,0x3241142,0x3241142,0x3241142,0xCC000121,0xCC000121,0x900C0001,0x17FC1142,0x17FC1142,0x7E0005AA,0x62001142,0x17FC1142,0x17FC1142,0x7E0005AA,0x62001142,0x62001142,0xFEA00C82,0xF6BC0DC8,0xC41144,0xFE78094D,0xFE2C046A,0xFC000171,0xEA000088,0xB6000274,0xFE780D0D,0xFE4808B4,0x1A41142,0x7E0005AA, -0x1A41142,0x1901735,0xFF680E48,0xFF380464,0xFF240000,0x59FC1735,0xFEFC08C8,0xFEB80001,0xADFC1735,0xF20001B1,0xC8001735,0x59FC1735,0xFEFC08C8,0xFEB80001,0xADFC1735,0xF20001B1,0xC8001735,0xADFC1735,0xF20001B1,0xC8001735,0xC8001735,0x59FC1735,0xFEFC08C8,0xFEB80001,0xADFC1735,0xF20001B1,0xC8001735,0xADFC1735,0xF20001B1,0xC8001735,0xC8001735,0xADFC1735, -0xF20001B1,0xC8001735,0xC8001735,0xC8001735,0xFF781522,0x1AC1735,0xFF8C1589,0xFF401231,0xFEFC0E90,0xFE840734,0xFC000014,0xEC000454,0xFF701520,0xFF4011AD,0xFE3000C5,0xC8001735,0x99FC1735,0x340548,0x340548,0x340548,0x340548,0x340548,0x340548,0x340548,0x340548,0x340548,0x340548,0x9C000000,0x9C000000,0x9C000000,0x9C000000,0x9C000000, -0x9C000000,0x4C000000,0x4C000000,0x4C000000,0x32000001,0x4C0548,0x4C0548,0x4C0548,0x4C0548,0x4C0548,0x4C0548,0x3A0001F9,0x3A0001F9,0x3A0001F9,0x2E0000FA,0x980548,0x980548,0x980548,0x2000034D,0x1800054A,0xFA2402AD,0x340548,0x340548,0xFE100091,0xFE000000,0xC2000000,0xC2000000,0x7E000000,0xFC000200,0xE40000DD,0x60000024,0x3A0001F9, -0x6C0548,}; -static const uint32_t g_etc1_to_bc7_m6_table237[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x780000,0x780000,0x780000,0x780000,0x780000,0x780000,0x780000,0x780000,0x780000,0x780000,0x2B00000,0x2B00000,0x2B00000,0x2B00000,0x2B00000, -0x2B00000,0x1680000,0x1680000,0x1680000,0x3A000001,0x2B00000,0x2B00000,0x2B00000,0x2B00000,0x2B00000,0x2B00000,0x1680000,0x1680000,0x1680000,0x3A000001,0x1680000,0x1680000,0x1680000,0x3A000001,0x3A000001,0x800000,0x780000,0x780000,0x8C0000,0x2940000,0xA40000,0xA40000,0xC80000,0x8C0000,0x2940000,0xFC0000,0x1680000, -0xFC0000,0x1340000,0x1340000,0x1340000,0x1340000,0x1C80000,0x1C80000,0x1C80000,0x69FC0000,0x69FC0000,0x98000001,0x1C80000,0x1C80000,0x1C80000,0x69FC0000,0x69FC0000,0x98000001,0x69FC0000,0x69FC0000,0x98000001,0x98000001,0x1C80000,0x1C80000,0x1C80000,0x69FC0000,0x69FC0000,0x98000001,0x69FC0000,0x69FC0000,0x98000001,0x98000001,0x69FC0000, 
-0x69FC0000,0x98000001,0x98000001,0x98000001,0x5640000,0x1480000,0x1340000,0x1A40000,0x7FC0000,0x41FC0000,0x55FC0000,0x77F40000,0x3800000,0x1C80000,0x41FC0000,0x98000001,0x41FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0xEC91E9,0xFEC46279,0xFEA03CA5,0xFE902F14,0xFE985139,0xFE64210A,0xFE5C0EF9,0xFE4C1EBD,0xF23808E4,0xCC3C1AF5,0xFE806462,0xFE3829E8,0xFE241378,0xFE081E61,0xF000026D,0xCC00141E,0xE8004A4E,0xD0002208,0xB80026E5,0x9C004A4E,0x15C91E7,0xFE084BDC,0xF4003058,0xFA0039ED,0xE2001939,0xC000247F,0xD8005F3E,0xC60033B5,0xAE00348E,0x98005432,0x33F891E7, -0xA60060B4,0x9C005887,0x8A006F9B,0x740091E7,0xFEBC6F6D,0xF8E085F5,0xFAE48878,0xFE804B93,0xFE4C28D8,0xFE100BA9,0xFC000045,0xE40007A9,0xFE986E0F,0xFE5C46BB,0xF60018B4,0xAE00348E,0x1F091E7,0x1384A4D,0xFF0C3095,0xFEE419E8,0xFED41144,0xFEE82E49,0xFEAC0F1E,0xFE980269,0xFE881509,0xEE6C0364,0xCC7813ED,0x1D04A4D,0xFE70234B,0xFE441142,0xFE141E3D,0xF000026D, -0xCC00141E,0x6DF84A4D,0xD0002208,0xB80026E5,0x9C004A4E,0x1D04A4D,0xFE70234B,0xFE441142,0xFE141E3D,0xF000026D,0xCC00141E,0x6DF84A4D,0xD0002208,0xB80026E5,0x9C004A4E,0x6DF84A4D,0xD0002208,0xB80026E5,0x9C004A4E,0x9C004A4E,0xFF143E02,0xFF2C4531,0xFF2C4538,0xFED43009,0xFE901F05,0xFE240B1D,0xFC000045,0xE40007A9,0xFEF83E4D,0xFEC02DC4,0xF60017D3,0xB80026E5, -0x45FC4A4D,0x902F14,0x902F14,0x902F14,0x902F14,0xFE5C0EF9,0xFE5C0EF9,0xFE5C0EF9,0xD63C0708,0xD63C0708,0x983C0709,0xFE241378,0xFE241378,0xFE241378,0xD80000B9,0xD80000B9,0x9A080035,0x9E001142,0x9E001142,0x84000502,0x6A001142,0x2D02F13,0x2D02F13,0x2D02F13,0xB400119A,0xB400119A,0x8C0009B1,0x8C001AA6,0x8C001AA6,0x7A000C12,0x620015C2,0x1AC2F13, -0x1AC2F13,0x66001D13,0x56002186,0x46002F13,0xFE681F7A,0xFC8827B4,0x902F14,0xFE4C1319,0xFE2C078E,0xFE040199,0xFC000041,0xC00001D4,0xFE501E81,0xFE301181,0xC60011D4,0x7A000C12,0x12C2F13,0xD41144,0xD41144,0xD41144,0xD41144,0xFE980269,0xFE980269,0xFE980269,0xC6780001,0xC6780001,0x98780001,0x33C1142,0x33C1142,0x33C1142,0xD80000B9,0xD80000B9, -0x981C0001,0x23FC1142,0x23FC1142,0x84000502,0x6A001142,0x33C1142,0x33C1142,0x33C1142,0xD80000B9,0xD80000B9,0x981C0001,0x23FC1142,0x23FC1142,0x84000502,0x6A001142,0x23FC1142,0x23FC1142,0x84000502,0x6A001142,0x6A001142,0xFAB40CD1,0xFECC0DC8,0xD41144,0xFE880994,0xFE4804D9,0xFE0C0190,0xFC000041,0xC00001D4,0xFE940D2A,0xFE5C0901,0x1C81142,0x84000502, -0x1C81142,0x19813ED,0xFF740C44,0xFF4C03D9,0xFF340000,0x65FC13ED,0xFF080784,0xFED00000,0xB3FC13ED,0xF80000CD,0xCC0013ED,0x65FC13ED,0xFF080784,0xFED00000,0xB3FC13ED,0xF80000CD,0xCC0013ED,0xB3FC13ED,0xF80000CD,0xCC0013ED,0xCC0013ED,0x65FC13ED,0xFF080784,0xFED00000,0xB3FC13ED,0xF80000CD,0xCC0013ED,0xB3FC13ED,0xF80000CD,0xCC0013ED,0xCC0013ED,0xB3FC13ED, -0xF80000CD,0xCC0013ED,0xCC0013ED,0xCC0013ED,0xFF781232,0x1B413ED,0xFF8C1289,0xFF580FA1,0xFF100C84,0xFE980631,0xFE0C0000,0xF20002D0,0xFF701220,0xFF400F1D,0xFE5000A9,0xCC0013ED,0x9FFC13ED,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0x3C0708,0xB6000000,0xB6000000,0xB6000000,0xB6000000,0xB6000000, 
-0xB6000000,0x58000000,0x58000000,0x58000000,0x3A000001,0x580708,0x580708,0x580708,0x580708,0x580708,0x580708,0x460002A1,0x460002A1,0x460002A1,0x34000152,0xB00708,0xB00708,0xB00708,0x2600045D,0x1C00070A,0xFE2C03F5,0x3C0708,0x3C0708,0xFA180154,0xFC080029,0xE0000000,0xE0000000,0x92000000,0xFA080322,0xFE000140,0x70000031,0x460002A1, -0x7C0708,}; -static const uint32_t g_etc1_to_bc7_m6_table238[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0x880000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000, -0xC80000,0x1980000,0x1980000,0x1980000,0x42000001,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0xC80000,0x1980000,0x1980000,0x1980000,0x42000001,0x1980000,0x1980000,0x1980000,0x42000001,0x42000001,0x900000,0x880000,0x880000,0x49C0000,0x2A80000,0xB80000,0xB80000,0xE40000,0x49C0000,0x2A80000,0x1200000,0x1980000, -0x1200000,0x1440000,0x1440000,0x1440000,0x1440000,0x1E00000,0x1E00000,0x1E00000,0x75FC0000,0x75FC0000,0xA0000001,0x1E00000,0x1E00000,0x1E00000,0x75FC0000,0x75FC0000,0xA0000001,0x75FC0000,0x75FC0000,0xA0000001,0xA0000001,0x1E00000,0x1E00000,0x1E00000,0x75FC0000,0x75FC0000,0xA0000001,0x75FC0000,0x75FC0000,0xA0000001,0xA0000001,0x75FC0000, -0x75FC0000,0xA0000001,0xA0000001,0xA0000001,0x1780000,0x5580000,0x1440000,0x1B80000,0x1BFC0000,0x51FC0000,0x63FC0000,0x81F80000,0x3940000,0x1E00000,0x51FC0000,0xA0000001,0x51FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0xF490E1,0xFEC46419,0xFEA04085,0xFE9833B4,0xFEA451A9,0xFE7023E2,0xFE5C12C9,0xFE581EA9,0xF6400A14,0xD04419ED,0xFE806262,0xFE402ADB,0xFE301544,0xFE141BF1,0xF60000F9,0xD00010E6,0xF000444D,0xD6001E14,0xBE002159,0xA000444E,0x16890DF,0xFE144EC8,0xFA003418,0xFA00381D,0xE80018CD,0xC6002223,0xE2005A92,0xCC0030B1,0xB4002FCA,0x98004EF2,0x39F890DF, -0xAC006018,0x9C005607,0x8A006C1B,0x780090DF,0xFEBC6F7D,0xFCE884ED,0xFEEC87B8,0xFE944C1E,0xFE50299A,0xFE180BFB,0xFE040069,0xEA0004E2,0xFEB46DCE,0xFE704795,0xF6001634,0xB4002FCA,0x3FC90DF,0x144444D,0xFF182D4D,0xFEF01904,0xFEE41144,0xFEF42A65,0xFEB80E5A,0xFEA402D5,0xFE941241,0xF07C0278,0xD08810E5,0x1E0444D,0xFE882153,0xFE5C1142,0xFE2C1B35,0xF60000F9, -0xD00010E6,0x75FC444D,0xD6001E14,0xBE002159,0xA000444E,0x1E0444D,0xFE882153,0xFE5C1142,0xFE2C1B35,0xF60000F9,0xD00010E6,0x75FC444D,0xD6001E14,0xBE002159,0xA000444E,0x75FC444D,0xD6001E14,0xBE002159,0xA000444E,0xA000444E,0xFF20392D,0xFF2C3FE1,0xF9404005,0xFEEC2C7A,0xFEA41CE9,0xFE380A7D,0xFE0C0050,0xEA0004E2,0xFF08395E,0xFEC82A3E,0xF6001553,0xBE002159, -0x51FC444D,0x9833B4,0x9833B4,0x9833B4,0x9833B4,0xFE5C12C9,0xFE5C12C9,0xFE5C12C9,0xE0440908,0xE0440908,0xA0440909,0xFE301544,0xFE301544,0xFE301544,0xE8000059,0xE8000059,0xA20C0089,0xAA001142,0xAA001142,0x9000046A,0x72001142,0xE433B3,0xE433B3,0xE433B3,0xC000139A,0xC000139A,0x98000B59,0x98001C26,0x98001C26,0x86000C7A,0x6E001672,0x1D033B3, 
-0x1D033B3,0x6C001FE3,0x5C00240E,0x4C0033B3,0xFE7423B8,0xFE8C2C28,0x9833B4,0xFE541675,0xFE2C09CE,0xFE0C0274,0xFE040069,0xCC00015A,0xFE5022C1,0xFE3014B1,0xD60011ED,0x86000C7A,0x14833B3,0xE41144,0xE41144,0xE41144,0xE41144,0xFEA402D5,0xFEA402D5,0xFEA402D5,0xCE880001,0xCE880001,0xA0880001,0x1541142,0x1541142,0x1541142,0xE8000059,0xE8000059, -0xA02C0001,0x2FFC1142,0x2FFC1142,0x9000046A,0x72001142,0x1541142,0x1541142,0x1541142,0xE8000059,0xE8000059,0xA02C0001,0x2FFC1142,0x2FFC1142,0x9000046A,0x72001142,0x2FFC1142,0x2FFC1142,0x9000046A,0x72001142,0x72001142,0xFEBC0CF9,0xF6DC0E1D,0xE41144,0xFE9809E5,0xFE5C0551,0xFE2001E2,0xFE0C0050,0xCC00015A,0xFAAC0D75,0xFE740975,0x1E81142,0x9000046A, -0x1E81142,0x1A010E5,0xFF800A68,0xFF58033D,0xFF440000,0x71FC10E5,0xFF200654,0xFEE80000,0xB9FC10E5,0xF800003D,0xD00010E5,0x71FC10E5,0xFF200654,0xFEE80000,0xB9FC10E5,0xF800003D,0xD00010E5,0xB9FC10E5,0xF800003D,0xD00010E5,0xD00010E5,0x71FC10E5,0xFF200654,0xFEE80000,0xB9FC10E5,0xF800003D,0xD00010E5,0xB9FC10E5,0xF800003D,0xD00010E5,0xD00010E5,0xB9FC10E5, -0xF800003D,0xD00010E5,0xD00010E5,0xD00010E5,0xFD900F79,0x1BC10E5,0xF79C0FD4,0xFF6C0D2A,0xFF280A82,0xFEC00544,0xFE340000,0xF2000190,0xF7880F79,0xFF4C0CE2,0xFE700090,0xD00010E5,0xA7FC10E5,0x440908,0x440908,0x440908,0x440908,0x440908,0x440908,0x440908,0x440908,0x440908,0x440908,0xCE000000,0xCE000000,0xCE000000,0xCE000000,0xCE000000, -0xCE000000,0x64000000,0x64000000,0x64000000,0x42000001,0x640908,0x640908,0x640908,0x640908,0x640908,0x640908,0x52000369,0x52000369,0x52000369,0x3A0001BA,0xCC0908,0xCC0908,0xCC0908,0x2C000595,0x2000090A,0xFE2C05A5,0x440908,0x440908,0xFE200244,0xFE0C009D,0xFE000000,0xFE000000,0xA6000000,0xFE100482,0xFE000200,0x7E000041,0x52000369, -0x900908,}; -static const uint32_t g_etc1_to_bc7_m6_table239[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0x980000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000, -0xE00000,0x1CC0000,0x1CC0000,0x1CC0000,0x4A000001,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0xE00000,0x1CC0000,0x1CC0000,0x1CC0000,0x4A000001,0x1CC0000,0x1CC0000,0x1CC0000,0x4A000001,0x4A000001,0x8A00000,0x980000,0x980000,0xB00000,0x2BC0000,0x2CC0000,0x2CC0000,0x1000000,0xB00000,0x2BC0000,0x1400000,0x1CC0000, -0x1400000,0x1540000,0x1540000,0x1540000,0x1540000,0x1F80000,0x1F80000,0x1F80000,0x81FC0000,0x81FC0000,0xA8000001,0x1F80000,0x1F80000,0x1F80000,0x81FC0000,0x81FC0000,0xA8000001,0x81FC0000,0x81FC0000,0xA8000001,0xA8000001,0x1F80000,0x1F80000,0x1F80000,0x81FC0000,0x81FC0000,0xA8000001,0x81FC0000,0x81FC0000,0xA8000001,0xA8000001,0x81FC0000, -0x81FC0000,0xA8000001,0xA8000001,0xA8000001,0x18C0000,0xD680000,0x1540000,0x3CC0000,0x2FFC0000,0x5FFC0000,0x71FC0000,0x8BFC0000,0x3A80000,0x1F80000,0x5FFC0000,0xA8000001,0x5FFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0xFC9059,0xFED0662D,0xFEAC44A9,0xFEA438A8,0xFEB052A9,0xFE7C2732,0xFE6816F9,0xFE581F29,0xFA480BC4,0xD44C1965,0xFE8C60F6,0xFE4C2C1F,0xFE3C1798,0xFE201A39,0xFA000035,0xD4080E36,0xF8003EA2,0xDC001AA8,0xC4001C3D,0xA6003EA2,0x1749057,0xFE20521C,0xFC0038AF,0xFA00374D,0xEE001919,0xCC002067,0xE2005652,0xD2002E3D,0xBA002B7E,0xA20049CB,0x3FF89057, -0xB2006024,0xA6005459,0x9000690F,0x7C009057,0xFED06FC2,0xFEEC8481,0xFEEC87D8,0xFE984D29,0xFE602AF8,0xFE240D01,0xFE100125,0xF00002D9,0xFEB46DDE,0xFE80487B,0xFE001462,0xBA002B7E,0xBFC9057,0x14C3EA5,0xFF242A35,0xFF081824,0xFEF41144,0xFF0026C9,0xFECC0DAC,0xFEBC0355,0xFEAC0FB9,0xF28C01B4,0xD4980E1D,0x1F03EA2,0xFEA01F7B,0xFE741142,0xFE381881,0xFA000035, -0xD4100E1E,0x7DF83EA2,0xDC001AA8,0xC4001C3D,0xA6003EA2,0x1F03EA2,0xFEA01F7B,0xFE741142,0xFE381881,0xFA000035,0xD4100E1E,0x7DF83EA2,0xDC001AA8,0xC4001C3D,0xA6003EA2,0x7DF83EA2,0xDC001AA8,0xC4001C3D,0xA6003EA2,0xA6003EA2,0xFF343494,0xFB443A79,0xFD483A9D,0xFEEC292A,0xFEA81AF3,0xFE480A18,0xFE280088,0xF00002D9,0xFF1834C1,0xFEDC273A,0xFE001362,0xC4001C3D, -0x5BFC3EA2,0xA438A8,0xA438A8,0xA438A8,0xA438A8,0xFE6816F9,0xFE6816F9,0xFE6816F9,0xEA4C0B48,0xEA4C0B48,0xA84C0B49,0xFE3C1798,0xFE3C1798,0xFE3C1798,0xFA000025,0xFA000025,0xAE100102,0xB6001142,0xB6001142,0x960003DA,0x7A001142,0xF438A8,0xF438A8,0xF438A8,0xCC0015DA,0xCC0015DA,0xA2000D3B,0xA2001D8D,0xA2001D8D,0x8C000CF6,0x7400172E,0x1F038A8, -0x1F038A8,0x720022FB,0x660026FB,0x500038AB,0xFE842831,0xF69C3169,0xA438A8,0xFE5C1A55,0xFE400CA2,0xFE1C03D8,0xFE100125,0xDA0000FA,0xFE64274E,0xFE44188A,0xE6001206,0x8C000CF6,0x15C38A8,0xF41144,0xF41144,0xF41144,0xF41144,0xFEBC0355,0xFEBC0355,0xFEBC0355,0xD6980001,0xD6980001,0xA8980001,0x16C1142,0x16C1142,0x16C1142,0xFA000025,0xFA000025, -0xA83C0001,0x3BFC1142,0x3BFC1142,0x960003DA,0x7A001142,0x16C1142,0x16C1142,0x16C1142,0xFA000025,0xFA000025,0xA83C0001,0x3BFC1142,0x3BFC1142,0x960003DA,0x7A001142,0x3BFC1142,0x3BFC1142,0x960003DA,0x7A001142,0x7A001142,0xFED80D22,0xFEEC0E1D,0xF41144,0xFEA40A68,0xFE7005D1,0xFE380249,0xFE280088,0xDA0000FA,0xFEB40D9D,0xFE9809D9,0x7FC1142,0x960003DA, -0x7FC1142,0x1A80E1D,0xFF8C08B4,0xFF6402B1,0xFF540000,0x7DFC0E1D,0xFF380544,0xFF000000,0xBFFC0E1D,0xFE000001,0xD4000E1D,0x7DFC0E1D,0xFF380544,0xFF000000,0xBFFC0E1D,0xFE000001,0xD4000E1D,0xBFFC0E1D,0xFE000001,0xD4000E1D,0xD4000E1D,0x7DFC0E1D,0xFF380544,0xFF000000,0xBFFC0E1D,0xFE000001,0xD4000E1D,0xBFFC0E1D,0xFE000001,0xD4000E1D,0xD4000E1D,0xBFFC0E1D, -0xFE000001,0xD4000E1D,0xD4000E1D,0xD4000E1D,0xFF940CD5,0x1C40E1D,0xFBA40D24,0xFF6C0AFA,0xFF3C08CA,0xFEC80451,0xFE5C0000,0xF60000B9,0xFB900CD1,0xFF640AD2,0xFE940079,0xD4000E1D,0xAFFC0E1D,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0x4C0B48,0xE6000000,0xE6000000,0xE6000000,0xE6000000,0xE6000000, -0xE6000000,0x70000000,0x70000000,0x70000000,0x4A000001,0x700B48,0x700B48,0x700B48,0x700B48,0x700B48,0x700B48,0x58000449,0x58000449,0x58000449,0x4600022A,0xE40B48,0xE40B48,0xE40B48,0x320006F5,0x24000B4A,0xF63C0784,0x4C0B48,0x4C0B48,0xFE2C039D,0xFE140164,0xFE080029,0xFE080029,0xBA000000,0xFE100652,0xFE000340,0x86000050,0x58000449, -0xA00B48,}; -static const uint32_t g_etc1_to_bc7_m6_table240[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xA80001,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000, 
-0xFC0000,0x3F80000,0x3F80000,0x3F80000,0x54000000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0x3F80000,0x3F80000,0x3F80000,0x54000000,0x3F80000,0x3F80000,0x3F80000,0x54000000,0x54000000,0x2B40000,0xA80001,0xA80001,0x2C40000,0xD40000,0xE80000,0xE80000,0x11C0000,0x2C40000,0xD40000,0x1680000,0x3F80000, -0x1680000,0x1640001,0x1640001,0x1640001,0x1640001,0x19FC0000,0x19FC0000,0x19FC0000,0x8FF80000,0x8FF80000,0xB2000000,0x19FC0000,0x19FC0000,0x19FC0000,0x8FF80000,0x8FF80000,0xB2000000,0x8FF80000,0x8FF80000,0xB2000000,0xB2000000,0x19FC0000,0x19FC0000,0x19FC0000,0x8FF80000,0x8FF80000,0xB2000000,0x8FF80000,0x8FF80000,0xB2000000,0xB2000000,0x8FF80000, -0x8FF80000,0xB2000000,0xB2000000,0xB2000000,0x1A00000,0x77C0000,0x1640001,0x1E80000,0x45FC0000,0x71FC0000,0x7FFC0000,0x99F40000,0x1C00000,0x19FC0000,0x71FC0000,0xB2000000,0x71FC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1049057,0xFEDC690F,0xFEB849CB,0xFEB03EA2,0xFEB05459,0xFE882B7E,0xFE741C3D,0xFE642067,0xFA540E36,0xD8541965,0xFE986024,0xFE582E3D,0xFE441AA8,0xFE201919,0xFE040035,0xDA080BC4,0xFE0438AF,0xE0001798,0xCA0016F9,0xAC0038A8,0x3809057,0xFE385652,0xFE0C3EA2,0xFE08374D,0xEE001A39,0xD2001F29,0xEE00521C,0xD8002C1F,0xC0002732,0xA80044A9,0x45FC9057, -0xB80060F6,0xA60052A9,0x9600662D,0x80009059,0xFED870CD,0xFEEC8535,0xF90088B3,0xFE984F45,0xFE682CFD,0xFE2C0EC7,0xFE1C02D9,0xF6000125,0xFEB46F22,0xFE804A3D,0xFE001354,0xC0002732,0x15FC9057,0x15838AB,0xFF3026FB,0xFF14172E,0xFF081142,0xFF1822FB,0xFEE40CF6,0xFED003DA,0xFEB80D3B,0xF6A00102,0xD8A80B49,0x7FC38A8,0xFEB81D8D,0xFE901142,0xFE5815DA,0xFE080025, -0xD8280B48,0x85FC38A8,0xE0001798,0xCA0016F9,0xAC0038A8,0x7FC38A8,0xFEB81D8D,0xFE901142,0xFE5815DA,0xFE080025,0xD8280B48,0x85FC38A8,0xE0001798,0xCA0016F9,0xAC0038A8,0x85FC38A8,0xE0001798,0xCA0016F9,0xAC0038A8,0xAC0038A8,0xFF342F72,0xFF4C34BB,0xFF4C3533,0xFF082595,0xFEC818C5,0xFE7009A1,0xFE3C00FA,0xF6000125,0xFF243009,0xFEE8241E,0xFE001254,0xCA0016F9, -0x65FC38A8,0xB03EA2,0xB03EA2,0xB03EA2,0xB03EA2,0xFE741C3D,0xFE741C3D,0xFE741C3D,0xF4540E1E,0xF4540E1E,0xB2540E1D,0xFE441AA8,0xFE441AA8,0xFE441AA8,0xFE040035,0xFE040035,0xB81801B4,0xC4001142,0xC4001142,0xA0000355,0x82001144,0x1043EA2,0x1043EA2,0x1043EA2,0xE2001881,0xE2001881,0xA8000FB9,0xAE001F7B,0xAE001F7B,0x98000DAC,0x7A001824,0x7FC3EA2, -0x7FC3EA2,0x7E0026C9,0x6C002A35,0x56003EA5,0xFE902DE5,0xFCA83721,0xB03EA2,0xFE701F65,0xFE441079,0xFE24061D,0xFE1C02D9,0xEA000088,0xFE782CA5,0xFE501D0A,0xF6001225,0x98000DAC,0x1743EA2,0x1081142,0x1081142,0x1081142,0x1081142,0xFED003DA,0xFED003DA,0xFED003DA,0xE0A80001,0xE0A80001,0xB2A80001,0x1881142,0x1881142,0x1881142,0xFE080025,0xFE080025, -0xB24C0001,0x49F81142,0x49F81142,0xA0000355,0x82001144,0x1881142,0x1881142,0x1881142,0xFE080025,0xFE080025,0xB24C0001,0x49F81142,0x49F81142,0xA0000355,0x82001144,0x49F81142,0x49F81142,0xA0000355,0x82001144,0x82001144,0xFAEC0D75,0xF9000E72,0x1081142,0xFEC40ABA,0xFE98065D,0xFE5802F2,0xFE3C00FA,0xEA000088,0xFED00DD0,0xFEA40A40,0x19FC1142,0xA0000355, 
-0x19FC1142,0x1B00B4A,0xFF9806F5,0xFF70022A,0xFF640001,0x8DFC0B48,0xFF4C0449,0xFF1C0000,0xC7FC0B48,0xFE2C0000,0xD8000B48,0x8DFC0B48,0xFF4C0449,0xFF1C0000,0xC7FC0B48,0xFE2C0000,0xD8000B48,0xC7FC0B48,0xFE2C0000,0xD8000B48,0xD8000B48,0x8DFC0B48,0xFF4C0449,0xFF1C0000,0xC7FC0B48,0xFE2C0000,0xD8000B48,0xC7FC0B48,0xFE2C0000,0xD8000B48,0xD8000B48,0xC7FC0B48, -0xFE2C0000,0xD8000B48,0xD8000B48,0xD8000B48,0xFF940A68,0xFCC0B48,0xFFAC0A6D,0xFF8408D1,0xFF500709,0xFEF00382,0xFE880000,0xFA000029,0xFF980A22,0xFF7008B4,0xFEB80064,0xD8000B48,0xB9FC0B48,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0x540E1D,0xFE000001,0xFE000001,0xFE000001,0xFE000001,0xFE000001, -0xFE000001,0x7E000000,0x7E000000,0x7E000000,0x54000000,0x7C0E1D,0x7C0E1D,0x7C0E1D,0x7C0E1D,0x7C0E1D,0x7C0E1D,0x62000544,0x62000544,0x62000544,0x4C0002B1,0xFC0E1D,0xFC0E1D,0xFC0E1D,0x380008B4,0x2A000E1D,0xFC4809D9,0x540E1D,0x540E1D,0xFE2C0568,0xFE200290,0xFE1000B9,0xFE1000B9,0xD0000000,0xFE20088A,0xFE1404E2,0x9E000065,0x62000544, -0xB00E1D,}; -static const uint32_t g_etc1_to_bc7_m6_table241[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0xB80001,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000, -0x1140000,0xFF80000,0xFF80000,0xFF80000,0x5C000000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0x1140000,0xFF80000,0xFF80000,0xFF80000,0x5C000000,0xFF80000,0xFF80000,0xFF80000,0x5C000000,0x5C000000,0xAC40000,0xB80001,0xB80001,0xD80000,0xE80000,0xFC0000,0xFC0000,0x1380000,0xD80000,0xE80000,0x18C0000,0xFF80000, -0x18C0000,0x1740001,0x1740001,0x1740001,0x1740001,0x31FC0000,0x31FC0000,0x31FC0000,0x9BF80000,0x9BF80000,0xBA000000,0x31FC0000,0x31FC0000,0x31FC0000,0x9BF80000,0x9BF80000,0xBA000000,0x9BF80000,0x9BF80000,0xBA000000,0xBA000000,0x31FC0000,0x31FC0000,0x31FC0000,0x9BF80000,0x9BF80000,0xBA000000,0x9BF80000,0x9BF80000,0xBA000000,0xBA000000,0x9BF80000, -0x9BF80000,0xBA000000,0xBA000000,0xBA000000,0x1B40000,0xF8C0000,0x1740001,0x1FC0000,0x59FC0000,0x7FFC0000,0x8DFC0000,0xA3F80000,0x1D40000,0x31FC0000,0x7FFC0000,0xBA000000,0x7FFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x10C90DF,0xFEE86C1B,0xFEC04EF2,0xFEBC444E,0xFEC45607,0xFE942FCA,0xFE802159,0xFE702223,0xFE5C10E6,0xDC5C19ED,0xFEA46018,0xFE6430B1,0xFE501E14,0xFE2C18CD,0xFE1000F9,0xDE100A14,0xFE083418,0xE6001544,0xCA0012C9,0xB20033B4,0x38C90DF,0xFE385A92,0xFE1C444D,0xFE08381D,0xF4001BF1,0xD2001EA9,0xF4004EC8,0xDE002ADB,0xC60023E2,0xAE004085,0x4BFC90DF, -0xBE006262,0xAC0051A9,0x9C006419,0x840090E1,0xFEE47241,0xFB048637,0xFD08897B,0xFEAC5155,0xFE742F9E,0xFE381111,0xFE2804E2,0xFC000069,0xFED07092,0xFE904C7B,0xFE001374,0xC60023E2,0x1DF890DF,0x16433B3,0xFF44240E,0xFF201672,0xFF181142,0xFF241FE3,0xFEF00C7A,0xFEDC046A,0xFECC0B59,0xF6B40089,0xDCB80909,0x15FC33B3,0xFECC1C26,0xFEA81142,0xFE70139A,0xFE2C0059, 
-0xDC3C0908,0x8DFC33B3,0xE6001544,0xCA0012C9,0xB20033B4,0x15FC33B3,0xFECC1C26,0xFEA81142,0xFE70139A,0xFE2C0059,0xDC3C0908,0x8DFC33B3,0xE6001544,0xCA0012C9,0xB20033B4,0x8DFC33B3,0xE6001544,0xCA0012C9,0xB20033B4,0xB20033B4,0xFF3C2BBB,0xF75C3071,0xF96030C6,0xFF142266,0xFEDC1721,0xFE7C0965,0xFE64015A,0xFC000069,0xFF342BED,0xFF042108,0xFE141223,0xCA0012C9, -0x6FFC33B3,0xBC444E,0xBC444E,0xBC444E,0xBC444E,0xFE802159,0xFE802159,0xFE802159,0xFE5C10E6,0xFE5C10E6,0xBA5C10E5,0xFE501E14,0xFE501E14,0xFE501E14,0xFE1000F9,0xFE1000F9,0xC01C0278,0xD0001142,0xD0001142,0xAC0002D5,0x8A001144,0x114444D,0x114444D,0x114444D,0xE8001B35,0xE8001B35,0xB4001241,0xBA002153,0xBA002153,0xA2000E5A,0x86001904,0xFF8444D, -0xFF8444D,0x84002A65,0x72002D4D,0x5C00444D,0xFEA03348,0xFEAC3CB9,0xBC444E,0xFE78242D,0xFE541448,0xFE3408B9,0xFE2804E2,0xF8000050,0xFE7831F5,0xFE5021DA,0xFE001274,0xA2000E5A,0x18C444D,0x1181142,0x1181142,0x1181142,0x1181142,0xFEDC046A,0xFEDC046A,0xFEDC046A,0xE8B80001,0xE8B80001,0xBAB80001,0x1A01142,0x1A01142,0x1A01142,0xFE2C0059,0xFE2C0059, -0xBA5C0001,0x55F81142,0x55F81142,0xAC0002D5,0x8A001144,0x1A01142,0x1A01142,0x1A01142,0xFE2C0059,0xFE2C0059,0xBA5C0001,0x55F81142,0x55F81142,0xAC0002D5,0x8A001144,0x55F81142,0x55F81142,0xAC0002D5,0x8A001144,0x8A001144,0xFEF40D9D,0xFF0C0E7A,0x1181142,0xFEDC0AFD,0xFEAC06D9,0xFE740361,0xFE64015A,0xF8000050,0xF8EC0E1D,0xFEC40AB2,0x27FC1142,0xAC0002D5, -0x27FC1142,0x1B8090A,0xFFA40595,0xFF8801BA,0xFF740001,0x99FC0908,0xFF580369,0xFF340000,0xCDFC0908,0xFE600000,0xDC000908,0x99FC0908,0xFF580369,0xFF340000,0xCDFC0908,0xFE600000,0xDC000908,0xCDFC0908,0xFE600000,0xDC000908,0xDC000908,0x99FC0908,0xFF580369,0xFF340000,0xCDFC0908,0xFE600000,0xDC000908,0xCDFC0908,0xFE600000,0xDC000908,0xDC000908,0xCDFC0908, -0xFE600000,0xDC000908,0xDC000908,0xDC000908,0xFBAC0841,0x1D80908,0xFFAC087D,0xFF94070A,0xFF680595,0xFF0402C2,0xFEB00000,0xFE000000,0xFF980832,0xFF8806D1,0xFEDC0051,0xDC000908,0xBFFC0908,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0x5C10E5,0xFE0C003D,0xFE0C003D,0xFE0C003D,0xFE0C003D,0xFE0C003D, -0xFE0C003D,0x8A000000,0x8A000000,0x8A000000,0x5C000000,0x8810E5,0x8810E5,0x8810E5,0x8810E5,0x8810E5,0x8810E5,0x6E000654,0x6E000654,0x6E000654,0x5200033D,0x11410E5,0x11410E5,0x11410E5,0x3E000A68,0x2E0010E5,0xFE4C0C35,0x5C10E5,0x5C10E5,0xFE3C0745,0xFE280401,0xFE180190,0xFE180190,0xE4000000,0xFE200AFA,0xFE1406B2,0xA6000074,0x6E000654, -0xC010E5,}; -static const uint32_t g_etc1_to_bc7_m6_table242[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0xC80001,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000, -0x12C0000,0x1BF80000,0x1BF80000,0x1BF80000,0x64000000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x12C0000,0x1BF80000,0x1BF80000,0x1BF80000,0x64000000,0x1BF80000,0x1BF80000,0x1BF80000,0x64000000,0x64000000,0xD80000,0xC80001,0xC80001,0x6E80000,0xFC0000,0x3100000,0x3100000,0x1540000,0x6E80000,0xFC0000,0x1AC0000,0x1BF80000, -0x1AC0000,0x1840001,0x1840001,0x1840001,0x1840001,0x49FC0000,0x49FC0000,0x49FC0000,0xA7F80000,0xA7F80000,0xC2000000,0x49FC0000,0x49FC0000,0x49FC0000,0xA7F80000,0xA7F80000,0xC2000000,0xA7F80000,0xA7F80000,0xC2000000,0xC2000000,0x49FC0000,0x49FC0000,0x49FC0000,0xA7F80000,0xA7F80000,0xC2000000,0xA7F80000,0xA7F80000,0xC2000000,0xC2000000,0xA7F80000, 
-0xA7F80000,0xC2000000,0xC2000000,0xC2000000,0x3C40000,0x1A00000,0x1840001,0x1FFC0000,0x6DFC0000,0x8FFC0000,0x9BFC0000,0xADFC0000,0x1E80000,0x49FC0000,0x8FFC0000,0xC2000000,0x8FFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x11491E7,0xFEE86F9B,0xFECC5432,0xFEC44A4E,0xFEC45887,0xFEA0348E,0xFE8C26E5,0xFE7C247F,0xFE64141E,0xE0641AF5,0xFEB060B4,0xFE7033B5,0xFE5C2208,0xFE381939,0xFE14026D,0xE21808E4,0xFE143058,0xEC001378,0xD0000EF9,0xB6002F14,0x39891E7,0xFE4C5F3E,0xFE2C4A4E,0xFE0839ED,0xFA001E61,0xD8001EBD,0xFA004BDC,0xE20029E8,0xCC00210A,0xAE003CA5,0x51FC91E7, -0xBE006462,0xB2005139,0x9C006279,0x880091E9,0xFEE47431,0xFF0C873F,0xFF0C8ACF,0xFEB45441,0xFE7C3299,0xFE48140A,0xFE3407A9,0xFE040045,0xFED4724F,0xFE984F5B,0xFE001494,0xCC00210A,0x23FC91E7,0x1702F13,0xFF502186,0xFF3815C2,0xFF281142,0xFF301D13,0xFF080C12,0xFEF40502,0xFEE409B1,0xFAC80035,0xE0C80709,0x29FC2F13,0xFEE41AA6,0xFEC01142,0xFE88119A,0xFE4000B9, -0xE0500708,0x97F82F13,0xEC001378,0xD0000EF9,0xB6002F14,0x29FC2F13,0xFEE41AA6,0xFEC01142,0xFE88119A,0xFE4000B9,0xE0500708,0x97F82F13,0xEC001378,0xD0000EF9,0xB6002F14,0x97F82F13,0xEC001378,0xD0000EF9,0xB6002F14,0xB6002F14,0xFF5027BA,0xFD682BE9,0xFF6C2C5A,0xFF30200A,0xFEE8158A,0xFEA4095B,0xFE7C01D4,0xFE040041,0xFF442838,0xFF041EA8,0xFE341206,0xD0000EF9, -0x7BFC2F13,0xC44A4E,0xC44A4E,0xC44A4E,0xC44A4E,0xFE8C26E5,0xFE8C26E5,0xFE8C26E5,0xFE64141E,0xFE64141E,0xC26413ED,0xFE5C2208,0xFE5C2208,0xFE5C2208,0xFE14026D,0xFE14026D,0xC8200364,0xDC001142,0xDC001142,0xB2000269,0x92001144,0x3244A4D,0x3244A4D,0x3244A4D,0xF4001E3D,0xF4001E3D,0xBA001509,0xC000234B,0xC000234B,0xA8000F1E,0x8C0019E8,0x17FC4A4D, -0x17FC4A4D,0x8A002E49,0x78003095,0x62004A4D,0xFEA03908,0xF8C0430A,0xC44A4E,0xFE802964,0xFE5818D5,0xFE3C0BE6,0xFE3407A9,0xFE040045,0xFE9437CD,0xFE5C2742,0xFE001394,0xA8000F1E,0x1A44A4D,0x1281142,0x1281142,0x1281142,0x1281142,0xFEF40502,0xFEF40502,0xFEF40502,0xF0C80001,0xF0C80001,0xC2C80001,0x1B81142,0x1B81142,0x1B81142,0xFE4000B9,0xFE4000B9, -0xC26C0001,0x61F81142,0x61F81142,0xB2000269,0x92001144,0x1B81142,0x1B81142,0x1B81142,0xFE4000B9,0xFE4000B9,0xC26C0001,0x61F81142,0x61F81142,0xB2000269,0x92001144,0x61F81142,0x61F81142,0xB2000269,0x92001144,0x92001144,0xFF100DC8,0xF9200EC9,0x1281142,0xFEDC0B8D,0xFEC0075D,0xFE7C0414,0xFE7C01D4,0xFE040041,0xFEF80E21,0xFEDC0AFD,0x37FC1142,0xB2000269, -0x37FC1142,0x1C0070A,0xFFB0045D,0xFF940152,0xFF840001,0xA5FC0708,0xFF7002A1,0xFF4C0000,0xD3FC0708,0xFE900000,0xE0000708,0xA5FC0708,0xFF7002A1,0xFF4C0000,0xD3FC0708,0xFE900000,0xE0000708,0xD3FC0708,0xFE900000,0xE0000708,0xE0000708,0xA5FC0708,0xFF7002A1,0xFF4C0000,0xD3FC0708,0xFE900000,0xE0000708,0xD3FC0708,0xFE900000,0xE0000708,0xE0000708,0xD3FC0708, -0xFE900000,0xE0000708,0xE0000708,0xE0000708,0xFFB40659,0x1E00708,0xF9C00692,0xFF94057A,0xFF7C045D,0xFF2C0232,0xFED80000,0xFE380000,0xFFAC065D,0xFF940550,0xFEFC0040,0xE0000708,0xC7FC0708,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0x6413ED,0xFE0C00CD,0xFE0C00CD,0xFE0C00CD,0xFE0C00CD,0xFE0C00CD, 
-0xFE0C00CD,0x96000000,0x96000000,0x96000000,0x64000000,0x9413ED,0x9413ED,0x9413ED,0x9413ED,0x9413ED,0x9413ED,0x7A000784,0x7A000784,0x7A000784,0x580003D9,0x12C13ED,0x12C13ED,0x12C13ED,0x44000C44,0x320013ED,0xFE4C0F05,0x6413ED,0x6413ED,0xFE3C0975,0xFE3405B4,0xFE1802D0,0xFE1802D0,0xF8000000,0xFC380D75,0xFE2008D1,0xB6000089,0x7A000784, -0xD413ED,}; -static const uint32_t g_etc1_to_bc7_m6_table243[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0xD80001,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000, -0x1440000,0x27F80000,0x27F80000,0x27F80000,0x6C000000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x1440000,0x27F80000,0x27F80000,0x27F80000,0x6C000000,0x27F80000,0x27F80000,0x27F80000,0x6C000000,0x6C000000,0xE80000,0xD80001,0xD80001,0x2FC0000,0x1100000,0x1280000,0x1280000,0x36C0000,0x2FC0000,0x1100000,0x1D00000,0x27F80000, -0x1D00000,0x1940001,0x1940001,0x1940001,0x1940001,0x63FC0000,0x63FC0000,0x63FC0000,0xB3F80000,0xB3F80000,0xCA000000,0x63FC0000,0x63FC0000,0x63FC0000,0xB3F80000,0xB3F80000,0xCA000000,0xB3F80000,0xB3F80000,0xCA000000,0xCA000000,0x63FC0000,0x63FC0000,0x63FC0000,0xB3F80000,0xB3F80000,0xCA000000,0xB3F80000,0xB3F80000,0xCA000000,0xCA000000,0xB3F80000, -0xB3F80000,0xCA000000,0xCA000000,0xCA000000,0x1D80000,0x1B00000,0x1940001,0x3DFC0000,0x81FC0000,0x9FF80000,0xA9F80000,0xB9F80000,0x1FC0000,0x63FC0000,0x9FF80000,0xCA000000,0x9FF80000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x11C936F,0xFEF4735F,0xFED859D2,0xFED050A2,0xFED05B93,0xFEA839C2,0xFE982CE1,0xFE88277B,0xFE7017E2,0xE46C1C7D,0xFEB061D4,0xFE7C3749,0xFE682684,0xFE381A89,0xFE200485,0xE6200834,0xFE142D78,0xF2001234,0xD6000B99,0xBC002AC8,0x3A4936F,0xFE646426,0xFE3C50A2,0xFE083CBD,0xFA0021B1,0xDE001F71,0xFA0049BC,0xE80029B8,0xCC001E9A,0xB4003921,0x57FC936F, -0xC40066F6,0xB8005159,0xA6006174,0x8C009371,0xFEF476C5,0xFF0C88FF,0xFF0C8D0F,0xFEC45747,0xFE883642,0xFE541781,0xFE3C0ADE,0xFE0C00EC,0xFED474DF,0xFE9852EB,0xFE101676,0xCC001E9A,0x2BFC936F,0x1782ACB,0xFF5C1F2E,0xFF401523,0xFF381142,0xFF441A61,0xFF140BC6,0xFF0005AA,0xFEF00849,0xFCD80009,0xE4D80549,0x37FC2AC8,0xFEFC1946,0xFED81142,0xFEA00FDA,0xFE640121, -0xE4640548,0x9DFC2AC8,0xF2001234,0xD6000B99,0xBC002AC8,0x37FC2AC8,0xFEFC1946,0xFED81142,0xFEA00FDA,0xFE640121,0xE4640548,0x9DFC2AC8,0xF2001234,0xD6000B99,0xBC002AC8,0x9DFC2AC8,0xF2001234,0xD6000B99,0xBC002AC8,0xBC002AC8,0xFF58247D,0xFF6C27E1,0xFF6C289A,0xFF301D7A,0xFEFC143A,0xFEB80925,0xFE900274,0xFE280088,0xFD5824CE,0xFF181C5E,0xFE5811EB,0xD6000B99, -0x83FC2AC8,0xD050A2,0xD050A2,0xD050A2,0xD050A2,0xFE982CE1,0xFE982CE1,0xFE982CE1,0xFE7017E2,0xFE7017E2,0xCA6C1735,0xFE682684,0xFE682684,0xFE682684,0xFE200485,0xFE200485,0xD2280474,0xE6001144,0xE6001144,0xBE0001F9,0x9A001144,0x13450A2,0x13450A2,0x13450A2,0xFA0021A1,0xFA0021A1,0xC6001821,0xCC002553,0xCC002553,0xAE001012,0x92001AE4,0x1FF850A2, 
-0x1FF850A2,0x90003275,0x7E00340D,0x660050A5,0xFEAC3F3D,0xFCC8492A,0xD050A2,0xFE902EFD,0xFE6C1DB1,0xFE3C0FE6,0xFE3C0ADE,0xFE0C00EC,0xFE943DAD,0xFE742C9D,0xFE101595,0xAE001012,0x1B850A2,0x1381142,0x1381142,0x1381142,0x1381142,0xFF0005AA,0xFF0005AA,0xFF0005AA,0xF8D80001,0xF8D80001,0xCAD80001,0x1D01142,0x1D01142,0x1D01142,0xFE640121,0xFE640121, -0xCA7C0001,0x6DF81142,0x6DF81142,0xBE0001F9,0x9A001144,0x1D01142,0x1D01142,0x1D01142,0xFE640121,0xFE640121,0xCA7C0001,0x6DF81142,0x6DF81142,0xBE0001F9,0x9A001144,0x6DF81142,0x6DF81142,0xBE0001F9,0x9A001144,0x9A001144,0xFF200E1D,0xFF2C0ED5,0x1381142,0xFEF80BD1,0xFED407E9,0xFEA804A0,0xFE900274,0xFE280088,0xFD100E72,0xFEE80B5A,0x45FC1142,0xBE0001F9, -0x45FC1142,0x1C8054A,0xFFB0034D,0xFFA000FA,0xFF940001,0xB1FC0548,0xFF7C01F9,0xFF640000,0xD9FC0548,0xFEC00000,0xE4000548,0xB1FC0548,0xFF7C01F9,0xFF640000,0xD9FC0548,0xFEC00000,0xE4000548,0xD9FC0548,0xFEC00000,0xE4000548,0xE4000548,0xB1FC0548,0xFF7C01F9,0xFF640000,0xD9FC0548,0xFEC00000,0xE4000548,0xD9FC0548,0xFEC00000,0xE4000548,0xE4000548,0xD9FC0548, -0xFEC00000,0xE4000548,0xE4000548,0xE4000548,0xFFB404D9,0x1E80548,0xFDC804E2,0xFFB00422,0xFF7C034D,0xFF480195,0xFEFC0000,0xFE740000,0xF5C004E2,0xFFA00401,0xFF200031,0xE4000548,0xCFFC0548,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0x6C1735,0xFE1801B1,0xFE1801B1,0xFE1801B1,0xFE1801B1,0xFE1801B1, -0xFE1801B1,0xA0000001,0xA0000001,0xA0000001,0x6C000000,0xA01735,0xA01735,0xA01735,0xA01735,0xA01735,0xA01735,0x800008C8,0x800008C8,0x800008C8,0x62000464,0x1441735,0x1441735,0x1441735,0x4A000E48,0x36001735,0xF8601200,0x6C1735,0x6C1735,0xFE4C0BE4,0xFE3407B4,0xFE240454,0xFE240454,0xFE040014,0xFE3C1031,0xFC300B48,0xC6000099,0x800008C8, -0xE41735,}; -static const uint32_t g_etc1_to_bc7_m6_table244[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0xEC0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000, -0x35C0000,0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x35C0000,0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0x33FC0000,0x33FC0000,0x33FC0000,0x74000001,0x74000001,0xFC0000,0xEC0000,0xEC0000,0x5100000,0x1280000,0x1400000,0x1400000,0x18C0000,0x5100000,0x1280000,0x1F40000,0x33FC0000, -0x1F40000,0x1A80000,0x1A80000,0x1A80000,0x1A80000,0x7DFC0000,0x7DFC0000,0x7DFC0000,0xBFFC0000,0xBFFC0000,0xD2000001,0x7DFC0000,0x7DFC0000,0x7DFC0000,0xBFFC0000,0xBFFC0000,0xD2000001,0xBFFC0000,0xBFFC0000,0xD2000001,0xD2000001,0x7DFC0000,0x7DFC0000,0x7DFC0000,0xBFFC0000,0xBFFC0000,0xD2000001,0xBFFC0000,0xBFFC0000,0xD2000001,0xD2000001,0xBFFC0000, -0xBFFC0000,0xD2000001,0xD2000001,0xD2000001,0x1EC0000,0x1C40000,0x1A80000,0x5FFC0000,0x97FC0000,0xAFFC0000,0xB7FC0000,0xC5F80000,0x2DFC0000,0x7DFC0000,0xAFFC0000,0xD2000001,0xAFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, 
-0xFFFC0000,0x12495C1,0xFF00782D,0xFEE4609C,0xFEDC5828,0xFEDC5FA9,0xFEB44028,0xFEA43425,0xFE942B8D,0xFE7C1CD4,0xE8741ED1,0xFEC46399,0xFE8C3BF7,0xFE742C2E,0xFE4C1CB6,0xFE2C07B5,0xEE2407F6,0xFE202B26,0xF800116E,0xDC000859,0xC200265A,0x3B095C1,0xFE706A14,0xFE4C582B,0xFE14410D,0xFC00269A,0xE20020E5,0xFA00488A,0xEE002A26,0xD2001C6A,0xBA0035A3,0x5DFC95C1, -0xCA006AA4,0xBE00521D,0xA600608A,0x920095C3,0xFF007A21,0xFF0C8C29,0xF9208FF4,0xFED45B79,0xFE903AF9,0xFE5C1BF9,0xFE4C0F71,0xFE140288,0xFEF07883,0xFEB0571E,0xFE141976,0xD2001C6A,0x33FC95C1,0x184265D,0xFF681CC8,0xFF58147D,0xFF481144,0xFF5017C9,0xFF2C0B94,0xFF180668,0xFF08070D,0xFEEC0004,0xE8EC039D,0x49FC265A,0xFF1417D8,0xFEF01144,0xFEB80E36,0xFE8801BD, -0xE87C039D,0xA7F8265A,0xF800116E,0xDC000859,0xC200265A,0x49FC265A,0xFF1417D8,0xFEF01144,0xFEB80E36,0xFE8801BD,0xE87C039D,0xA7F8265A,0xF800116E,0xDC000859,0xC200265A,0xA7F8265A,0xF800116E,0xDC000859,0xC200265A,0xC200265A,0xFF742111,0xF98023FD,0xFB842485,0xFF401AEA,0xFF1012F4,0xFEC80956,0xFEA8031D,0xFE480112,0xFF5C2106,0xFF401A06,0xFE7C11D2,0xDC000859, -0x8FFC265A,0xDC5828,0xDC5828,0xDC5828,0xDC5828,0xFEA43425,0xFEA43425,0xFEA43425,0xFE7C1CD4,0xFE7C1CD4,0xD2741B35,0xFE742C2E,0xFE742C2E,0xFE742C2E,0xFE2C07B5,0xFE2C07B5,0xDE2805CA,0xF4001142,0xF4001142,0xCA000195,0xA4001142,0x3445828,0x3445828,0x3445828,0xFA00268D,0xFA00268D,0xCC001BE3,0xD80027C5,0xD80027C5,0xBA00113A,0x9E001C1A,0x27FC5828, -0x27FC5828,0x96003783,0x84003833,0x6C00582B,0xFEBC4689,0xFECC50B4,0xDC5828,0xFE9835F1,0xFE6C2405,0xFE5414FE,0xFE4C0F71,0xFE140288,0xFEA04502,0xFE7433A5,0xFE141895,0xBA00113A,0x1D45828,0x1481144,0x1481144,0x1481144,0x1481144,0xFF180668,0xFF180668,0xFF180668,0xFEEC0004,0xFEEC0004,0xD2EC0001,0x3E81142,0x3E81142,0x3E81142,0xFE8801BD,0xFE8801BD, -0xD2900001,0x79FC1142,0x79FC1142,0xCA000195,0xA4001142,0x3E81142,0x3E81142,0x3E81142,0xFE8801BD,0xFE8801BD,0xD2900001,0x79FC1142,0x79FC1142,0xCA000195,0xA4001142,0x79FC1142,0x79FC1142,0xCA000195,0xA4001142,0xA4001142,0xFB340E74,0xFB440F20,0x1481144,0xFF180C35,0xFEF008A2,0xFEB80562,0xFEA8031D,0xFE480112,0xF9280EC9,0xFF040BEA,0x57FC1142,0xCA000195, -0x57FC1142,0x1D4039D,0xFFC4022D,0xFFAC00AD,0xFFA80000,0xBDFC039D,0xFF940152,0xFF7C0001,0xDFF8039D,0xFEF80000,0xE800039D,0xBDFC039D,0xFF940152,0xFF7C0001,0xDFF8039D,0xFEF80000,0xE800039D,0xDFF8039D,0xFEF80000,0xE800039D,0xE800039D,0xBDFC039D,0xFF940152,0xFF7C0001,0xDFF8039D,0xFEF80000,0xE800039D,0xDFF8039D,0xFEF80000,0xE800039D,0xE800039D,0xDFF8039D, -0xFEF80000,0xE800039D,0xE800039D,0xE800039D,0xFDCC0349,0x1F0039D,0xFFCC0355,0xFDBC02D4,0xFFA80242,0xFF5C0119,0xFF280000,0xFEB80000,0xF7CC0349,0xFFAC02C5,0xFF50001D,0xE800039D,0xD7FC039D,0x741B34,0x741B34,0x741B34,0x741B34,0x741B34,0x741B34,0x741B34,0x741B34,0x741B34,0x741B34,0xFE240328,0xFE240328,0xFE240328,0xFE240328,0xFE240328, -0xFE240328,0xAE000000,0xAE000000,0xAE000000,0x74000001,0xB01B32,0xB01B32,0xB01B32,0xB01B32,0xB01B32,0xB01B32,0x8C000A55,0x8C000A55,0x8C000A55,0x6800052D,0x1641B32,0x1641B32,0x1641B32,0x500010BD,0x3A001B32,0xFC681589,0x741B34,0x741B34,0xFE4C0F05,0xFE3C0A68,0xFE340665,0xFE340665,0xFE0C009D,0xFE3C13D0,0xFE340E29,0xD60000B9,0x8C000A55, -0xF81B32,}; -static const uint32_t g_etc1_to_bc7_m6_table245[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0xFC0000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000, 
-0x3740000,0x3FFC0000,0x3FFC0000,0x3FFC0000,0x7C000001,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3740000,0x3FFC0000,0x3FFC0000,0x3FFC0000,0x7C000001,0x3FFC0000,0x3FFC0000,0x3FFC0000,0x7C000001,0x7C000001,0x10C0000,0xFC0000,0xFC0000,0x1240000,0x13C0000,0x3540000,0x3540000,0x1A80000,0x1240000,0x13C0000,0xDFC0000,0x3FFC0000, -0xDFC0000,0x1B80000,0x1B80000,0x1B80000,0x1B80000,0x95FC0000,0x95FC0000,0x95FC0000,0xCBFC0000,0xCBFC0000,0xDA000001,0x95FC0000,0x95FC0000,0x95FC0000,0xCBFC0000,0xCBFC0000,0xDA000001,0xCBFC0000,0xCBFC0000,0xDA000001,0xDA000001,0x95FC0000,0x95FC0000,0x95FC0000,0xCBFC0000,0xCBFC0000,0xDA000001,0xCBFC0000,0xCBFC0000,0xDA000001,0xDA000001,0xCBFC0000, -0xCBFC0000,0xDA000001,0xDA000001,0xDA000001,0x9FC0000,0x3D40000,0x1B80000,0x7DFC0000,0xABFC0000,0xBFF80000,0xC5FC0000,0xD1F40000,0x55FC0000,0x95FC0000,0xBFF80000,0xDA000001,0xBFF80000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x12C9859,0xFF0C7CE9,0xFEF06704,0xFEE85F34,0xFEE863E1,0xFEC44648,0xFEB03B0D,0xFE942FCD,0xFE8821D8,0xEC7C2169,0xFEC465F9,0xFE9840B3,0xFE8031CA,0xFE581F56,0xFE380B41,0xF22C0846,0xFE2C29FE,0xFE041148,0xE6000602,0xC80022C6,0x3BC9859,0xFE7C6FCC,0xFE5C5F33,0xFE204581,0xFE082BC5,0xE80022E1,0xFE044866,0xEE002B26,0xD8001AEE,0xC00032E7,0x63FC9859, -0xD0006E98,0xBE00534D,0xAC006016,0x9600985B,0xFF007DA1,0xFB248F01,0xFD2892C4,0xFED45F89,0xFEA03FAE,0xFE6820C1,0xFE501419,0xFE2004B9,0xFEF87BA9,0xFEC45BAD,0xFE201CEC,0xD8001AEE,0x3BFC9859,0x19022C5,0xFF741AD8,0xFF6413E5,0xFF581144,0xFF5C15CD,0xFF380B84,0xFF300728,0xFF14063D,0xFEFC0031,0xECFC0265,0x59FC22C5,0xFF2016B4,0xFF081144,0xFED80D2D,0xFEA0025D, -0xEC900265,0xADFC22C5,0xFE081142,0xE6000602,0xC80022C6,0x59FC22C5,0xFF2016B4,0xFF081144,0xFED80D2D,0xFEA0025D,0xEC900265,0xADFC22C5,0xFE081142,0xE6000602,0xC80022C6,0xADFC22C5,0xFE081142,0xE6000602,0xC80022C6,0xC80022C6,0xFF781E0E,0xFF8C207D,0xFF8C211D,0xFF5C18D9,0xFF2411F8,0xFEEC0969,0xFED003E8,0xFE64019A,0xFD741E52,0xFF4817E4,0xFEA011BB,0xE6000602, -0x99FC22C5,0xE85F34,0xE85F34,0xE85F34,0xE85F34,0xFEB03B0D,0xFEB03B0D,0xFEB03B0D,0xFE8821D8,0xFE8821D8,0xDA7C1F05,0xFE8031CA,0xFE8031CA,0xFE8031CA,0xFE380B41,0xFE380B41,0xE62C0726,0xFE041148,0xFE041148,0xD000013D,0xAC001142,0x1545F33,0x1545F33,0x1545F33,0xFE082BC5,0xFE082BC5,0xD8001F8B,0xE20029C6,0xE20029C6,0xC000126A,0xA2001D03,0x2FFC5F33, -0x2FFC5F33,0x9C003C4B,0x8A003C13,0x72005F33,0xFECC4D68,0xF8E0580D,0xE85F34,0xFEAC3CD5,0xFE8029D5,0xFE5C1A05,0xFE501419,0xFE2004B9,0xFEB44BB9,0xFE903A4A,0xFE201C0B,0xC000126A,0x1E85F33,0x1581144,0x1581144,0x1581144,0x1581144,0xFF300728,0xFF300728,0xFF300728,0xFEFC0031,0xFEFC0031,0xDAFC0001,0x7FC1142,0x7FC1142,0x7FC1142,0xFEA0025D,0xFEA0025D, -0xDAA00001,0x85FC1142,0x85FC1142,0xD000013D,0xAC001142,0x7FC1142,0x7FC1142,0x7FC1142,0xFEA0025D,0xFEA0025D,0xDAA00001,0x85FC1142,0x85FC1142,0xD000013D,0xAC001142,0x85FC1142,0x85FC1142,0xD000013D,0xAC001142,0xAC001142,0xFF3C0EA4,0xFF4C0F40,0x1581144,0xFF240CB2,0xFF04093A,0xFEE40632,0xFED003E8,0xFE64019A,0xFF340ECD,0xFF180C3D,0x65FC1142,0xD000013D, 
-0x65FC1142,0x1DC0265,0xFFD00171,0xFFC00074,0xFFB80000,0xC9FC0265,0xFFAC00DA,0xFF940001,0xE5F80265,0xFF280000,0xEC000265,0xC9FC0265,0xFFAC00DA,0xFF940001,0xE5F80265,0xFF280000,0xEC000265,0xE5F80265,0xFF280000,0xEC000265,0xEC000265,0xC9FC0265,0xFFAC00DA,0xFF940001,0xE5F80265,0xFF280000,0xEC000265,0xE5F80265,0xFF280000,0xEC000265,0xEC000265,0xE5F80265, -0xFF280000,0xEC000265,0xEC000265,0xEC000265,0xFFD00225,0x1FC0265,0xF5D80244,0xFFC001D4,0xFFA80172,0xFF8400B4,0xFF500000,0xFEF40000,0xFBD40221,0xFFC001C4,0xFF700014,0xEC000265,0xDFF80265,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0x7C1F04,0xFE2404C8,0xFE2404C8,0xFE2404C8,0xFE2404C8,0xFE2404C8, -0xFE2404C8,0xBA000000,0xBA000000,0xBA000000,0x7C000001,0xBC1F02,0xBC1F02,0xBC1F02,0xBC1F02,0xBC1F02,0xBC1F02,0x98000BD5,0x98000BD5,0x98000BD5,0x6E0005F1,0x17C1F02,0x17C1F02,0x17C1F02,0x56001315,0x3E001F02,0xFE6C18FD,0x7C1F04,0x7C1F04,0xFE581220,0xFE500D24,0xFE3C0894,0xFE3C0894,0xFE140164,0xFC541735,0xFE341139,0xE60000CD,0x98000BD5, -0x10C1F02,}; -static const uint32_t g_etc1_to_bc7_m6_table246[] = { -0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1, -0x1,0x1,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x10C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000, -0x38C0000,0x4BFC0000,0x4BFC0000,0x4BFC0000,0x84000001,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x38C0000,0x4BFC0000,0x4BFC0000,0x4BFC0000,0x84000001,0x4BFC0000,0x4BFC0000,0x4BFC0000,0x84000001,0x84000001,0x71C0000,0x10C0000,0x10C0000,0x1380000,0x1500000,0x16C0000,0x16C0000,0x3C00000,0x1380000,0x1500000,0x1DF80000,0x4BFC0000, -0x1DF80000,0x1C80000,0x1C80000,0x1C80000,0x1C80000,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xD7FC0000,0xD7FC0000,0xE2000001,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xD7FC0000,0xD7FC0000,0xE2000001,0xD7FC0000,0xD7FC0000,0xE2000001,0xE2000001,0xAFFC0000,0xAFFC0000,0xAFFC0000,0xD7FC0000,0xD7FC0000,0xE2000001,0xD7FC0000,0xD7FC0000,0xE2000001,0xE2000001,0xD7FC0000, -0xD7FC0000,0xE2000001,0xE2000001,0xE2000001,0x43FC0000,0xBE40000,0x1C80000,0x9BFC0000,0xBDFC0000,0xCDFC0000,0xD3F80000,0xDBF80000,0x7DFC0000,0xAFFC0000,0xCDFC0000,0xE2000001,0xCDFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1349B71,0xFF18821D,0xFEFC6DCC,0xFEF06694,0xFEE868A1,0xFEC84CE6,0xFEC44258,0xFEA03499,0xFE942774,0xF0842481,0xFED0690D,0xFE9845F3,0xFE8C37EE,0xFE6422AE,0xFE400F8D,0xF6340916,0xFE3829B6,0xFE0811D2,0xEA000401,0xCC001F86,0x1C89B71,0xFE8875EC,0xFE706693,0xFE2C4A85,0xFE0831B5,0xEE00257D,0xFE0848EA,0xF4002CA6,0xE20019DE,0xC600308B,0x69FC9B71, -0xD6007334,0xC40054F9,0xB200601A,0x9A009B73,0xFF0C818D,0xFF2C9219,0xFF2C962C,0xFED46499,0xFEA84503,0xFE7025F5,0xFE5C1968,0xFE2807C2,0xFEF87F79,0xFED06046,0xFE3020B2,0xE20019DE,0x41FC9B71,0x19C1F85,0xFF801918,0xFF701365,0xFF681144,0xFF681419,0xFF4C0B8A,0xFF4407D9,0xFF2C05B5,0xFF140089,0xF10C016D,0x6BFC1F85,0xFF3815A4,0xFF201144,0xFEF00C35,0xFEB8031D, 
-0xF0A4016D,0xB7F81F85,0xFE381142,0xEA000401,0xCC001F86,0x6BFC1F85,0xFF3815A4,0xFF201144,0xFEF00C35,0xFEB8031D,0xF0A4016D,0xB7F81F85,0xFE381142,0xEA000401,0xCC001F86,0xB7F81F85,0xFE381142,0xEA000401,0xCC001F86,0xCC001F86,0xFF801BA1,0xFF8C1D9D,0xF5981E64,0xFF5C1729,0xFF401123,0xFEFC0989,0xFEE404A5,0xFE8C025D,0xFF781B86,0xFF5C1652,0xFED011A5,0xEA000401, -0xA3FC1F85,0xF06694,0xF06694,0xF06694,0xF06694,0xFEC44258,0xFEC44258,0xFEC44258,0xFE942774,0xFE942774,0xE2842315,0xFE8C37EE,0xFE8C37EE,0xFE8C37EE,0xFE400F8D,0xFE400F8D,0xF03408A6,0xFE0811D2,0xFE0811D2,0xDC0000F5,0xB4001142,0x1686693,0x1686693,0x1686693,0xFE0831B5,0xFE0831B5,0xDE002373,0xEE002C26,0xEE002C26,0xCC0013AA,0xA8001E33,0x39F86693, -0x39F86693,0xA600416D,0x90004023,0x78006693,0xFECC54C8,0xFEEC5F3D,0xF06694,0xFEB443E1,0xFE943075,0xFE681FB0,0xFE5C1968,0xFE2807C2,0xFEB45309,0xFE98413E,0xFE301FEE,0xCC0013AA,0x3FC6693,0x1681144,0x1681144,0x1681144,0x1681144,0xFF4407D9,0xFF4407D9,0xFF4407D9,0xFF140089,0xFF140089,0xE30C0001,0x1FFC1142,0x1FFC1142,0x1FFC1142,0xFEB8031D,0xFEB8031D, -0xE2B00001,0x91FC1142,0x91FC1142,0xDC0000F5,0xB4001142,0x1FFC1142,0x1FFC1142,0x1FFC1142,0xFEB8031D,0xFEB8031D,0xE2B00001,0x91FC1142,0x91FC1142,0xDC0000F5,0xB4001142,0x91FC1142,0x91FC1142,0xDC0000F5,0xB4001142,0xB4001142,0xFF580EC9,0xFB640F79,0x1681144,0xFF440D22,0xFF1809DA,0xFEF406DA,0xFEE404A5,0xFE8C025D,0xFD4C0F20,0xFF240CC8,0x75FC1142,0xDC0000F5, -0x75FC1142,0x1E4016D,0xFFDC00DD,0xFFCC0044,0xFFC80000,0xD5FC016D,0xFFB80082,0xFFAC0001,0xEBF8016D,0xFF580000,0xF000016D,0xD5FC016D,0xFFB80082,0xFFAC0001,0xEBF8016D,0xFF580000,0xF000016D,0xEBF8016D,0xFF580000,0xF000016D,0xF000016D,0xD5FC016D,0xFFB80082,0xFFAC0001,0xEBF8016D,0xFF580000,0xF000016D,0xEBF8016D,0xFF580000,0xF000016D,0xF000016D,0xEBF8016D, -0xFF580000,0xF000016D,0xF000016D,0xF000016D,0xF9E00152,0x27FC016D,0xF9E00154,0xFFD40120,0xFFBC00DA,0xFF98006D,0xFF780000,0xFF300000,0xFFDC0139,0xFFCC010D,0xFF90000A,0xF000016D,0xE5FC016D,0x842314,0x842314,0x842314,0x842314,0x842314,0x842314,0x842314,0x842314,0x842314,0x842314,0xFE3006C4,0xFE3006C4,0xFE3006C4,0xFE3006C4,0xFE3006C4, -0xFE3006C4,0xC6000000,0xC6000000,0xC6000000,0x84000001,0x2C42312,0x2C42312,0x2C42312,0x2C42312,0x2C42312,0x2C42312,0xA2000D39,0xA2000D39,0xA2000D39,0x7A0006B9,0x1942312,0x1942312,0x1942312,0x5C001595,0x42002312,0xFE6C1CED,0x842314,0x842314,0xFE681589,0xFE501004,0xFE3C0B14,0xFE3C0B14,0xFE1802A1,0xFE581AC1,0xFE3C14B4,0xF60000EA,0xA2000D39, -0x11C2312,}; -static const uint32_t g_etc1_to_bc7_m6_table247[] = { -0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x100000,0x200000, -0x200000,0x200000,0x200000,0x4000001,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0xC0000,0x100000,0x180000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x11C0000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000, -0x3A40000,0x57FC0000,0x57FC0000,0x57FC0000,0x8C000001,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x3A40000,0x57FC0000,0x57FC0000,0x57FC0000,0x8C000001,0x57FC0000,0x57FC0000,0x57FC0000,0x8C000001,0x8C000001,0xF2C0000,0x11C0000,0x11C0000,0x5480000,0x1640000,0x3800000,0x3800000,0x1DC0000,0x5480000,0x1640000,0x2BFC0000,0x57FC0000, 
-0x2BFC0000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xE3FC0000,0xE3FC0000,0xEA000001,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xE3FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xE3FC0000,0xEA000001,0xEA000001,0xC7FC0000,0xC7FC0000,0xC7FC0000,0xE3FC0000,0xE3FC0000,0xEA000001,0xE3FC0000,0xE3FC0000,0xEA000001,0xEA000001,0xE3FC0000, -0xE3FC0000,0xEA000001,0xEA000001,0xEA000001,0x7BFC0000,0x1F80000,0x1D80000,0xB9FC0000,0xD1FC0000,0xDDF80000,0xDFFC0000,0xE5FC0000,0xA3FC0000,0xC7FC0000,0xDDF80000,0xEA000001,0xDDF80000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x14097B9,0xFF248115,0xFF086EE8,0xFF006878,0xFF006825,0xFEDC4F04,0xFED0456C,0xFEAC3621,0xFEA02A0C,0xF49424D5,0xFEDC67C5,0xFEB047DF,0xFE983B2E,0xFE702442,0xFE5812B1,0xFA3C0922,0xFE38290A,0xFE2012C6,0xEE040282,0xD20C1C9A,0x1DC97B9,0xFEA0761C,0xFE846878,0xFE384B75,0xFE1434A9,0xF400251D,0xFE084686,0xFA002B02,0xE20015EA,0xCC002B8F,0x73F897B9, -0xDC0071AC,0xCA0050F9,0xB8005AEA,0x9E0097BB,0xFF207F85,0xFF2C8F25,0xF73C935D,0xFEF064CD,0xFEB446BE,0xFE7C292B,0xFE681CD5,0xFE400A4D,0xFF107DEB,0xFED0602E,0xFE3C238C,0xE20015EA,0x4DFC97B9,0x1A41C9D,0xFF8C1788,0xFF7C12FD,0xFF781144,0xFF7412AD,0xFF5C0BA2,0xFF500895,0xFF38056D,0xFF2C0121,0xF51C00B5,0x7BFC1C9A,0xFF4C14D3,0xFF381144,0xFF080B7D,0xFED8040D, -0xF4B800B5,0xBFF81C9A,0xFE6C1142,0xEC000266,0xD2001C9A,0x7BFC1C9A,0xFF4C14D3,0xFF381144,0xFF080B7D,0xFED8040D,0xF4B800B5,0xBFF81C9A,0xFE6C1142,0xEC000266,0xD2001C9A,0xBFF81C9A,0xFE6C1142,0xEC000266,0xD2001C9A,0xD2001C9A,0xFF901929,0xF9A01B05,0xFBA41B98,0xFF781595,0xFF481062,0xFF200A13,0xFEFC0585,0xFEA8031D,0xFF88194D,0xFF6814FA,0xFEE41193,0xEC000266, -0xADFC1C9A,0x1006878,0x1006878,0x1006878,0x1006878,0xFED0456C,0xFED0456C,0xFED0456C,0xFEA02A0C,0xFEA02A0C,0xEA942421,0xFE983B2E,0xFE983B2E,0xFE983B2E,0xFE5812B1,0xFE5812B1,0xF640090E,0xFE2012C6,0xFE2012C6,0xE20400DA,0xBC0C1142,0x17C6878,0x17C6878,0x17C6878,0xFE1434A9,0xFE1434A9,0xE8002431,0xFA002AF2,0xFA002AF2,0xD20011F2,0xB4001CBB,0x43F86878, -0x43F86878,0xB2004159,0x9C003F8B,0x7E00687B,0xFEE85775,0xFEEC61A9,0x1006878,0xFEC446F4,0xFE943411,0xFE782328,0xFE681CD5,0xFE400A4D,0xFED055B3,0xFEB04469,0xFE3C22C8,0xD20011F2,0x11FC6878,0x1781144,0x1781144,0x1781144,0x1781144,0xFF500895,0xFF500895,0xFF500895,0xFF2C0121,0xFF2C0121,0xEB1C0001,0x37FC1142,0x37FC1142,0x37FC1142,0xFED8040D,0xFED8040D, -0xEAC00001,0x9DFC1142,0x9DFC1142,0xE40000C1,0xBC001142,0x37FC1142,0x37FC1142,0x37FC1142,0xFED8040D,0xFED8040D,0xEAC00001,0x9DFC1142,0x9DFC1142,0xE40000C1,0xBC001142,0x9DFC1142,0x9DFC1142,0xE40000C1,0xBC001142,0xBC001142,0xFB6C0F20,0xFF6C0FA1,0x1781144,0xFD580D75,0xFF2C0A82,0xFEFC07D5,0xFEFC0585,0xFEA8031D,0xFF500F68,0xFF480D24,0x83FC1142,0xE40000C1, -0x83FC1142,0x1EC00B5,0xFFE80071,0xFFD80024,0xFFD80000,0xE3FC00B5,0xFFCC0044,0xFFC40000,0xF1F800B5,0xFF880000,0xF40000B5,0xE3FC00B5,0xFFCC0044,0xFFC40000,0xF1F800B5,0xFF880000,0xF40000B5,0xF1F800B5,0xFF880000,0xF40000B5,0xF40000B5,0xE3FC00B5,0xFFCC0044,0xFFC40000,0xF1F800B5,0xFF880000,0xF40000B5,0xF1F800B5,0xFF880000,0xF40000B5,0xF40000B5,0xF1F800B5, 
-0xFF880000,0xF40000B5,0xF40000B5,0xF40000B5,0xFDE800A2,0x67FC00B5,0xFDE800A4,0xFBE40091,0xFFD0006A,0xFFC00034,0xFFA00000,0xFF6C0000,0xFBE800A2,0xFFD80088,0xFFB00005,0xF40000B5,0xEDFC00B5,0x942420,0x942420,0x942420,0x942420,0x942420,0x942420,0x942420,0x942420,0x942420,0x942420,0xFE4407D9,0xFE4407D9,0xFE4407D9,0xFE4407D9,0xFE4407D9, -0xFE4407D9,0xD00C0000,0xD00C0000,0xD00C0000,0x8C0C0001,0xDC2420,0xDC2420,0xDC2420,0xDC2420,0xDC2420,0xDC2420,0xAE000C69,0xAE000C69,0xAE000C69,0x800005B9,0x1BC2420,0x1BC2420,0x1BC2420,0x66001572,0x48002422,0xFC881E08,0x942420,0x942420,0xFE6816CD,0xFE641154,0xFE4C0C44,0xFE4C0C44,0xFE2C0371,0xFA6C1C20,0xFE5015C5,0xFE000095,0xAE000C69, -0x1382420,}; -static const uint32_t g_etc1_to_bc7_m6_table248[] = { -0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x2C0000,0x580000, -0x580000,0x580000,0x580000,0xE000000,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x1C0001,0x200000,0x200000,0x200000,0x2C0000,0x3C0000,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x12C0001,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000, -0x1C00000,0x65F80000,0x65F80000,0x65F80000,0x96000000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x1C00000,0x65F80000,0x65F80000,0x65F80000,0x96000000,0x65F80000,0x65F80000,0x65F80000,0x96000000,0x96000000,0x9400000,0x12C0001,0x12C0001,0x1600000,0x5780000,0x3980000,0x3980000,0x1FC0000,0x1600000,0x5780000,0x3DF80000,0x65F80000, -0x3DF80000,0x1E80001,0x1E80001,0x1E80001,0x1E80001,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xF1F80000,0xF1F80000,0xF4000000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xF1F80000,0xF1F80000,0xF4000000,0xF1F80000,0xF1F80000,0xF4000000,0xF4000000,0xE3FC0000,0xE3FC0000,0xE3FC0000,0xF1F80000,0xF1F80000,0xF4000000,0xF1F80000,0xF1F80000,0xF4000000,0xF4000000,0xF1F80000, -0xF1F80000,0xF4000000,0xF4000000,0xF4000000,0xBBFC0000,0x67FC0000,0x1E80001,0xDBFC0000,0xE7FC0000,0xEDFC0000,0xEFFC0000,0xF3F40000,0xD1FC0000,0xE3FC0000,0xEDFC0000,0xF4000000,0xEDFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x14C9144,0xFF307DB4,0xFF206E1B,0xFF10687B,0xFF0C65C4,0xFEE84FCF,0xFEDC4763,0xFEC036E6,0xFEB82BFB,0xF8A42456,0xFEF464EC,0xFEC448BB,0xFEB03DB5,0xFE88258F,0xFE7015E6,0xFE500915,0xFE582844,0xFE38142D,0xF4100195,0xD81C19B9,0x1EC9144,0xFEB87431,0xFEA06878,0xFE584B5A,0xFE2C36E6,0xF8042454,0xFE084419,0xFA00296D,0xEE001109,0xD2002588,0x7BFC9144, -0xE0006E48,0xD0004AD6,0xBE005385,0xA6009144,0xFF207B7A,0xFD488994,0xFD488D3C,0xFF0062DC,0xFECC477A,0xFE982BD6,0xFE902035,0xFE580D46,0xFF1079CE,0xFEDC5F8A,0xFE582697,0xEE001109,0x59FC9144,0x1B019BB,0xFFA415E6,0xFF941283,0xFF8C1142,0xFF8C1155,0xFF740BEA,0xFF680989,0xFF4C0593,0xFF3801FD,0xF92C0035,0x8DFC19B8,0xFF6413E5,0xFF541142,0xFF200AFD,0xFEFC0521, 
-0xFACC0034,0xC7FC19B8,0xFEA01142,0xF2000124,0xD80019B8,0x8DFC19B8,0xFF6413E5,0xFF541142,0xFF200AFD,0xFEFC0521,0xFACC0034,0xC7FC19B8,0xFEA01142,0xF2000124,0xD80019B8,0xC7FC19B8,0xFEA01142,0xF2000124,0xD80019B8,0xD80019B8,0xFFA016ED,0xFFAC1835,0xFFAC18E6,0xFF881403,0xFF5C0FD6,0xFF380A61,0xFF2006AD,0xFED00448,0xFF941715,0xFF841392,0xFF101175,0xF2000124, -0xB9FC19B8,0x110687B,0x110687B,0x110687B,0x110687B,0xFEDC4763,0xFEDC4763,0xFEDC4763,0xFEB82BFB,0xFEB82BFB,0xF4A42422,0xFEB03DB5,0xFEB03DB5,0xFEB03DB5,0xFE7015E6,0xFE7015E6,0xFE500915,0xFE38142D,0xFE38142D,0xEC1800D9,0xC41C1145,0x1986878,0x1986878,0x1986878,0xFE2C36E6,0xFE2C36E6,0xF4082420,0xFA00295D,0xFA00295D,0xE2000EE1,0xC0001A74,0x51F86878, -0x51F86878,0xB8003F52,0xA6003D85,0x88006878,0xFEF458B2,0xFF0C61E3,0x110687B,0xFED448E2,0xFEB0370B,0xFE9026C5,0xFE902035,0xFE580D46,0xFED85741,0xFEBC46B5,0xFE5825EE,0xE2000EE1,0x21FC6878,0x18C1142,0x18C1142,0x18C1142,0x18C1142,0xFF680989,0xFF680989,0xFF680989,0xFF3801FD,0xFF3801FD,0xF52C0001,0x53FC1142,0x53FC1142,0x53FC1142,0xFEFC0521,0xFEFC0521, -0xF4D00001,0xABF81142,0xABF81142,0xEC000080,0xC4001144,0x53FC1142,0x53FC1142,0x53FC1142,0xFEFC0521,0xFEFC0521,0xF4D00001,0xABF81142,0xABF81142,0xEC000080,0xC4001144,0xABF81142,0xABF81142,0xEC000080,0xC4001144,0xC4001144,0xFF740F52,0xFD880FD2,0x18C1142,0xFF5C0DE5,0xFF540B50,0xFF2808B1,0xFF2006AD,0xFED00448,0xFD740F79,0xFF5C0D81,0x95FC1142,0xEC000080, -0x95FC1142,0x1F40032,0xFDF00022,0xFFEC000A,0xFFE80001,0xF1FC0032,0xFFE40011,0xFFE00000,0xF9F80032,0xFFC00000,0xF8000034,0xF1FC0032,0xFFE40011,0xFFE00000,0xF9F80032,0xFFC00000,0xF8000034,0xF9F80032,0xFFC00000,0xF8000034,0xF8000034,0xF1FC0032,0xFFE40011,0xFFE00000,0xF9F80032,0xFFC00000,0xF8000034,0xF9F80032,0xFFC00000,0xF8000034,0xF8000034,0xF9F80032, -0xFFC00000,0xF8000034,0xF8000034,0xF8000034,0xFDF40029,0xB7FC0032,0xF3F40032,0xFFEC0022,0xFFE80019,0xFFDC000D,0xFFCC0000,0xFFB00000,0xFFF00029,0xFFEC0029,0xFFD40001,0xF8000034,0xF7FC0032,0xA42422,0xA42422,0xA42422,0xA42422,0xA42422,0xA42422,0xA42422,0xA42422,0xA42422,0xA42422,0xFE5008B1,0xFE5008B1,0xFE5008B1,0xFE5008B1,0xFE5008B1, -0xFE5008B1,0xD8200001,0xD8200001,0xD8200001,0x961C0001,0x2F42420,0x2F42420,0x2F42420,0x2F42420,0x2F42420,0x2F42420,0xBA000AB5,0xBA000AB5,0xBA000AB5,0x8C000425,0x1F42420,0x1F42420,0x1F42420,0x6C001408,0x52002420,0xFE8C1E6A,0xA42422,0xA42422,0xFE84174D,0xFE781212,0xFE5C0D22,0xFE5C0D22,0xFE400431,0xFE741C52,0xFE641699,0xFE1000D0,0xBA000AB5, -0x1602420,}; -static const uint32_t g_etc1_to_bc7_m6_table249[] = { -0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x440000,0x880000, -0x880000,0x880000,0x880000,0x16000000,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x2C0001,0x300000,0x300000,0x300000,0x440000,0x600000,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x13C0001,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000, -0x1D80000,0x71F80000,0x71F80000,0x71F80000,0x9E000000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x1D80000,0x71F80000,0x71F80000,0x71F80000,0x9E000000,0x71F80000,0x71F80000,0x71F80000,0x9E000000,0x9E000000,0x1540000,0x13C0001,0x13C0001,0x3700000,0x58C0000,0x1B00000,0x1B00000,0x13FC0000,0x3700000,0x58C0000,0x4BFC0000,0x71F80000, 
-0x4BFC0000,0x1F80001,0x1F80001,0x1F80001,0x1F80001,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFDF80000,0xFDF80000,0xFC000000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFDF80000,0xFDF80000,0xFC000000,0xFDF80000,0xFDF80000,0xFC000000,0xFC000000,0xFBFC0000,0xFBFC0000,0xFBFC0000,0xFDF80000,0xFDF80000,0xFC000000,0xFDF80000,0xFDF80000,0xFC000000,0xFC000000,0xFDF80000, -0xFDF80000,0xFC000000,0xFC000000,0xFC000000,0xF5FC0000,0xE7FC0000,0x1F80001,0xF9FC0000,0xFBFC0000,0xFDF80000,0xFDF80000,0xFDF80000,0xF7FC0000,0xFBFC0000,0xFDF80000,0xFC000000,0xFDF80000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1588BEC,0xFF447ADF,0xFF2C6D47,0xFF20687B,0xFF186404,0xFF00509F,0xFEF44933,0xFED837A6,0xFECC2E12,0xFCB42426,0xFF0062AC,0xFED049C7,0xFEC43FBB,0xFE942717,0xFE7C18FE,0xFE68098D,0xFE702804,0xFE4C15EB,0xF8240111,0xDE2C1785,0x3FC8BEC,0xFECC72CF,0xFEB86878,0xFE704B2A,0xFE4C3941,0xFC182424,0xFE204349,0xFE082945,0xF4000D79,0xD80020D8,0x85F88BEC, -0xE6006BE4,0xD60045F6,0xC4004D71,0xAC008BEC,0xFF3C7814,0xFF4C849C,0xFF4C888C,0xFF1861D6,0xFED8483B,0xFEAC2E3E,0xFE902345,0xFE6C0FFB,0xFF3476CA,0xFEF45E5F,0xFE7028DB,0xF4000D79,0x63FC8BEC,0x1BC1783,0xFFB014AE,0xFFA0122B,0xFF9C1142,0xFF98106D,0xFF800C3A,0xFF740A69,0xFF6405EB,0xFF4C030A,0xFD3C0005,0x9BFC1783,0xFF7C132D,0xFF6C1142,0xFF380ACD,0xFF140631, -0xFEE00004,0xCFF81783,0xFED41142,0xF8000074,0xDE001784,0x9BFC1783,0xFF7C132D,0xFF6C1142,0xFF380ACD,0xFF140631,0xFEE00004,0xCFF81783,0xFED41142,0xF8000074,0xDE001784,0xCFF81783,0xFED41142,0xF8000074,0xDE001784,0xDE001784,0xFFB41556,0xFFAC1685,0xF7BC1703,0xFF9812DD,0xFF700F7A,0xFF480B1E,0xFF2807B2,0xFEE40559,0xFFA41576,0xFF841292,0xFF301168,0xF8000074, -0xC1FC1783,0x120687B,0x120687B,0x120687B,0x120687B,0xFEF44933,0xFEF44933,0xFEF44933,0xFECC2E12,0xFECC2E12,0xFCB42422,0xFEC43FBB,0xFEC43FBB,0xFEC43FBB,0xFE7C18FE,0xFE7C18FE,0xFE68098D,0xFE4C15EB,0xFE4C15EB,0xF42800D9,0xCC2C1145,0x1B06878,0x1B06878,0x1B06878,0xFE4C3941,0xFE4C3941,0xFC182420,0xFE082945,0xFE082945,0xE8000C8D,0xC6001888,0x5DF46878, -0x5DF46878,0xC4003D82,0xAC003B8D,0x90006878,0xFF045983,0xF71C62B2,0x120687B,0xFEE84AEA,0xFEC4399B,0xFEAC29BA,0xFE902345,0xFE6C0FFB,0xFEF8582E,0xFED0482D,0xFE70284B,0xE8000C8D,0x31FC6878,0x19C1142,0x19C1142,0x19C1142,0x19C1142,0xFF740A69,0xFF740A69,0xFF740A69,0xFF4C030A,0xFF4C030A,0xFD3C0001,0x6BFC1142,0x6BFC1142,0x6BFC1142,0xFF140631,0xFF140631, -0xFCE00001,0xB7F81142,0xB7F81142,0xF8000050,0xCC001144,0x6BFC1142,0x6BFC1142,0x6BFC1142,0xFF140631,0xFF140631,0xFCE00001,0xB7F81142,0xB7F81142,0xF8000050,0xCC001144,0xB7F81142,0xB7F81142,0xF8000050,0xCC001144,0xCC001144,0xFF900F79,0xFF8C101A,0x19C1142,0xFF780E45,0xFF5C0C05,0xFF3809A1,0xFF2807B2,0xFEE40559,0xFF780FCD,0xFF680E10,0xA3FC1142,0xF8000050, -0xA3FC1142,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFF80001,0xFDFC0002,0xFFF80001,0xFFF80000,0xFFF80002,0xFFF00000,0xFC000004,0xFDFC0002,0xFFF80001,0xFFF80000,0xFFF80002,0xFFF00000,0xFC000004,0xFFF80002,0xFFF00000,0xFC000004,0xFC000004,0xFDFC0002,0xFFF80001,0xFFF80000,0xFFF80002,0xFFF00000,0xFC000004,0xFFF80002,0xFFF00000,0xFC000004,0xFC000004,0xFFF80002, 
-0xFFF00000,0xFC000004,0xFC000004,0xFC000004,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFF80001,0xFFF40001,0xFFF40000,0xFFEC0000,0xFDFC0002,0xFDFC0002,0xFFF40000,0xFC000004,0xFFF80002,0xB42422,0xB42422,0xB42422,0xB42422,0xB42422,0xB42422,0xB42422,0xB42422,0xB42422,0xB42422,0xFE680989,0xFE680989,0xFE680989,0xFE680989,0xFE680989, -0xFE680989,0xE0300001,0xE0300001,0xE0300001,0x9E2C0001,0x30C2420,0x30C2420,0x30C2420,0x30C2420,0x30C2420,0x30C2420,0xCC000949,0xCC000949,0xCC000949,0x980002FD,0xBFC2420,0xBFC2420,0xBFC2420,0x780012C8,0x5A002420,0xFEAC1E85,0xB42422,0xB42422,0xFE9017EA,0xFE8C12CA,0xFE780DFA,0xFE780DFA,0xFE5404ED,0xFE901C99,0xFE7816FA,0xFE200140,0xCC000949, -0x1802420,}; -static const uint32_t g_etc1_to_bc7_m6_table250[] = { -0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0x5C0000,0xB80000, -0xB80000,0xB80000,0xB80000,0x1E000000,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x3C0001,0x8400000,0x8400000,0x8400000,0x5C0000,0x800000,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x14C0001,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000, -0x1F00000,0x7DF80000,0x7DF80000,0x7DF80000,0xA6000000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x1F00000,0x7DF80000,0x7DF80000,0x7DF80000,0xA6000000,0x7DF80000,0x7DF80000,0x7DF80000,0xA6000000,0xA6000000,0x1640000,0x14C0001,0x14C0001,0x1840000,0x5A00000,0x3C40000,0x3C40000,0x27FC0000,0x1840000,0x5A00000,0x5BFC0000,0x7DF80000, -0x5BFC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1608334,0xFF4473EF,0xFF3867CB,0xFF2C6383,0xFF245F04,0xFF0C4D93,0xFF0046EB,0xFEE4366A,0xFED82DC2,0xFEC42422,0xFF0C5CE4,0xFEE8465F,0xFED03D53,0xFEAC2617,0xFE941956,0xFE740A69,0xFE7C2534,0xFE5814AB,0xFA3C009A,0xE03C142D,0x13FC8330,0xFEE46CB7,0xFEC86380,0xFE88483A,0xFE643831,0xFE2C2422,0xFE2C3F39,0xFE082639,0xF4000A2D,0xDE001B24,0x8BFC8330, -0xEC006584,0xDC003FEA,0xCA004541,0xB0008330,0xFF3C711C,0xFF4C7D04,0xF9608037,0xFF185C82,0xFEEC452B,0xFEBC2D3F,0xFEA82336,0xFE7410B2,0xFF346F66,0xFF00597C,0xFE7C2771,0xF4000A2D,0x6DFC8330,0x1C4142B,0xFFB411F3,0xFFAC0FDB,0xFFA40F22,0xFFA40E41,0xFF8C0ACE,0xFF800955,0xFF700543,0xFF6402EA,0xFF4C0001,0xA9FC142B,0xFF8810A9,0xFF7C0F20,0xFF4C09A2,0xFF2C05A5, 
-0xFEF80000,0xD5F8142B,0xFEF00F20,0xFE000010,0xE000142C,0xA9FC142B,0xFF8810A9,0xFF7C0F20,0xFF4C09A2,0xFF2C05A5,0xFEF80000,0xD5F8142B,0xFEF00F20,0xFE000010,0xE000142C,0xD5F8142B,0xFEF00F20,0xFE000010,0xE000142C,0xE000142C,0xFFB41242,0xF9C01343,0xF9C013BB,0xFF98106D,0xFF840D72,0xFF6409D1,0xFF500709,0xFEFC04FD,0xFFB41262,0xFF980FFE,0xFF440F44,0xFE000010, -0xC9FC142B,0x12C6383,0x12C6383,0x12C6383,0x12C6383,0xFF0046EB,0xFF0046EB,0xFF0046EB,0xFED82DC2,0xFED82DC2,0xFEC42422,0xFED03D53,0xFED03D53,0xFED03D53,0xFE941956,0xFE941956,0xFE740A69,0xFE5814AB,0xFE5814AB,0xF838007E,0xD23C0F21,0x1C06380,0x1C06380,0x1C06380,0xFE643831,0xFE643831,0xFE2C2422,0xFE082639,0xFE082639,0xEE0009C5,0xCC001478,0x65F86380, -0x65F86380,0xCA003962,0xB2003611,0x96006380,0xFF1055F5,0xFD285DF2,0x12C6383,0xFEF8483A,0xFED437FA,0xFEBC28FE,0xFEA82336,0xFE7410B2,0xFEF8548A,0xFEDC4611,0xFE7C26E1,0xEE0009C5,0x3DF86380,0x1A40F22,0x1A40F22,0x1A40F22,0x1A40F22,0xFF800955,0xFF800955,0xFF800955,0xFF6402EA,0xFF6402EA,0xFF4C0001,0x7BFC0F20,0x7BFC0F20,0x7BFC0F20,0xFF2C05A5,0xFF2C05A5, -0xFEF80000,0xBFF80F20,0xBFF80F20,0xFC00000D,0xD2000F20,0x7BFC0F20,0x7BFC0F20,0x7BFC0F20,0xFF2C05A5,0xFF2C05A5,0xFEF80000,0xBFF80F20,0xBFF80F20,0xFC00000D,0xD2000F20,0xBFF80F20,0xBFF80F20,0xFC00000D,0xD2000F20,0xD2000F20,0xFF900DB1,0xFBA40E1D,0x1A40F22,0xFF880C92,0xFF700AA5,0xFF5808CA,0xFF500709,0xFEFC04FD,0xFF940DC8,0xFF840C82,0xADFC0F20,0xFC00000D, -0xADFC0F20,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0xC42422,0xC42422,0xC42422,0xC42422,0xC42422,0xC42422,0xC42422,0xC42422,0xC42422,0xC42422,0xFE740A69,0xFE740A69,0xFE740A69,0xFE740A69,0xFE740A69, -0xFE740A69,0xE8400001,0xE8400001,0xE8400001,0xA63C0001,0x3242420,0x3242420,0x3242420,0x3242420,0x3242420,0x3242420,0xD80007F9,0xD80007F9,0xD80007F9,0xA20001F4,0x17FC2420,0x17FC2420,0x17FC2420,0x7E0011A4,0x62002420,0xFEAC1EF5,0xC42422,0xC42422,0xFEA01865,0xFE941379,0xFE880EBA,0xFE880EBA,0xFE6C05D2,0xFEA01D14,0xFE9417A4,0xFE3401BA,0xD80007F9, -0x1A42420,}; -static const uint32_t g_etc1_to_bc7_m6_table251[] = { -0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0x740000,0xE80000, -0xE80000,0xE80000,0xE80000,0x26000000,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x4C0001,0x540000,0x540000,0x540000,0x740000,0xA40000,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0x15C0001,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000, -0xDFC0000,0x89F80000,0x89F80000,0x89F80000,0xAE000000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0xDFC0000,0x89F80000,0x89F80000,0x89F80000,0xAE000000,0x89F80000,0x89F80000,0x89F80000,0xAE000000,0xAE000000,0x3740000,0x15C0001,0x15C0001,0x1980000,0x5B40000,0x1DC0000,0x1DC0000,0x3BFC0000,0x1980000,0x5B40000,0x69FC0000,0x89F80000, 
-0x69FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x16879B4,0xFF506BE7,0xFF406114,0xFF385D2B,0xFF305924,0xFF18496F,0xFF0C4373,0xFEF03492,0xFEE42CC6,0xFED42422,0xFF185634,0xFEF441BB,0xFEDC3993,0xFEB8246F,0xFEA018E6,0xFE8C0B51,0xFE9421B4,0xFE701283,0xFC4C003A,0xE44C10AD,0x1FFC79B0,0xFEF0656B,0xFED85D2B,0xFEA0448A,0xFE703629,0xFE442420,0xFE383A6D,0xFE0822E9,0xFA00077D,0xE2001579,0x91FC79B0, -0xF2005E40,0xE0003A48,0xD0003CE5,0xB40079B0,0xFF4C694B,0xFB647402,0xFD6876DF,0xFF2C56B2,0xFF004133,0xFED42B7A,0xFEBC21F6,0xFE84109D,0xFF3467C6,0xFF1053A2,0xFE94250F,0xFA00077D,0x75FC79B0,0x1C810AB,0xFFBC0EC6,0xFFB40D22,0xFFAC0C82,0xFFA40BD1,0xFF9808EE,0xFF8C07B5,0xFF7C0453,0xFF700262,0xFF5C0001,0xAFFC10AB,0xFF940DC1,0xFF880C80,0xFF5807F2,0xFF3804A9, -0xFF100000,0xD7FC10AB,0xFF080C80,0xFE140000,0xE40010AC,0xAFFC10AB,0xFF940DC1,0xFF880C80,0xFF5807F2,0xFF3804A9,0xFF100000,0xD7FC10AB,0xFF080C80,0xFE140000,0xE40010AC,0xD7FC10AB,0xFF080C80,0xFE140000,0xE40010AC,0xE40010AC,0xFFBC0F15,0xFBC40FDB,0xFDC81043,0xFFB00D9E,0xFF980B2E,0xFF6C0809,0xFF5005C9,0xFF20040D,0xFFB40F22,0xFF980D4E,0xFF540C99,0xFE140000, -0xCDFC10AB,0x1385D2B,0x1385D2B,0x1385D2B,0x1385D2B,0xFF0C4373,0xFF0C4373,0xFF0C4373,0xFEE42CC6,0xFEE42CC6,0xFED42422,0xFEDC3993,0xFEDC3993,0xFEDC3993,0xFEA018E6,0xFEA018E6,0xFE8C0B51,0xFE701283,0xFE701283,0xFA480032,0xD64C0C81,0x1D05D2B,0x1D05D2B,0x1D05D2B,0xFE703629,0xFE703629,0xFE442420,0xFE0822E9,0xFE0822E9,0xF4000759,0xD2001024,0x6DF85D2B, -0x6DF85D2B,0xD00034E6,0xB8002FC1,0x9C005D2C,0xFF205092,0xFF2C5816,0x1385D2B,0xFF004403,0xFEEC354B,0xFEBC276E,0xFEBC21F6,0xFE84109D,0xFF104F65,0xFEF441EE,0xFE942496,0xF4000759,0x45FC5D2B,0x1AC0C82,0x1AC0C82,0x1AC0C82,0x1AC0C82,0xFF8C07B5,0xFF8C07B5,0xFF8C07B5,0xFF700262,0xFF700262,0xFF5C0001,0x87FC0C80,0x87FC0C80,0x87FC0C80,0xFF3804A9,0xFF3804A9, -0xFF100000,0xC5F80C80,0xC5F80C80,0xFE140000,0xD6000C80,0x87FC0C80,0x87FC0C80,0x87FC0C80,0xFF3804A9,0xFF3804A9,0xFF100000,0xC5F80C80,0xC5F80C80,0xFE140000,0xD6000C80,0xC5F80C80,0xC5F80C80,0xFE140000,0xD6000C80,0xD6000C80,0xFFA00B50,0xFFAC0B95,0x1AC0C82,0xFF980A69,0xFF8408D1,0xFF640721,0xFF5005C9,0xFF20040D,0xFF940B68,0xFF840A52,0xB5FC0C80,0xFE140000, -0xB5FC0C80,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0xD42422,0xD42422,0xD42422,0xD42422,0xD42422,0xD42422,0xD42422,0xD42422,0xD42422,0xD42422,0xFE8C0B51,0xFE8C0B51,0xFE8C0B51,0xFE8C0B51,0xFE8C0B51, -0xFE8C0B51,0xF0500001,0xF0500001,0xF0500001,0xAE4C0001,0x33C2420,0x33C2420,0x33C2420,0x33C2420,0x33C2420,0x33C2420,0xE800069D,0xE800069D,0xE800069D,0xA8000124,0x23FC2420,0x23FC2420,0x23FC2420,0x8A001074,0x6A002420,0xFECC1F02,0xD42422,0xD42422,0xFEBC18F5,0xFEA81429,0xFE980F82,0xFE980F82,0xFE8006B2,0xFAB41D8D,0xFEA01829,0xFE50027D,0xE800069D, -0x1C82420,}; -static const uint32_t g_etc1_to_bc7_m6_table252[] = { -0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x28C0000,0x1200000, -0x1200000,0x1200000,0x1200000,0x2E000001,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0x600000,0xA640000,0xA640000,0xA640000,0x28C0000,0xCC0000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x1700000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000, -0x29FC0000,0x97F80000,0x97F80000,0x97F80000,0xB6000001,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x29FC0000,0x97F80000,0x97F80000,0x97F80000,0xB6000001,0x97F80000,0x97F80000,0x97F80000,0xB6000001,0xB6000001,0x1880000,0x1700000,0x1700000,0x1AC0000,0x3CC0000,0x1F40000,0x1F40000,0x51FC0000,0x1AC0000,0x3CC0000,0x7BFC0000,0x97F80000, -0x7BFC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1746F9A,0xFF5C636D,0xFF4C59E2,0xFF445671,0xFF4452B0,0xFF244509,0xFF183FC9,0xFF0832A0,0xFEFC2BCC,0xFEE82420,0xFF244F2A,0xFF003CE1,0xFEF43599,0xFECC22EC,0xFEB8187A,0xFEA40C6D,0xFEA01E26,0xFE88107D,0xFE5C0008,0xE6600D21,0x2BFC6F9A,0xFEFC5DB5,0xFEE85671,0xFEAC40A4,0xFE8833FB,0xFE602420,0xFE5835B7,0xFE142035,0xFA000585,0xE2000FFD,0x97FC6F9A, -0xF80056D6,0xE6003482,0xD0003409,0xB8006F9A,0xFF5860F6,0xFF6C6A48,0xFF6C6D09,0xFF2C503A,0xFF0C3CF6,0xFEDC2926,0xFECC20F9,0xFEA01085,0xFF505FB2,0xFF184DB4,0xFEB0221D,0xFA000585,0x7DF86F9A,0x1D00D22,0xFFC40B96,0xFFB80A52,0xFFB809D9,0xFFB00948,0xFFA40709,0xFF980614,0xFF880362,0xFF7C01DD,0xFF700000,0xB7FC0D21,0xFFA00AD2,0xFF9409D9,0xFF700633,0xFF5803BA, 
-0xFF2C0000,0xDDF40D21,0xFF2009D9,0xFE4C0000,0xE6000D21,0xB7FC0D21,0xFFA00AD2,0xFF9409D9,0xFF700633,0xFF5803BA,0xFF2C0000,0xDDF40D21,0xFF2009D9,0xFE4C0000,0xE6000D21,0xDDF40D21,0xFF2009D9,0xFE4C0000,0xE6000D21,0xE6000D21,0xFFBC0BF4,0xFFCC0C66,0xFFCC0CC6,0xFFB40AAE,0xFF9808C1,0xFF7C063E,0xFF680484,0xFF380334,0xFFC40C0E,0xFFA40A8D,0xFF6409F2,0xFE4C0000, -0xD3FC0D21,0x1445671,0x1445671,0x1445671,0x1445671,0xFF183FC9,0xFF183FC9,0xFF183FC9,0xFEFC2BCC,0xFEFC2BCC,0xFEE82420,0xFEF43599,0xFEF43599,0xFEF43599,0xFEB8187A,0xFEB8187A,0xFEA40C6D,0xFE88107D,0xFE88107D,0xFC600008,0xDA6009D9,0x1E05671,0x1E05671,0x1E05671,0xFE8833FB,0xFE8833FB,0xFE602420,0xFE142035,0xFE142035,0xFA000575,0xD8000BF6,0x75FC5671, -0x75FC5671,0xD6003080,0xBE002931,0xA2005672,0xFF2C4B21,0xF9405231,0x1445671,0xFF103F92,0xFEEC3269,0xFEDC25DD,0xFECC20F9,0xFEA01085,0xFF2049F1,0xFF003DFD,0xFEB021B9,0xFA000575,0x51FC5671,0x1B809D9,0x1B809D9,0x1B809D9,0x1B809D9,0xFF980614,0xFF980614,0xFF980614,0xFF7C01DD,0xFF7C01DD,0xFF700000,0x93FC09D9,0x93FC09D9,0x93FC09D9,0xFF5803BA,0xFF5803BA, -0xFF2C0000,0xCBF809D9,0xCBF809D9,0xFE4C0000,0xDA0009D9,0x93FC09D9,0x93FC09D9,0x93FC09D9,0xFF5803BA,0xFF5803BA,0xFF2C0000,0xCBF809D9,0xCBF809D9,0xFE4C0000,0xDA0009D9,0xCBF809D9,0xCBF809D9,0xFE4C0000,0xDA0009D9,0xDA0009D9,0xF9B00908,0xFFAC0928,0x1B809D9,0xFF980832,0xFF8406F4,0xFF740590,0xFF680484,0xFF380334,0xFFA80908,0xFF980808,0xBDF809D9,0xFE4C0000, -0xBDF809D9,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0xE82420,0xE82420,0xE82420,0xE82420,0xE82420,0xE82420,0xE82420,0xE82420,0xE82420,0xE82420,0xFEA40C6D,0xFEA40C6D,0xFEA40C6D,0xFEA40C6D,0xFEA40C6D, -0xFEA40C6D,0xFA600000,0xFA600000,0xFA600000,0xB6600001,0x1582420,0x1582420,0x1582420,0x1582420,0x1582420,0x1582420,0xFA000565,0xFA000565,0xFA000565,0xB4000082,0x31F82420,0x31F82420,0x31F82420,0x96000F3A,0x72002422,0xF8E01F81,0xE82420,0xE82420,0xFECC1974,0xFEB414F4,0xFEAC109D,0xFEAC109D,0xFE9407B4,0xFEBC1DC9,0xFEB418A0,0xFE60034D,0xFA000565, -0x1EC2420,}; -static const uint32_t g_etc1_to_bc7_m6_table253[] = { -0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x2A40000,0x1500000, -0x1500000,0x1500000,0x1500000,0x36000001,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x700000,0x780000,0x780000,0x780000,0x2A40000,0xEC0000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x1800000,0x41FC0000,0x41FC0000,0x41FC0000,0x41FC0000,0x41FC0000, -0x41FC0000,0xA1FC0000,0xA1FC0000,0xA1FC0000,0xBE000001,0x41FC0000,0x41FC0000,0x41FC0000,0x41FC0000,0x41FC0000,0x41FC0000,0xA1FC0000,0xA1FC0000,0xA1FC0000,0xBE000001,0xA1FC0000,0xA1FC0000,0xA1FC0000,0xBE000001,0xBE000001,0x5980000,0x1800000,0x1800000,0x7BC0000,0x3E00000,0x13FC0000,0x13FC0000,0x65FC0000,0x7BC0000,0x3E00000,0x89FC0000,0xA1FC0000, 
-0x89FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x17C672A,0xFF685C5D,0xFF5853E2,0xFF5050D1,0xFF444D70,0xFF30415D,0xFF243CB9,0xFF1430F0,0xFF082AE0,0xFEF82420,0xFF304956,0xFF0C38E9,0xFF00325D,0xFED821A0,0xFECC1848,0xFEB00D61,0xFEAC1B66,0xFE940EED,0xFE700004,0xE8700A59,0x37FC672A,0xFF145745,0xFEFC50D1,0xFEB83D84,0xFEA03223,0xFE782420,0xFE703187,0xFE2C1E1D,0xFC0004C5,0xE8000BE9,0x9DFC672A, -0xFC0050E5,0xE6003022,0xD6002CD1,0xBC00672A,0xFF645A02,0xFF6C6288,0xF578652A,0xFF444AE6,0xFF143961,0xFEF0276A,0xFEDC2036,0xFEB010A0,0xFF5058A2,0xFF304877,0xFEB41FE1,0xFC0004C5,0x83FC672A,0x1D40A56,0xFFC8092D,0xFFC00825,0xFFC007C1,0xFFBC074C,0xFFB0058D,0xFFA404C8,0xFF9402A6,0xFF880179,0xFF800000,0xC1FC0A56,0xFFAC0882,0xFFA007C1,0xFF7C04DB,0xFF6402EA, -0xFF400001,0xE1F80A56,0xFF3807C1,0xFE7C0000,0xE8000A59,0xC1FC0A56,0xFFAC0882,0xFFA007C1,0xFF7C04DB,0xFF6402EA,0xFF400001,0xE1F80A56,0xFF3807C1,0xFE7C0000,0xE8000A59,0xE1F80A56,0xFF3807C1,0xFE7C0000,0xE8000A59,0xE8000A59,0xFDCC0965,0xFFCC09C6,0xFFCC0A26,0xFFC00865,0xFFAC06E5,0xFF7C050E,0xFF7C0385,0xFF50028A,0xFFC4096E,0xFFB0085D,0xFF7407D1,0xFE7C0000, -0xD9FC0A56,0x15050D1,0x15050D1,0x15050D1,0x15050D1,0xFF243CB9,0xFF243CB9,0xFF243CB9,0xFF082AE0,0xFF082AE0,0xFEF82420,0xFF00325D,0xFF00325D,0xFF00325D,0xFECC1848,0xFECC1848,0xFEB00D61,0xFE940EED,0xFE940EED,0xFE700004,0xDE7007C1,0x1F450D1,0x1F450D1,0x1F450D1,0xFEA03223,0xFEA03223,0xFE782420,0xFE2C1E1D,0xFE2C1E1D,0xFC0004C1,0xDE0008E2,0x7FF850D1, -0x7FF850D1,0xDC002D24,0xC40023CD,0xA60050D2,0xFF3C469D,0xFF4C4CD1,0x15050D1,0xFF183C30,0xFF002FDD,0xFEE82482,0xFEDC2036,0xFEB010A0,0xFF344580,0xFF103A91,0xFEB41F7D,0xFC0004C1,0x5DF850D1,0x1C007C1,0x1C007C1,0x1C007C1,0x1C007C1,0xFFA404C8,0xFFA404C8,0xFFA404C8,0xFF880179,0xFF880179,0xFF800000,0x9FFC07C1,0x9FFC07C1,0x9FFC07C1,0xFF6402EA,0xFF6402EA, -0xFF400001,0xD1F807C1,0xD1F807C1,0xFE7C0000,0xDE0007C1,0x9FFC07C1,0x9FFC07C1,0x9FFC07C1,0xFF6402EA,0xFF6402EA,0xFF400001,0xD1F807C1,0xD1F807C1,0xFE7C0000,0xDE0007C1,0xD1F807C1,0xD1F807C1,0xFE7C0000,0xDE0007C1,0xDE0007C1,0xFDB80708,0xF7BC0745,0x1C007C1,0xFFA40681,0xFF980568,0xFF7C0465,0xFF7C0385,0xFF50028A,0xFDB0070A,0xFFA4065D,0xC3FC07C1,0xFE7C0000, -0xC3FC07C1,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0xF82420,0xF82420,0xF82420,0xF82420,0xF82420,0xF82420,0xF82420,0xF82420,0xF82420,0xF82420,0xFEB00D61,0xFEB00D61,0xFEB00D61,0xFEB00D61,0xFEB00D61, -0xFEB00D61,0xFE700004,0xFE700004,0xFE700004,0xBE700001,0x1702420,0x1702420,0x1702420,0x1702420,0x1702420,0x1702420,0xFA0004B5,0xFA0004B5,0xFA0004B5,0xBC00002D,0x3DF82420,0x3DF82420,0x3DF82420,0x9C000E2A,0x7A002422,0xFEEC1F85,0xF82420,0xF82420,0xFED81A11,0xFED015B1,0xFEC41174,0xFEC41174,0xFE9C08C8,0xFED81E0A,0xFED01962,0xFE74040D,0xFA0004B5, -0x9FC2420,}; -static const uint32_t g_etc1_to_bc7_m6_table254[] = { -0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x2BC0000,0x1800000, -0x1800000,0x1800000,0x1800000,0x3E000001,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x800000,0x880000,0x880000,0x880000,0x2BC0000,0x1100000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x1900000,0x59FC0000,0x59FC0000,0x59FC0000,0x59FC0000,0x59FC0000, -0x59FC0000,0xADFC0000,0xADFC0000,0xADFC0000,0xC6000001,0x59FC0000,0x59FC0000,0x59FC0000,0x59FC0000,0x59FC0000,0x59FC0000,0xADFC0000,0xADFC0000,0xADFC0000,0xC6000001,0xADFC0000,0xADFC0000,0xADFC0000,0xC6000001,0xC6000001,0xDA80000,0x1900000,0x1900000,0x1D00000,0x3F40000,0x31FC0000,0x31FC0000,0x79FC0000,0x1D00000,0x3F40000,0x99FC0000,0xADFC0000, -0x99FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x1845F3A,0xFF7455C5,0xFF644E42,0xFF584B89,0xFF504888,0xFF3C3DE9,0xFF3039D9,0xFF202F60,0xFF142A0C,0xFF082420,0xFF4443BD,0xFF183541,0xFF0C2F69,0xFEF02070,0xFED81820,0xFEC40E48,0xFEB81906,0xFEA00DB5,0xFE880034,0xEC8007E1,0x43FC5F3A,0xFF205129,0xFF084B89,0xFED83AA0,0xFEB8306B,0xFE902420,0xFE882DB7,0xFE381C71,0xFE0804BD,0xEA000892,0xA3FC5F3A, -0xFE084B86,0xEC002C5A,0xDC002651,0xC0005F3A,0xFF645372,0xF9805B72,0xF9805D5A,0xFF4445D6,0xFF20362E,0xFEFC25F8,0xFEE81F71,0xFEC010D5,0xFF58526E,0xFF3043C7,0xFEBC1E2D,0xFE0804BD,0x8BFC5F3A,0x1D807E2,0xFFD006F2,0xFFCC0635,0xFFC805E9,0xFFC40576,0xFFBC0441,0xFFB003A4,0xFFA00202,0xFFA00121,0xFF900000,0xC9FC07E1,0xFFB8067A,0xFFAC05E9,0xFF9403AB,0xFF7C0232, 
-0xFF580001,0xE5F807E1,0xFF5405E9,0xFEAC0000,0xEC0007E1,0xC9FC07E1,0xFFB8067A,0xFFAC05E9,0xFF9403AB,0xFF7C0232,0xFF580001,0xE5F807E1,0xFF5405E9,0xFEAC0000,0xEC0007E1,0xE5F807E1,0xFF5405E9,0xFEAC0000,0xEC0007E1,0xEC0007E1,0xFFD00715,0xF5D80798,0xF5D807BD,0xFFC4065E,0xFFAC0545,0xFFA003D8,0xFF8C02BA,0xFF6401E1,0xFFD0072E,0xFFC0063D,0xFF9005F6,0xFEAC0000, -0xDFF807E1,0x1584B89,0x1584B89,0x1584B89,0x1584B89,0xFF3039D9,0xFF3039D9,0xFF3039D9,0xFF142A0C,0xFF142A0C,0xFF082420,0xFF0C2F69,0xFF0C2F69,0xFF0C2F69,0xFED81820,0xFED81820,0xFEC40E48,0xFEA00DB5,0xFEA00DB5,0xFE880034,0xE28005E9,0x7FC4B86,0x7FC4B86,0x7FC4B86,0xFEB8306B,0xFEB8306B,0xFE902420,0xFE381C71,0xFE381C71,0xFE0804BD,0xE200064D,0x85FC4B86, -0x85FC4B86,0xE0002A76,0xCA001ED9,0xAC004B86,0xFF4C4244,0xFF4C4811,0x1584B89,0xFF2C38D6,0xFF142DA1,0xFEF8232A,0xFEE81F71,0xFEC010D5,0xFF344100,0xFF18376B,0xFEBC1DC9,0xFE0804BD,0x65FC4B86,0x1C805E9,0x1C805E9,0x1C805E9,0x1C805E9,0xFFB003A4,0xFFB003A4,0xFFB003A4,0xFFA00121,0xFFA00121,0xFF900000,0xABFC05E9,0xABFC05E9,0xABFC05E9,0xFF7C0232,0xFF7C0232, -0xFF580001,0xD7F805E9,0xD7F805E9,0xFEAC0000,0xE20005E9,0xABFC05E9,0xABFC05E9,0xABFC05E9,0xFF7C0232,0xFF7C0232,0xFF580001,0xD7F805E9,0xD7F805E9,0xFEAC0000,0xE20005E9,0xD7F805E9,0xD7F805E9,0xFEAC0000,0xE20005E9,0xE20005E9,0xFFBC0550,0xFBC4057D,0x1C805E9,0xFFB404EA,0xFFAC0424,0xFF940371,0xFF8C02BA,0xFF6401E1,0xFFB4055A,0xFFB004E4,0xCBFC05E9,0xFEAC0000, -0xCBFC05E9,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1082420,0x1082420,0x1082420,0x1082420,0x1082420,0x1082420,0x1082420,0x1082420,0x1082420,0x1082420,0xFEC40E48,0xFEC40E48,0xFEC40E48,0xFEC40E48,0xFEC40E48, -0xFEC40E48,0xFE880034,0xFE880034,0xFE880034,0xC6800001,0x1882420,0x1882420,0x1882420,0x1882420,0x1882420,0x1882420,0xFE0804BD,0xFE0804BD,0xFE0804BD,0xC6000002,0x49F82420,0x49F82420,0x49F82420,0xA6000D41,0x82002422,0xF9002000,0x1082420,0x1082420,0xFEE81A90,0xFEE41675,0xFED41248,0xFED41248,0xFEAC09CD,0xFAEC1E85,0xFED019E2,0xFE90052A,0xFE0804BD, -0x19FC2420,}; -static const uint32_t g_etc1_to_bc7_m6_table255[] = { -0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0xD40000,0x1B00000, -0x1B00000,0x1B00000,0x1B00000,0x46000001,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x900000,0x4980000,0x4980000,0x4980000,0xD40000,0x1300000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x1A00000,0x71FC0000,0x71FC0000,0x71FC0000,0x71FC0000,0x71FC0000, -0x71FC0000,0xB9FC0000,0xB9FC0000,0xB9FC0000,0xCE000001,0x71FC0000,0x71FC0000,0x71FC0000,0x71FC0000,0x71FC0000,0x71FC0000,0xB9FC0000,0xB9FC0000,0xB9FC0000,0xCE000001,0xB9FC0000,0xB9FC0000,0xB9FC0000,0xCE000001,0xCE000001,0x1BC0000,0x1A00000,0x1A00000,0x1E40000,0x1BFC0000,0x4FFC0000,0x4FFC0000,0x8DFC0000,0x1E40000,0x1BFC0000,0xA7FC0000,0xB9FC0000, 
-0xA7FC0000,0x1FC0001,0x1FC0001,0x1FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, -0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFBFC0001,0xF7FC0001,0x1FC0001,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0x1FC0001,0xFDFC0001,0xFFFC0000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000, -0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xFFFC0000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0001,0xF7FC0001,0xF7FC0001,0xFDFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0001,0xFDFC0001,0xFFF00000,0xFE000000, -0xFFFC0000,0x18C57CA,0xFF744F85,0xFF6C48F9,0xFF644691,0xFF5C43F0,0xFF443AAD,0xFF4436EC,0xFF2C2DF0,0xFF202950,0xFF182420,0xFF443E8D,0xFF2431E9,0xFF182CBD,0xFEFC1F6C,0xFEEC1816,0xFED00F64,0xFED81711,0xFEB80CAD,0xFE940094,0xEE9005C1,0x4FFC57CA,0xFF2C4B75,0xFF184691,0xFEF037C0,0xFECC2F04,0xFEA82420,0xFEA02A47,0xFE641B02,0xFE140585,0xEE0005E5,0xA9FC57CA, -0xFE284691,0xF2002962,0xDC002041,0xC40057CA,0xFF744D72,0xFD885442,0xFD88560A,0xFF5440F0,0xFF343326,0xFF1024C0,0xFEF81EEA,0xFED81141,0xFF704C95,0xFF503F25,0xFED81CB7,0xFE140585,0x93FC57CA,0x1E005C2,0xFFDC051E,0xFFD4048E,0xFFD00451,0xFFD00402,0xFFC40311,0xFFBC02A8,0xFFAC0176,0xFFAC00CD,0xFFA00000,0xCFFC05C1,0xFFC004C1,0xFFB80451,0xFFA002A3,0xFF88019A, -0xFF700001,0xE7FC05C1,0xFF6C0451,0xFEE00000,0xEE0005C1,0xCFFC05C1,0xFFC004C1,0xFFB80451,0xFFA002A3,0xFF88019A,0xFF700001,0xE7FC05C1,0xFF6C0451,0xFEE00000,0xEE0005C1,0xE7FC05C1,0xFF6C0451,0xFEE00000,0xEE0005C1,0xEE0005C1,0xFFD8052A,0xF7DC0584,0xF7DC05A5,0xFFD404BA,0xFFC003D1,0xFFA802C2,0xFFA001F9,0xFF7C0161,0xFFD0053E,0xFFC80479,0xFF98045A,0xFEE00000, -0xE1FC05C1,0x1644691,0x1644691,0x1644691,0x1644691,0xFF4436EC,0xFF4436EC,0xFF4436EC,0xFF202950,0xFF202950,0xFF182420,0xFF182CBD,0xFF182CBD,0xFF182CBD,0xFEEC1816,0xFEEC1816,0xFED00F64,0xFEB80CAD,0xFEB80CAD,0xFE940094,0xE6900451,0x15FC4691,0x15FC4691,0x15FC4691,0xFECC2F04,0xFECC2F04,0xFEA82420,0xFE641B02,0xFE641B02,0xFE140585,0xE6000461,0x8DFC4691, -0x8DFC4691,0xE6002822,0xD0001A55,0xB2004692,0xFF4C3E24,0xF96043A4,0x1644691,0xFF3C35A5,0xFF202BD5,0xFF08226D,0xFEF81EEA,0xFED81141,0xFF3C3D49,0xFF303446,0xFED81C66,0xFE140585,0x6FFC4691,0x1D00451,0x1D00451,0x1D00451,0x1D00451,0xFFBC02A8,0xFFBC02A8,0xFFBC02A8,0xFFAC00CD,0xFFAC00CD,0xFFA00000,0xB7FC0451,0xB7FC0451,0xB7FC0451,0xFF88019A,0xFF88019A, -0xFF700001,0xDDF40451,0xDDF40451,0xFEE00000,0xE6000451,0xB7FC0451,0xB7FC0451,0xB7FC0451,0xFF88019A,0xFF88019A,0xFF700001,0xDDF40451,0xDDF40451,0xFEE00000,0xE6000451,0xDDF40451,0xDDF40451,0xFEE00000,0xE6000451,0xE6000451,0xF9CC03F5,0xFFCC03F5,0x1D00451,0xFFC4039D,0xFFAC0304,0xFFA80271,0xFFA001F9,0xFF7C0161,0xFBC803F5,0xFBC0039D,0xD3FC0451,0xFEE00000, -0xD3FC0451,0x1FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFDFC0002,0xFFFC0001,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFE000000,0xFFFC0000,0xFFFC0000,0xFE000000,0xFE000000,0xFFFC0000, 
-0xFFFC0000,0xFE000000,0xFE000000,0xFE000000,0xFDFC0002,0xF7FC0002,0xF7FC0002,0xFDFC0002,0xFFFC0001,0xFFFC0000,0xFFFC0000,0xFFFC0000,0xFDFC0002,0xFDFC0002,0xFFFC0000,0xFE000000,0xFFFC0000,0x1182420,0x1182420,0x1182420,0x1182420,0x1182420,0x1182420,0x1182420,0x1182420,0x1182420,0x1182420,0xFED00F64,0xFED00F64,0xFED00F64,0xFED00F64,0xFED00F64, -0xFED00F64,0xFE940094,0xFE940094,0xFE940094,0xCE900001,0x1A02420,0x1A02420,0x1A02420,0x1A02420,0x1A02420,0x1A02420,0xFE140585,0xFE140585,0xFE140585,0xCE0C0001,0x55F82420,0x55F82420,0x55F82420,0xB2000C49,0x8A002422,0xFF0C2008,0x1182420,0x1182420,0xFF041B34,0xFEF81741,0xFEDC133D,0xFEDC133D,0xFEC00AE1,0xFEF41EAD,0xFEE41AAA,0xFE9C0631,0xFE140585, -0x27FC2420,}; -const uint32_t *g_etc1_to_bc7_m6_table[] = { -g_etc1_to_bc7_m6_table0, g_etc1_to_bc7_m6_table1, g_etc1_to_bc7_m6_table2, g_etc1_to_bc7_m6_table3, g_etc1_to_bc7_m6_table4, g_etc1_to_bc7_m6_table5, g_etc1_to_bc7_m6_table6, g_etc1_to_bc7_m6_table7, g_etc1_to_bc7_m6_table8, g_etc1_to_bc7_m6_table9, g_etc1_to_bc7_m6_table10, g_etc1_to_bc7_m6_table11, g_etc1_to_bc7_m6_table12, g_etc1_to_bc7_m6_table13, g_etc1_to_bc7_m6_table14, g_etc1_to_bc7_m6_table15, -g_etc1_to_bc7_m6_table16, g_etc1_to_bc7_m6_table17, g_etc1_to_bc7_m6_table18, g_etc1_to_bc7_m6_table19, g_etc1_to_bc7_m6_table20, g_etc1_to_bc7_m6_table21, g_etc1_to_bc7_m6_table22, g_etc1_to_bc7_m6_table23, g_etc1_to_bc7_m6_table24, g_etc1_to_bc7_m6_table25, g_etc1_to_bc7_m6_table26, g_etc1_to_bc7_m6_table27, g_etc1_to_bc7_m6_table28, g_etc1_to_bc7_m6_table29, g_etc1_to_bc7_m6_table30, g_etc1_to_bc7_m6_table31, -g_etc1_to_bc7_m6_table32, g_etc1_to_bc7_m6_table33, g_etc1_to_bc7_m6_table34, g_etc1_to_bc7_m6_table35, g_etc1_to_bc7_m6_table36, g_etc1_to_bc7_m6_table37, g_etc1_to_bc7_m6_table38, g_etc1_to_bc7_m6_table39, g_etc1_to_bc7_m6_table40, g_etc1_to_bc7_m6_table41, g_etc1_to_bc7_m6_table42, g_etc1_to_bc7_m6_table43, g_etc1_to_bc7_m6_table44, g_etc1_to_bc7_m6_table45, g_etc1_to_bc7_m6_table46, g_etc1_to_bc7_m6_table47, -g_etc1_to_bc7_m6_table48, g_etc1_to_bc7_m6_table49, g_etc1_to_bc7_m6_table50, g_etc1_to_bc7_m6_table51, g_etc1_to_bc7_m6_table52, g_etc1_to_bc7_m6_table53, g_etc1_to_bc7_m6_table54, g_etc1_to_bc7_m6_table55, g_etc1_to_bc7_m6_table56, g_etc1_to_bc7_m6_table57, g_etc1_to_bc7_m6_table58, g_etc1_to_bc7_m6_table59, g_etc1_to_bc7_m6_table60, g_etc1_to_bc7_m6_table61, g_etc1_to_bc7_m6_table62, g_etc1_to_bc7_m6_table63, -g_etc1_to_bc7_m6_table64, g_etc1_to_bc7_m6_table65, g_etc1_to_bc7_m6_table66, g_etc1_to_bc7_m6_table67, g_etc1_to_bc7_m6_table68, g_etc1_to_bc7_m6_table69, g_etc1_to_bc7_m6_table70, g_etc1_to_bc7_m6_table71, g_etc1_to_bc7_m6_table72, g_etc1_to_bc7_m6_table73, g_etc1_to_bc7_m6_table74, g_etc1_to_bc7_m6_table75, g_etc1_to_bc7_m6_table76, g_etc1_to_bc7_m6_table77, g_etc1_to_bc7_m6_table78, g_etc1_to_bc7_m6_table79, -g_etc1_to_bc7_m6_table80, g_etc1_to_bc7_m6_table81, g_etc1_to_bc7_m6_table82, g_etc1_to_bc7_m6_table83, g_etc1_to_bc7_m6_table84, g_etc1_to_bc7_m6_table85, g_etc1_to_bc7_m6_table86, g_etc1_to_bc7_m6_table87, g_etc1_to_bc7_m6_table88, g_etc1_to_bc7_m6_table89, g_etc1_to_bc7_m6_table90, g_etc1_to_bc7_m6_table91, g_etc1_to_bc7_m6_table92, g_etc1_to_bc7_m6_table93, g_etc1_to_bc7_m6_table94, g_etc1_to_bc7_m6_table95, -g_etc1_to_bc7_m6_table96, g_etc1_to_bc7_m6_table97, g_etc1_to_bc7_m6_table98, g_etc1_to_bc7_m6_table99, g_etc1_to_bc7_m6_table100, g_etc1_to_bc7_m6_table101, g_etc1_to_bc7_m6_table102, g_etc1_to_bc7_m6_table103, g_etc1_to_bc7_m6_table104, g_etc1_to_bc7_m6_table105, g_etc1_to_bc7_m6_table106, 
g_etc1_to_bc7_m6_table107, g_etc1_to_bc7_m6_table108, g_etc1_to_bc7_m6_table109, g_etc1_to_bc7_m6_table110, g_etc1_to_bc7_m6_table111, -g_etc1_to_bc7_m6_table112, g_etc1_to_bc7_m6_table113, g_etc1_to_bc7_m6_table114, g_etc1_to_bc7_m6_table115, g_etc1_to_bc7_m6_table116, g_etc1_to_bc7_m6_table117, g_etc1_to_bc7_m6_table118, g_etc1_to_bc7_m6_table119, g_etc1_to_bc7_m6_table120, g_etc1_to_bc7_m6_table121, g_etc1_to_bc7_m6_table122, g_etc1_to_bc7_m6_table123, g_etc1_to_bc7_m6_table124, g_etc1_to_bc7_m6_table125, g_etc1_to_bc7_m6_table126, g_etc1_to_bc7_m6_table127, -g_etc1_to_bc7_m6_table128, g_etc1_to_bc7_m6_table129, g_etc1_to_bc7_m6_table130, g_etc1_to_bc7_m6_table131, g_etc1_to_bc7_m6_table132, g_etc1_to_bc7_m6_table133, g_etc1_to_bc7_m6_table134, g_etc1_to_bc7_m6_table135, g_etc1_to_bc7_m6_table136, g_etc1_to_bc7_m6_table137, g_etc1_to_bc7_m6_table138, g_etc1_to_bc7_m6_table139, g_etc1_to_bc7_m6_table140, g_etc1_to_bc7_m6_table141, g_etc1_to_bc7_m6_table142, g_etc1_to_bc7_m6_table143, -g_etc1_to_bc7_m6_table144, g_etc1_to_bc7_m6_table145, g_etc1_to_bc7_m6_table146, g_etc1_to_bc7_m6_table147, g_etc1_to_bc7_m6_table148, g_etc1_to_bc7_m6_table149, g_etc1_to_bc7_m6_table150, g_etc1_to_bc7_m6_table151, g_etc1_to_bc7_m6_table152, g_etc1_to_bc7_m6_table153, g_etc1_to_bc7_m6_table154, g_etc1_to_bc7_m6_table155, g_etc1_to_bc7_m6_table156, g_etc1_to_bc7_m6_table157, g_etc1_to_bc7_m6_table158, g_etc1_to_bc7_m6_table159, -g_etc1_to_bc7_m6_table160, g_etc1_to_bc7_m6_table161, g_etc1_to_bc7_m6_table162, g_etc1_to_bc7_m6_table163, g_etc1_to_bc7_m6_table164, g_etc1_to_bc7_m6_table165, g_etc1_to_bc7_m6_table166, g_etc1_to_bc7_m6_table167, g_etc1_to_bc7_m6_table168, g_etc1_to_bc7_m6_table169, g_etc1_to_bc7_m6_table170, g_etc1_to_bc7_m6_table171, g_etc1_to_bc7_m6_table172, g_etc1_to_bc7_m6_table173, g_etc1_to_bc7_m6_table174, g_etc1_to_bc7_m6_table175, -g_etc1_to_bc7_m6_table176, g_etc1_to_bc7_m6_table177, g_etc1_to_bc7_m6_table178, g_etc1_to_bc7_m6_table179, g_etc1_to_bc7_m6_table180, g_etc1_to_bc7_m6_table181, g_etc1_to_bc7_m6_table182, g_etc1_to_bc7_m6_table183, g_etc1_to_bc7_m6_table184, g_etc1_to_bc7_m6_table185, g_etc1_to_bc7_m6_table186, g_etc1_to_bc7_m6_table187, g_etc1_to_bc7_m6_table188, g_etc1_to_bc7_m6_table189, g_etc1_to_bc7_m6_table190, g_etc1_to_bc7_m6_table191, -g_etc1_to_bc7_m6_table192, g_etc1_to_bc7_m6_table193, g_etc1_to_bc7_m6_table194, g_etc1_to_bc7_m6_table195, g_etc1_to_bc7_m6_table196, g_etc1_to_bc7_m6_table197, g_etc1_to_bc7_m6_table198, g_etc1_to_bc7_m6_table199, g_etc1_to_bc7_m6_table200, g_etc1_to_bc7_m6_table201, g_etc1_to_bc7_m6_table202, g_etc1_to_bc7_m6_table203, g_etc1_to_bc7_m6_table204, g_etc1_to_bc7_m6_table205, g_etc1_to_bc7_m6_table206, g_etc1_to_bc7_m6_table207, -g_etc1_to_bc7_m6_table208, g_etc1_to_bc7_m6_table209, g_etc1_to_bc7_m6_table210, g_etc1_to_bc7_m6_table211, g_etc1_to_bc7_m6_table212, g_etc1_to_bc7_m6_table213, g_etc1_to_bc7_m6_table214, g_etc1_to_bc7_m6_table215, g_etc1_to_bc7_m6_table216, g_etc1_to_bc7_m6_table217, g_etc1_to_bc7_m6_table218, g_etc1_to_bc7_m6_table219, g_etc1_to_bc7_m6_table220, g_etc1_to_bc7_m6_table221, g_etc1_to_bc7_m6_table222, g_etc1_to_bc7_m6_table223, -g_etc1_to_bc7_m6_table224, g_etc1_to_bc7_m6_table225, g_etc1_to_bc7_m6_table226, g_etc1_to_bc7_m6_table227, g_etc1_to_bc7_m6_table228, g_etc1_to_bc7_m6_table229, g_etc1_to_bc7_m6_table230, g_etc1_to_bc7_m6_table231, g_etc1_to_bc7_m6_table232, g_etc1_to_bc7_m6_table233, g_etc1_to_bc7_m6_table234, g_etc1_to_bc7_m6_table235, g_etc1_to_bc7_m6_table236, g_etc1_to_bc7_m6_table237, 
g_etc1_to_bc7_m6_table238, g_etc1_to_bc7_m6_table239, -g_etc1_to_bc7_m6_table240, g_etc1_to_bc7_m6_table241, g_etc1_to_bc7_m6_table242, g_etc1_to_bc7_m6_table243, g_etc1_to_bc7_m6_table244, g_etc1_to_bc7_m6_table245, g_etc1_to_bc7_m6_table246, g_etc1_to_bc7_m6_table247, g_etc1_to_bc7_m6_table248, g_etc1_to_bc7_m6_table249, g_etc1_to_bc7_m6_table250, g_etc1_to_bc7_m6_table251, g_etc1_to_bc7_m6_table252, g_etc1_to_bc7_m6_table253, g_etc1_to_bc7_m6_table254, g_etc1_to_bc7_m6_table255, -}; diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc index 3e7610ff53..8244550959 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc @@ -491,4 +491,4 @@ {17,31,10897},{16,31,12077},{13,31,6285},{13,31,6285},{8,31,68},{4,31,7686},{0,31,1341},{27,31,968},{27,31,968},{27,31,968},{25,31,325},{31,21,1513},{23,31,605},{23,31,605},{11,31,0},{31,26,1513},{11,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{16,0,9248},{16,0,9248},{16,0,9248},{16,0,9248},{12,31,3626}, {12,31,3626},{12,31,3626},{8,31,68},{0,31,1341},{0,31,1341},{21,31,17476},{20,31,14998},{20,31,14098},{18,31,10672},{20,31,16018},{15,31,8154},{15,31,6218},{9,31,200},{10,31,11338},{0,31,1613},{28,31,1041},{27,31,801},{27,31,680},{26,31,232},{29,29,1473},{26,31,753},{24,31,442},{14,31,0},{31,28,1473},{14,31,0},{20,31,14098},{20,31,14098},{20,31,14098},{18,31,10672},{17,31,11453},{15,31,6218},{15,31,6218}, {9,31,200},{6,31,7270},{0,31,1613},{27,31,680},{27,31,680},{27,31,680},{26,31,232},{28,28,1105},{24,31,442},{24,31,442},{14,31,0},{28,28,1105},{14,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{17,0,9248},{17,0,9248},{17,0,9248},{17,0,9248},{13,31,3929},{13,31,3929},{13,31,3929},{9,31,200},{0,31,1613}, -{0,31,1613},
\ No newline at end of file +{0,31,1613}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc index 2441fbe859..fad45fe22d 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc @@ -491,4 +491,4 @@ {34,63,10841},{34,63,12089},{26,63,6206},{26,63,6206},{17,63,74},{9,63,7678},{0,63,1341},{54,63,937},{54,63,937},{54,63,937},{51,63,305},{63,43,1513},{47,63,605},{47,63,605},{22,63,1},{62,53,1513},{22,63,1},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{0,63,0},{63,63,0},{0,63,0},{32,0,9256},{32,0,9256},{32,0,9256},{32,0,9256},{23,63,3650}, {23,63,3650},{23,63,3650},{17,63,74},{0,63,1341},{0,63,1341},{43,63,17392},{40,63,15021},{40,63,14060},{37,63,10673},{40,63,16013},{32,63,8261},{29,63,6166},{19,63,194},{20,63,11338},{1,63,1594},{57,63,1041},{56,63,822},{54,63,697},{52,63,234},{63,51,1473},{51,63,737},{49,63,442},{28,63,1},{63,57,1473},{28,63,1},{40,63,14060},{40,63,14060},{40,63,14060},{37,63,10673},{34,63,11401},{29,63,6166},{29,63,6166}, {19,63,194},{12,63,7270},{1,63,1594},{54,63,697},{54,63,697},{54,63,697},{52,63,234},{63,46,1105},{49,63,442},{49,63,442},{28,63,1},{63,54,1105},{28,63,1},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{0,63,0},{63,63,0},{0,63,0},{34,0,9256},{34,0,9256},{34,0,9256},{34,0,9256},{26,63,3898},{26,63,3898},{26,63,3898},{19,63,194},{1,63,1594}, -{1,63,1594},
\ No newline at end of file +{1,63,1594}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_45.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_45.inc index 0bca0bbddc..fbaf988d78 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_45.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_45.inc @@ -478,4 +478,4 @@ {8,31,11312},{8,31,11249},{7,31,6499},{7,31,6499},{4,31,260},{3,31,10457},{0,31,2642},{13,31,925},{13,31,925},{13,31,925},{12,31,397},{15,22,1513},{11,31,794},{11,31,794},{7,31,4},{14,27,1513},{7,31,4},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{0,31,0},{15,31,0},{0,31,0},{8,0,9376},{8,0,9376},{8,0,9376},{8,0,9376},{6,31,3074}, {6,31,3074},{6,31,3074},{4,31,260},{0,31,2642},{0,31,2642},{8,31,58848},{7,31,39683},{6,31,25130},{6,31,19007},{8,31,54849},{6,31,27132},{5,31,8569},{4,31,756},{4,31,51302},{0,31,5046},{13,31,1078},{13,31,806},{13,31,637},{12,31,365},{15,26,1473},{12,31,978},{12,31,617},{8,31,9},{14,29,1473},{8,31,9},{9,31,13604},{9,31,13604},{9,31,13604},{8,31,11184},{8,31,10433},{7,31,6339},{7,31,6339}, {5,31,424},{4,31,9713},{0,31,2930},{13,31,637},{13,31,637},{13,31,637},{12,31,365},{14,27,1105},{12,31,617},{12,31,617},{8,31,9},{13,29,1105},{8,31,9},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{0,31,0},{15,31,0},{0,31,0},{8,0,9248},{8,0,9248},{8,0,9248},{8,0,9248},{6,31,3330},{6,31,3330},{6,31,3330},{5,31,424},{0,31,2930}, -{0,31,2930},
\ No newline at end of file +{0,31,2930}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_alpha_33.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_alpha_33.inc index 10c94153ad..3b9d7022e7 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_alpha_33.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_alpha_33.inc @@ -478,4 +478,4 @@ {4,7,11305},{4,7,11209},{3,7,6489},{3,7,6489},{2,7,272},{1,7,10377},{0,7,2642},{6,7,1040},{6,7,1040},{6,7,1040},{6,7,416},{7,6,1537},{6,7,929},{6,7,929},{3,7,9},{7,6,1513},{3,7,9},{7,7,242},{7,7,170},{7,7,121},{7,7,49},{7,7,242},{7,7,98},{7,7,49},{0,7,0},{7,7,98},{0,7,0},{4,0,9280},{4,0,9280},{4,0,9280},{4,0,9280},{3,7,3125}, {3,7,3125},{3,7,3125},{2,7,272},{0,7,2642},{0,7,2642},{4,7,59414},{4,7,41414},{3,7,24952},{3,7,19100},{4,7,55014},{3,7,27085},{2,7,10021},{2,7,656},{1,7,52310},{0,7,5046},{7,7,1142},{7,7,1070},{7,7,1021},{6,7,416},{7,7,1538},{6,7,1025},{6,7,625},{4,7,4},{6,7,1529},{4,7,4},{5,7,13964},{5,7,13964},{5,7,13964},{4,7,11305},{4,7,10505},{3,7,6665},{3,7,6665}, {2,7,592},{2,7,9973},{0,7,2930},{7,7,1021},{7,7,1021},{7,7,1021},{6,7,416},{7,6,1105},{6,7,625},{6,7,625},{4,7,4},{6,7,1129},{4,7,4},{7,7,242},{7,7,170},{7,7,121},{7,7,49},{7,7,242},{7,7,98},{7,7,49},{0,7,0},{7,7,98},{0,7,0},{4,0,9280},{4,0,9280},{4,0,9280},{4,0,9280},{3,7,3301},{3,7,3301},{3,7,3301},{2,7,592},{0,7,2930}, -{0,7,2930},
\ No newline at end of file +{0,7,2930}, diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h new file mode 100644 index 0000000000..d501a2af6e --- /dev/null +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h @@ -0,0 +1,297 @@ +// basisu_transcoder_uastc.h +#pragma once +#include "basisu_transcoder_internal.h" + +namespace basist +{ + struct color_quad_u8 + { + uint8_t m_c[4]; + }; + + const uint32_t TOTAL_UASTC_MODES = 19; + const uint32_t UASTC_MODE_INDEX_SOLID_COLOR = 8; + + const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS2 = 30; + const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS3 = 11; + const uint32_t TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS = 19; + + extern const uint8_t g_uastc_mode_weight_bits[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_weight_ranges[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_endpoint_ranges[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_subsets[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_planes[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_comps[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_has_etc1_bias[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_has_bc1_hint0[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_has_bc1_hint1[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_has_alpha[TOTAL_UASTC_MODES]; + extern const uint8_t g_uastc_mode_is_la[TOTAL_UASTC_MODES]; + + struct astc_bc7_common_partition2_desc + { + uint8_t m_bc7; + uint16_t m_astc; + bool m_invert; + }; + + extern const astc_bc7_common_partition2_desc g_astc_bc7_common_partitions2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2]; + + struct bc73_astc2_common_partition_desc + { + uint8_t m_bc73; + uint16_t m_astc2; + uint8_t k; // 0-5 - how to modify the BC7 3-subset pattern to match the ASTC pattern (LSB=invert) + }; + + extern const bc73_astc2_common_partition_desc g_bc7_3_astc2_common_partitions[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS]; + + struct astc_bc7_common_partition3_desc + { + uint8_t m_bc7; + uint16_t m_astc; + uint8_t m_astc_to_bc7_perm; // converts ASTC to BC7 partition using g_astc_bc7_partition_index_perm_tables[][] + }; + + extern const astc_bc7_common_partition3_desc g_astc_bc7_common_partitions3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3]; + + extern const uint8_t g_astc_bc7_patterns2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][16]; + extern const uint8_t g_astc_bc7_patterns3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][16]; + extern const uint8_t g_bc7_3_astc2_patterns2[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][16]; + + extern const uint8_t g_astc_bc7_pattern2_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][3]; + extern const uint8_t g_astc_bc7_pattern3_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][3]; + extern const uint8_t g_bc7_3_astc2_patterns2_anchors[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][3]; + + extern const uint32_t g_uastc_mode_huff_codes[TOTAL_UASTC_MODES + 1][2]; + + extern const uint8_t g_astc_to_bc7_partition_index_perm_tables[6][3]; + extern const uint8_t g_bc7_to_astc_partition_index_perm_tables[6][3]; // inverse of g_astc_to_bc7_partition_index_perm_tables + + extern const uint8_t* s_uastc_to_bc1_weights[6]; + + uint32_t bc7_convert_partition_index_3_to_2(uint32_t p, uint32_t k); + + inline uint32_t astc_interpolate(uint32_t l, uint32_t h, uint32_t w, bool srgb) + { + if (srgb) + { + l = (l << 8) | 0x80; + h = (h << 8) | 0x80; + } + else + { + l = (l << 8) | l; + h = (h << 8) | h; + } + + uint32_t k = (l * (64 - w) + h * w + 32) >> 6; + + 
return k >> 8; + } + + struct astc_block_desc + { + int m_weight_range; // weight BISE range + + int m_subsets; // number of ASTC partitions + int m_partition_seed; // partition pattern seed + int m_cem; // color endpoint mode used by all subsets + + int m_ccs; // color component selector (dual plane only) + bool m_dual_plane; // true if dual plane + + // Weight and endpoint BISE values. + // Note these values are NOT linear, they must be BISE encoded. See Table 97 and Table 107. + uint8_t m_endpoints[18]; // endpoint values, in RR GG BB etc. order + uint8_t m_weights[64]; // weight index values, raster order, in P0 P1, P0 P1, etc. or P0, P0, P0, P0, etc. order + }; + + const uint32_t BC7ENC_TOTAL_ASTC_RANGES = 21; + + // See tables 81, 93, 18.13.Endpoint Unquantization + const uint32_t TOTAL_ASTC_RANGES = 21; + extern const int g_astc_bise_range_table[TOTAL_ASTC_RANGES][3]; + + struct astc_quant_bin + { + uint8_t m_unquant; // unquantized value + uint8_t m_index; // sorted index + }; + + extern astc_quant_bin g_astc_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [ASTC encoded endpoint index] + + int astc_get_levels(int range); + bool astc_is_valid_endpoint_range(uint32_t range); + uint32_t unquant_astc_endpoint(uint32_t packed_bits, uint32_t packed_trits, uint32_t packed_quints, uint32_t range); + uint32_t unquant_astc_endpoint_val(uint32_t packed_val, uint32_t range); + + const uint8_t* get_anchor_indices(uint32_t subsets, uint32_t mode, uint32_t common_pattern, const uint8_t*& pPartition_pattern); + + // BC7 + const uint32_t BC7ENC_BLOCK_SIZE = 16; + + struct bc7_block + { + uint64_t m_qwords[2]; + }; + + struct bc7_optimization_results + { + uint32_t m_mode; + uint32_t m_partition; + uint8_t m_selectors[16]; + uint8_t m_alpha_selectors[16]; + color_quad_u8 m_low[3]; + color_quad_u8 m_high[3]; + uint32_t m_pbits[3][2]; + uint32_t m_index_selector; + uint32_t m_rotation; + }; + + extern const uint32_t g_bc7_weights1[2]; + extern const uint32_t g_bc7_weights2[4]; + extern const uint32_t g_bc7_weights3[8]; + extern const uint32_t g_bc7_weights4[16]; + extern const uint32_t g_astc_weights4[16]; + extern const uint32_t g_astc_weights5[32]; + extern const uint32_t g_astc_weights_3levels[3]; + extern const uint8_t g_bc7_partition1[16]; + extern const uint8_t g_bc7_partition2[64 * 16]; + extern const uint8_t g_bc7_partition3[64 * 16]; + extern const uint8_t g_bc7_table_anchor_index_second_subset[64]; + extern const uint8_t g_bc7_table_anchor_index_third_subset_1[64]; + extern const uint8_t g_bc7_table_anchor_index_third_subset_2[64]; + extern const uint8_t g_bc7_num_subsets[8]; + extern const uint8_t g_bc7_partition_bits[8]; + extern const uint8_t g_bc7_color_index_bitcount[8]; + extern const uint8_t g_bc7_mode_has_p_bits[8]; + extern const uint8_t g_bc7_mode_has_shared_p_bits[8]; + extern const uint8_t g_bc7_color_precision_table[8]; + extern const int8_t g_bc7_alpha_precision_table[8]; + extern const uint8_t g_bc7_alpha_index_bitcount[8]; + + inline bool get_bc7_mode_has_seperate_alpha_selectors(int mode) { return (mode == 4) || (mode == 5); } + inline int get_bc7_color_index_size(int mode, int index_selection_bit) { return g_bc7_color_index_bitcount[mode] + index_selection_bit; } + inline int get_bc7_alpha_index_size(int mode, int index_selection_bit) { return g_bc7_alpha_index_bitcount[mode] - index_selection_bit; } + + struct endpoint_err + { + uint16_t m_error; uint8_t m_lo; uint8_t m_hi; + }; + + extern endpoint_err g_bc7_mode_6_optimal_endpoints[256][2]; // [c][pbit] + const uint32_t 
BC7ENC_MODE_6_OPTIMAL_INDEX = 5; + + extern endpoint_err g_bc7_mode_5_optimal_endpoints[256]; // [c] + const uint32_t BC7ENC_MODE_5_OPTIMAL_INDEX = 1; + + // Packs a BC7 block from a high-level description. Handles all BC7 modes. + void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults); + + // Packs an ASTC block + // Constraints: Always 4x4, all subset CEM's must be equal, only tested with LDR CEM's. + bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t mode); + + void pack_astc_solid_block(void* pDst_block, const color32& color); + +#ifdef _DEBUG + int astc_compute_texel_partition(int seed, int x, int y, int z, int partitioncount, bool small_block); +#endif + + struct uastc_block + { + union + { + uint8_t m_bytes[16]; + uint32_t m_dwords[4]; + +#ifndef __EMSCRIPTEN__ + uint64_t m_qwords[2]; +#endif + }; + }; + + struct unpacked_uastc_block + { + astc_block_desc m_astc; + + uint32_t m_mode; + uint32_t m_common_pattern; + + color32 m_solid_color; + + bool m_bc1_hint0; + bool m_bc1_hint1; + + bool m_etc1_flip; + bool m_etc1_diff; + uint32_t m_etc1_inten0; + uint32_t m_etc1_inten1; + + uint32_t m_etc1_bias; + + uint32_t m_etc2_hints; + + uint32_t m_etc1_selector; + uint32_t m_etc1_r, m_etc1_g, m_etc1_b; + }; + + color32 apply_etc1_bias(const color32 &block_color, uint32_t bias, uint32_t limit, uint32_t subblock); + + struct decoder_etc_block; + struct eac_block; + + bool unpack_uastc(uint32_t mode, uint32_t common_pattern, const color32& solid_color, const astc_block_desc& astc, color32* pPixels, bool srgb); + bool unpack_uastc(const unpacked_uastc_block& unpacked_blk, color32* pPixels, bool srgb); + + bool unpack_uastc(const uastc_block& blk, color32* pPixels, bool srgb); + bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool undo_blue_contract, bool read_hints = true); + + bool transcode_uastc_to_astc(const uastc_block& src_blk, void* pDst); + + bool transcode_uastc_to_bc7(const unpacked_uastc_block& unpacked_src_blk, bc7_optimization_results& dst_blk); + bool transcode_uastc_to_bc7(const uastc_block& src_blk, bc7_optimization_results& dst_blk); + bool transcode_uastc_to_bc7(const uastc_block& src_blk, void* pDst); + + void transcode_uastc_to_etc1(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst); + bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst); + bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst, uint32_t channel); + + void transcode_uastc_to_etc2_eac_a8(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst); + bool transcode_uastc_to_etc2_rgba(const uastc_block& src_blk, void* pDst); + + // Packs 16 scalar values to BC4. Same PSNR as stb_dxt's BC4 encoder, around 13% faster. + void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride); + + void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb); + + enum + { + cEncodeBC1HighQuality = 1, + cEncodeBC1HigherQuality = 2, + cEncodeBC1UseSelectors = 4, + }; + void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags); + + // Alternate PCA-free encoder, around 15% faster, same (or slightly higher) avg. 
PSNR + void encode_bc1_alt(void* pDst, const uint8_t* pPixels, uint32_t flags); + + void transcode_uastc_to_bc1_hint0(const unpacked_uastc_block& unpacked_src_blk, void* pDst); + void transcode_uastc_to_bc1_hint1(const unpacked_uastc_block& unpacked_src_blk, const color32 block_pixels[4][4], void* pDst, bool high_quality); + + bool transcode_uastc_to_bc1(const uastc_block& src_blk, void* pDst, bool high_quality); + bool transcode_uastc_to_bc3(const uastc_block& src_blk, void* pDst, bool high_quality); + bool transcode_uastc_to_bc4(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0); + bool transcode_uastc_to_bc5(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1); + + bool transcode_uastc_to_etc2_eac_r11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0); + bool transcode_uastc_to_etc2_eac_rg11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1); + + bool transcode_uastc_to_pvrtc1_4_rgb(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality, bool from_alpha); + bool transcode_uastc_to_pvrtc1_4_rgba(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality); + + // uastc_init() MUST be called before using this module. + void uastc_init(); + +} // namespace basist diff --git a/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp b/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp index 6873a95d90..c79623bd57 100644 --- a/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp +++ b/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp @@ -80,7 +80,6 @@ struct ClipVertex btVector3 v; int id; //b2ContactID id; - //b2ContactID id; }; #define b2Dot(a, b) (a).dot(b) diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp index fec9b03213..4372489fa1 100644 --- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp +++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp @@ -43,7 +43,6 @@ void btMultiBodyJointMotor::finalizeMultiDof() unsigned int offset = 6 + (m_bodyA->getLink(m_linkA).m_dofOffset + linkDoF); // row 0: the lower bound - // row 0: the lower bound jacobianA(0)[offset] = 1; m_numDofsFinalized = m_jacSizeBoth; diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp index 25ddd539bf..5c20d2a0d4 100644 --- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp +++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp @@ -45,7 +45,6 @@ void btMultiBodySphericalJointMotor::finalizeMultiDof() unsigned int offset = 6 + (m_bodyA->getLink(m_linkA).m_dofOffset + linkDoF); // row 0: the lower bound - // row 0: the lower bound jacobianA(0)[offset] = 1; m_numDofsFinalized = m_jacSizeBoth; diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h new file mode 100644 index 0000000000..a64e4a1889 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_any_of.h @@ -0,0 +1,55 @@ +// Copyright 2009-2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <functional> +#include "parallel_reduce.h" + +namespace embree +{ + + template<typename Index, class UnaryPredicate> + __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) + { + bool ret = false; + +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) { + if (context.is_group_execution_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + context.cancel_group_execution(); + } + } + }); +#else + tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) { + if (tbb::task::self().is_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + tbb::task::self().cancel_group_execution(); + } + } + }); +#endif +#else + ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool { + bool localret = false; + for (auto i=r.begin(); i<r.end(); ++i) { + localret |= pred(i); + } + return localret; + }, + std::bit_or<bool>() + ); +#endif + + return ret; + } + +} // end namespace diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h new file mode 100644 index 0000000000..090ef164c2 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_filter.h @@ -0,0 +1,93 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename Ty, typename Index, typename Predicate> + inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) + { + Index j = first; + for (Index i=first; i<last; i++) + if (predicate(data[i])) + data[j++] = data[i]; + + return j; + } + + template<typename Ty, typename Index, typename Predicate> + inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) + { + /* sequential fallback */ + if (end-begin <= minStepSize) + return sequential_filter(data,begin,end,predicate); + + /* calculate number of tasks to use */ + enum { MAX_TASKS = 64 }; + const Index numThreads = TaskScheduler::threadCount(); + const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; + const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); + + /* filter blocks */ + Index nused[MAX_TASKS]; + Index nfree[MAX_TASKS]; + parallel_for(taskCount, [&](const Index taskIndex) + { + const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; + const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; + const Index i2 = sequential_filter(data,i0,i1,predicate); + nused[taskIndex] = i2-i0; + nfree[taskIndex] = i1-i2; + }); + + /* calculate offsets */ + Index sused=0; + Index sfree=0; + Index pfree[MAX_TASKS]; + for (Index i=0; i<taskCount; i++) + { + sused+=nused[i]; + Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree; + } + + /* return if we did not filter out any element */ + assert(sfree <= end-begin); + assert(sused <= end-begin); + if (sused == end-begin) + return end; + + /* otherwise we have to copy misplaced elements around */ + parallel_for(taskCount, [&](const Index taskIndex) + { + /* destination to write elements to */ + Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex]; 
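+        // dst is the first free slot of this task's block, i.e. just past the elements it kept in the filtering pass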
+ Index dst_end = min(dst+nfree[taskIndex],begin+sused); + if (dst_end <= dst) return; + + /* range of misplaced elements to copy to destination */ + Index r0 = pfree[taskIndex]; + Index r1 = r0+dst_end-dst; + + /* find range in misplaced elements in back to front order */ + Index k0=0; + for (Index i=taskCount-1; i>0; i--) + { + if (k0 > r1) break; + Index k1 = k0+nused[i]; + Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; + for (Index i=max(r0,k0); i<min(r1,k1); i++) { + Index isrc = src-i+k0-1; + assert(dst >= begin && dst < end); + assert(isrc >= begin && isrc < end); + data[dst++] = data[isrc]; + } + k0 = k1; + } + }); + + return begin+sused; + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h new file mode 100644 index 0000000000..645681ac63 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../tasking/taskscheduler.h" +#include "../sys/array.h" +#include "../math/math.h" +#include "../math/range.h" + +namespace embree +{ + /* parallel_for without range */ + template<typename Index, typename Func> + __forceinline void parallel_for( const Index N, const Func& func) + { +#if defined(TASKING_INTERNAL) + if (N) { + TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) { + assert(r.size() == 1); + func(r.begin()); + }); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + } + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range and granulatity */ + template<typename Index, typename Func> + __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) + { + assert(first <= last); +#if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { + func(range<Index>(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { + func(range<Index>(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + 
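+  // NOTE: in this build task cancellation calls abort() above instead of throwing, as the exception paths are disabled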
+#elif defined(TASKING_PPL) + concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { + func(range<Index>(i,i+1)); + }); + +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range */ + template<typename Index, typename Func> + __forceinline void parallel_for( const Index first, const Index last, const Func& func) + { + assert(first <= last); + parallel_for(first,last,(Index)1,func); + } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) + + template<typename Index, typename Func> + __forceinline void parallel_for_static( const Index N, const Func& func) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + + typedef tbb::affinity_partitioner affinity_partitioner; + + template<typename Index, typename Func> + __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + +#else + + template<typename Index, typename Func> + __forceinline void parallel_for_static( const Index N, const Func& func) + { + parallel_for(N,func); + } + + struct affinity_partitioner { + }; + + template<typename Index, typename Func> + __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) + { + parallel_for(N,func); + } + +#endif +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h new file mode 100644 index 0000000000..92c37a4a38 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for.h @@ -0,0 +1,149 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename ArrayArray, typename Func> + __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + size_t k=0; + for (size_t i=0; i!=array2.size(); ++i) { + const size_t N = array2[i]->size(); + if (N) func(array2[i],range<size_t>(0,N),k); + k+=N; + } + } + + class ParallelForForState + { + public: + + enum { MAX_TASKS = 64 }; + + __forceinline ParallelForForState () + : taskCount(0) {} + + template<typename ArrayArray> + __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { + init(array2,minStepSize); + } + + template<typename ArrayArray> + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + /* first calculate total number of elements */ + 
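+      /* (a null entry in array2 is treated as an empty array) */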
size_t N = 0; + for (size_t i=0; i<array2.size(); i++) { + N += array2[i] ? array2[i]->size() : 0; + } + this->N = N; + + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (N+minStepSize-1)/minStepSize; + taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); + + /* calculate start (i,j) for each task */ + size_t taskIndex = 0; + i0[taskIndex] = 0; + j0[taskIndex] = 0; + size_t k0 = (++taskIndex)*N/taskCount; + for (size_t i=0, k=0; taskIndex < taskCount; i++) + { + assert(i<array2.size()); + size_t j=0, M = array2[i] ? array2[i]->size() : 0; + while (j<M && k+M-j >= k0 && taskIndex < taskCount) { + assert(taskIndex<taskCount); + i0[taskIndex] = i; + j0[taskIndex] = j += k0-k; + k=k0; + k0 = (++taskIndex)*N/taskCount; + } + k+=M-j; + } + } + + __forceinline size_t size() const { + return N; + } + + public: + size_t i0[MAX_TASKS]; + size_t j0[MAX_TASKS]; + size_t taskCount; + size_t N; + }; + + template<typename ArrayArray, typename Func> + __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + ParallelForForState state(array2,minStepSize); + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; k<k1; i++) { + const size_t N = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); + k+=r1-r0; j0 = 0; + } + }); + } + + template<typename ArrayArray, typename Func> + __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) + { + parallel_for_for(array2,1,func); + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + ParallelForForState state(array2,minStepSize); + Value temp[ParallelForForState::MAX_TASKS]; + + for (size_t i=0; i<state.taskCount; i++) + temp[i] = identity; + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; k<k1; i++) { + const size_t N = array2[i] ? 
array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); + k+=r1-r0; j0 = 0; + } + }); + + Value ret = identity; + for (size_t i=0; i<state.taskCount; i++) + ret = reduction(ret,temp[i]); + return ret; + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_reduce(array2,1,identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h new file mode 100644 index 0000000000..b15b44a991 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for_for.h" +#include "parallel_prefix_sum.h" + +namespace embree +{ + template<typename Value> + struct ParallelForForPrefixSumState : public ParallelForForState + { + __forceinline ParallelForForPrefixSumState () {} + + template<typename ArrayArray> + __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) + : ParallelForForState(array2,minStepSize) {} + + ParallelPrefixSumState<Value> prefix_state; + }; + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; k<k1; i++) { + const size_t size = array2[i] ? 
array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i)); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.prefix_state.counts[i]; + state.prefix_state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; k<k1; i++) { + const size_t size = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.prefix_state.counts[i]; + state.prefix_state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h new file mode 100644 index 0000000000..15c098fe20 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_map.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /*! implementation of a key/value map with parallel construction */ + template<typename Key, typename Val> + class parallel_map + { + /* key/value pair to build the map */ + struct KeyValue + { + __forceinline KeyValue () {} + + __forceinline KeyValue (const Key key, const Val val) + : key(key), val(val) {} + + __forceinline operator Key() const { + return key; + } + + public: + Key key; + Val val; + }; + + public: + + /*! parallel map constructors */ + parallel_map () {} + + /*! 
construction from pair of vectors */ + template<typename KeyVector, typename ValVector> + parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } + + /*! initialized the parallel map from a vector with keys and values */ + template<typename KeyVector, typename ValVector> + void init(const KeyVector& keys, const ValVector& values) + { + /* reserve sufficient space for all data */ + assert(keys.size() == values.size()); + vec.resize(keys.size()); + + /* generate key/value pairs */ + parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + vec[i] = KeyValue((Key)keys[i],values[i]); + }); + + /* perform parallel radix sort of the key/value pairs */ + std::vector<KeyValue> temp(keys.size()); + radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size()); + } + + /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ + __forceinline const Val* lookup(const Key& key) const + { + typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return nullptr; + if (i->key != key) return nullptr; + return &i->val; + } + + /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ + __forceinline Val lookup(const Key& key, const Val& def) const + { + typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return def; + if (i->key != key) return def; + return i->val; + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector<KeyValue> vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h new file mode 100644 index 0000000000..a1cbdc8e04 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_partition.h @@ -0,0 +1,283 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" +#include "../math/range.h" + +namespace embree +{ + /* serial partitioning */ + template<typename T, typename V, typename IsLeft, typename Reduction_T> + __forceinline size_t serial_partitioning(T* array, + const size_t begin, + const size_t end, + V& leftReduction, + V& rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t) + { + T* l = array + begin; + T* r = array + end - 1; + + while(1) + { + /* *l < pivot */ + while (likely(l <= r && is_left(*l) )) + { + //prefetchw(l+4); // FIXME: enable? + reduction_t(leftReduction,*l); + ++l; + } + /* *r >= pivot) */ + while (likely(l <= r && !is_left(*r))) + { + //prefetchw(r-4); FIXME: enable? 
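+        // fold the right-staying element into rightReduction before moving the right cursor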
+ reduction_t(rightReduction,*r); + --r; + } + if (r<l) break; + + reduction_t(leftReduction ,*r); + reduction_t(rightReduction,*l); + xchg(*l,*r); + l++; r--; + } + + return l - array; + } + + template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> + class __aligned(64) parallel_partition_task + { + ALIGNED_CLASS_(64); + private: + + static const size_t MAX_TASKS = 64; + + T* array; + size_t N; + const IsLeft& is_left; + const Reduction_T& reduction_t; + const Reduction_V& reduction_v; + const Vi& identity; + + size_t numTasks; + __aligned(64) size_t counter_start[MAX_TASKS+1]; + __aligned(64) size_t counter_left[MAX_TASKS+1]; + __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS]; + __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; + __aligned(64) V leftReductions[MAX_TASKS]; + __aligned(64) V rightReductions[MAX_TASKS]; + + public: + + __forceinline parallel_partition_task(T* array, + const size_t N, + const Vi& identity, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + const size_t BLOCK_SIZE) + + : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), + numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} + + __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges) + { + size_t i = 0; + while(index >= (size_t)r[i].size()) + { + assert(i < numRanges); + index -= (size_t)r[i].size(); + i++; + } + return &r[i]; + } + + __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, + const size_t numRightMisplacedRanges, + const size_t startID, + const size_t endID) + { + size_t leftLocalIndex = startID; + size_t rightLocalIndex = startID; + const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); + const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); + + size_t l_left = l_range->size() - leftLocalIndex; + size_t r_left = r_range->size() - rightLocalIndex; + T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; + T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; + size_t size = endID - startID; + size_t items = min(size,min(l_left,r_left)); + + while (size) + { + if (unlikely(l_left == 0)) + { + l_range++; + l_left = l_range->size(); + l = &array[l_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + if (unlikely(r_left == 0)) + { + r_range++; + r_left = r_range->size(); + r = &array[r_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + size -= items; + l_left -= items; + r_left -= items; + + while(items) { + items--; + xchg(*l++,*r++); + } + } + } + + __forceinline size_t partition(V& leftReduction, V& rightReduction) + { + /* partition the individual ranges for each task */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*N/numTasks; + const size_t endID = (taskID+1)*N/numTasks; + V local_left(identity); + V local_right(identity); + const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); + counter_start[taskID] = startID; + counter_left [taskID] = mid-startID; + leftReductions[taskID] = local_left; + rightReductions[taskID] = local_right; + }); + counter_start[numTasks] = N; + counter_left[numTasks] = 0; + + /* finalize the reductions */ + for (size_t i=0; 
i<numTasks; i++) { + reduction_v(leftReduction,leftReductions[i]); + reduction_v(rightReduction,rightReductions[i]); + } + + /* calculate mid point for partitioning */ + size_t mid = counter_left[0]; + for (size_t i=1; i<numTasks; i++) + mid += counter_left[i]; + const range<ssize_t> globalLeft (0,mid); + const range<ssize_t> globalRight(mid,N); + + /* calculate all left and right ranges that are on the wrong global side */ + size_t numMisplacedRangesLeft = 0; + size_t numMisplacedRangesRight = 0; + size_t numMisplacedItemsLeft = 0; + size_t numMisplacedItemsRight = 0; + + for (size_t i=0; i<numTasks; i++) + { + const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]); + const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]); + const range<ssize_t> left_misplaced = globalLeft. intersect(right_range); + const range<ssize_t> right_misplaced = globalRight.intersect(left_range); + + if (!left_misplaced.empty()) + { + numMisplacedItemsLeft += left_misplaced.size(); + leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; + } + + if (!right_misplaced.empty()) + { + numMisplacedItemsRight += right_misplaced.size(); + rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; + } + } + assert( numMisplacedItemsLeft == numMisplacedItemsRight ); + + /* if no items are misplaced we are done */ + if (numMisplacedItemsLeft == 0) + return mid; + + /* otherwise we copy the items to the right place in parallel */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; + const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; + swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); + }); + + return mid; + } + }; + + template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE = 128) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < BLOCK_SIZE)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; + std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE, + size_t PARALLEL_THRESHOLD) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < PARALLEL_THRESHOLD)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; + std::unique_ptr<partition_task> p(new 
partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + + template<typename T, typename IsLeft> + inline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const IsLeft& is_left, + size_t BLOCK_SIZE = 128) + { + size_t leftReduction = 0; + size_t rightReduction = 0; + return parallel_partitioning( + array,begin,end,0,leftReduction,rightReduction,is_left, + [] (size_t& t,const T& ref) { }, + [] (size_t& t0,size_t& t1) { }, + BLOCK_SIZE); + } + +} diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h new file mode 100644 index 0000000000..208bb4e480 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename Value> + struct ParallelPrefixSumState + { + enum { MAX_TASKS = 64 }; + Value counts[MAX_TASKS]; + Value sums [MAX_TASKS]; + }; + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; + const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); + + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; + const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; + state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.counts[i]; + state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + /*! 
parallel calculation of prefix sums */ + template<typename SrcArray, typename DstArray, typename Value, typename Add> + __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) + { + /* perform single threaded prefix operation for small N */ + if (N < SINGLE_THREAD_THRESHOLD) + { + Value sum=identity; + for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; + return sum; + } + + /* perform parallel prefix operation for large N */ + else + { + ParallelPrefixSumState<Value> state; + + /* initial run just sets up start values for subtasks */ + parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); + return s; + + }, add); + + /* final run calculates prefix sum */ + return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i<r.end(); i++) { + dst[i] = add(sum,s); + s = add(s,src[i]); + } + return s; + + }, add); + } + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h new file mode 100644 index 0000000000..8271372ea4 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h @@ -0,0 +1,150 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range<Index>(first,last)); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range<Index>(first,last)); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + const Index maxTasks = 512; + const Index threadCount = (Index) TaskScheduler::threadCount(); + taskCount = min(taskCount,threadCount,maxTasks); + + /* parallel invokation of all tasks */ + dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack + parallel_for(taskCount, [&](const Index taskIndex) { + const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; + const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; + values[taskIndex] = func(range<Index>(k0,k1)); + }); + + /* perform reduction over all tasks */ + Value v = identity; + for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]); + return v; + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { +#if defined(TASKING_INTERNAL) + + /* fast path for small number of iterations */ + Index taskCount = 
(last-first+minStepSize-1)/minStepSize; + if (likely(taskCount == 1)) { + return func(range<Index>(first,last)); + } + return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction,context); + // -- GODOT start -- + // if (context.is_group_execution_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction); + // -- GODOT start -- + // if (tbb::task::self().is_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #endif +#else // TASKING_PPL + struct AlignedValue + { + char storage[__alignof(Value)+sizeof(Value)]; + static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; + Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } + const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } + AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } + AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } + AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; + AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + operator Value() const { return *getValuePtr(); } + }; + + struct Iterator_Index + { + Index v; + typedef std::forward_iterator_tag iterator_category; + typedef AlignedValue value_type; + typedef Index difference_type; + typedef Index distance_type; + typedef AlignedValue* pointer; + typedef AlignedValue& reference; + __forceinline Iterator_Index() {} + __forceinline Iterator_Index(Index v) : v(v) {} + __forceinline bool operator== (Iterator_Index other) { return v == other.v; } + __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } + __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } + __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } + }; + + auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { + assert(begin.v < end.v); + return reduction(start, func(range<Index>(begin.v, end.v))); + }; + const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); + return v; +#endif + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + if (likely(last-first < parallel_threshold)) { + return func(range<Index>(first,last)); + } else { + return parallel_reduce(first,last,minStepSize,identity,func,reduction); + } + } + + 
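+  // Illustrative usage sketch (not from upstream): summing a vector with the
+  // (first, last, minStepSize, identity, func, reduction) overload defined above.
+  // `values` is a hypothetical std::vector<float>.
+  //
+  //   float sum = parallel_reduce(size_t(0), values.size(), size_t(1024), 0.0f,
+  //       [&](const range<size_t>& r) { float s = 0.0f; for (size_t i=r.begin(); i<r.end(); i++) s += values[i]; return s; },
+  //       [](float a, float b) { return a + b; });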
template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + auto funcr = [&] ( const range<Index> r ) { + Value v = identity; + for (Index i=r.begin(); i<r.end(); i++) + v = reduction(v,func(i)); + return v; + }; + return parallel_reduce(first,last,Index(1),identity,funcr,reduction); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h new file mode 100644 index 0000000000..7eae577457 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_set.h @@ -0,0 +1,52 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /* implementation of a set of values with parallel construction */ + template<typename T> + class parallel_set + { + public: + + /*! default constructor for the parallel set */ + parallel_set () {} + + /*! construction from vector */ + template<typename Vector> + parallel_set (const Vector& in) { init(in); } + + /*! initialized the parallel set from a vector */ + template<typename Vector> + void init(const Vector& in) + { + /* copy data to internal vector */ + vec.resize(in.size()); + parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + vec[i] = in[i]; + }); + + /* sort the data */ + std::vector<T> temp(in.size()); + radix_sort<T>(vec.data(),temp.data(),vec.size()); + } + + /*! tests if some element is in the set */ + __forceinline bool lookup(const T& elt) const { + return std::binary_search(vec.begin(), vec.end(), elt); + } + + /*! 
clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector<T> vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h new file mode 100644 index 0000000000..30e56c2bfc --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_sort.h @@ -0,0 +1,454 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../simd/simd.h" +#include "parallel_for.h" +#include <algorithm> + +namespace embree +{ + template<class T> + __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i<length;++i) + { + T v = array[i]; + size_t j = i; + while(j > 0 && v < array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template<class T> + __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i<length;++i) + { + T v = array[i]; + size_t j = i; + while(j > 0 && v > array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template<class T> + void quicksort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_ascending(t, begin, pivot); + quicksort_ascending(t, pivot + 1, end); + } + } + + template<class T> + void quicksort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_decending(t, begin, pivot); + quicksort_decending(t, pivot + 1, end); + } + } + + + template<class T, ssize_t THRESHOLD> + void quicksort_insertionsort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_ascending<T>(&t[begin],size); + } + else + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); + quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); + } + } + } + + + template<class T, ssize_t THRESHOLD> + void quicksort_insertionsort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_decending<T>(&t[begin],size); + } + else + { + + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T 
temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot); + quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end); + } + } + } + + template<typename T> + static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) + { + static const unsigned int BITS = 8; + static const unsigned int BUCKETS = (1 << BITS); + static const unsigned int CMP_SORT_THRESHOLD = 16; + + __aligned(64) unsigned int count[BUCKETS]; + + /* clear buckets */ + for (size_t i=0;i<BUCKETS;i++) count[i] = 0; + + /* count buckets */ +#if defined(__INTEL_COMPILER) +#pragma nounroll +#endif + for (size_t i=0;i<num;i++) + count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++; + + /* prefix sums */ + __aligned(64) unsigned int head[BUCKETS]; + __aligned(64) unsigned int tail[BUCKETS]; + + head[0] = 0; + for (size_t i=1; i<BUCKETS; i++) + head[i] = head[i-1] + count[i-1]; + + for (size_t i=0; i<BUCKETS-1; i++) + tail[i] = head[i+1]; + + tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1]; + + assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]); + assert(tail[BUCKETS-1] == num); + + /* in-place swap */ + for (size_t i=0;i<BUCKETS;i++) + { + /* process bucket */ + while(head[i] < tail[i]) + { + T v = morton[head[i]]; + while(1) + { + const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); + if (b == i) break; + std::swap(v,morton[head[b]++]); + } + assert((unsigned(v) >> shift & (BUCKETS-1)) == i); + morton[head[i]++] = v; + } + } + if (shift == 0) return; + + size_t offset = 0; + for (size_t i=0;i<BUCKETS;i++) + if (count[i]) + { + + for (size_t j=offset;j<offset+count[i]-1;j++) + assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i); + + if (unlikely(count[i] < CMP_SORT_THRESHOLD)) + insertionsort_ascending(morton + offset, count[i]); + else + radixsort32(morton + offset, count[i], shift-BITS); + + for (size_t j=offset;j<offset+count[i]-1;j++) + assert(morton[j] <= morton[j+1]); + + offset += count[i]; + } + } + + template<typename Ty, typename Key> + class ParallelRadixSort + { + static const size_t MAX_TASKS = 64; + static const size_t BITS = 8; + static const size_t BUCKETS = (1 << BITS); + typedef unsigned int TyRadixCount[BUCKETS]; + + template<typename T> + static bool compare(const T& v0, const T& v1) { + return (Key)v0 < (Key)v1; + } + + private: + ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement + ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement + + + public: + ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) + : radixCount(nullptr), src(src), tmp(tmp), N(N) {} + + void sort(const size_t blockSize) + { + assert(blockSize > 0); + + /* perform single threaded sort for small N */ + if (N<=blockSize) // handles also special case of 0! 
+ { + /* do inplace sort inside destination array */ + std::sort(src,src+N,compare<Ty>); + } + + /* perform parallel sort for large N */ + else + { + const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); + tbbRadixSort(numThreads); + } + } + + ~ParallelRadixSort() + { + alignedFree(radixCount); + radixCount = nullptr; + } + + private: + + void tbbRadixIteration0(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* count how many items go into the buckets */ + for (size_t i=0; i<BUCKETS; i++) + radixCount[threadIndex][i] = 0; + + /* iterate over src array and count buckets */ + unsigned int * __restrict const count = radixCount[threadIndex]; +#if defined(__INTEL_COMPILER) +#pragma nounroll +#endif + for (size_t i=startID; i<endID; i++) { +#if defined(__64BIT__) + const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; +#else + const Key index = ((Key)src[i] >> shift) & mask; +#endif + count[index]++; + } + } + + void tbbRadixIteration1(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* calculate total number of items for each bucket */ + __aligned(64) unsigned int total[BUCKETS]; + /* + for (size_t i=0; i<BUCKETS; i++) + total[i] = 0; + */ + for (size_t i=0; i<BUCKETS; i+=VSIZEX) + vintx::store(&total[i], zero); + + for (size_t i=0; i<threadCount; i++) + { + /* + for (size_t j=0; j<BUCKETS; j++) + total[j] += radixCount[i][j]; + */ + for (size_t j=0; j<BUCKETS; j+=VSIZEX) + vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j])); + } + + /* calculate start offset of each bucket */ + __aligned(64) unsigned int offset[BUCKETS]; + offset[0] = 0; + for (size_t i=1; i<BUCKETS; i++) + offset[i] = offset[i-1] + total[i-1]; + + /* calculate start offset of each bucket for this thread */ + for (size_t i=0; i<threadIndex; i++) + { + /* + for (size_t j=0; j<BUCKETS; j++) + offset[j] += radixCount[i][j]; + */ + for (size_t j=0; j<BUCKETS; j+=VSIZEX) + vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j])); + } + + /* copy items into their buckets */ +#if defined(__INTEL_COMPILER) +#pragma nounroll +#endif + for (size_t i=startID; i<endID; i++) { + const Ty elt = src[i]; +#if defined(__64BIT__) + const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; +#else + const size_t index = ((Key)src[i] >> shift) & mask; +#endif + dst[offset[index]++] = elt; + } + } + + void tbbRadixIteration(const Key shift, const bool last, + const Ty* __restrict src, Ty* __restrict dst, + const size_t numTasks) + { + affinity_partitioner ap; + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); + } + + void tbbRadixSort(const size_t numTasks) + { + radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); + + if (sizeof(Key) == 
sizeof(uint32_t)) { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,1,tmp,src,numTasks); + } + else if (sizeof(Key) == sizeof(uint64_t)) + { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,0,tmp,src,numTasks); + tbbRadixIteration(4*BITS,0,src,tmp,numTasks); + tbbRadixIteration(5*BITS,0,tmp,src,numTasks); + tbbRadixIteration(6*BITS,0,src,tmp,numTasks); + tbbRadixIteration(7*BITS,1,tmp,src,numTasks); + } + } + + private: + TyRadixCount* radixCount; + Ty* const src; + Ty* const tmp; + const size_t N; + }; + + template<typename Ty> + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); + } + + template<typename Ty, typename Key> + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); + } + + template<typename Ty> + void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); + } + + template<typename Ty> + void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); + } +} diff --git a/thirdparty/embree/common/lexers/parsestream.h b/thirdparty/embree/common/lexers/parsestream.h new file mode 100644 index 0000000000..f65a52cb47 --- /dev/null +++ b/thirdparty/embree/common/lexers/parsestream.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stringstream.h" +#include "../sys/filename.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/col3.h" +#include "../math/color.h" + +namespace embree +{ + /*! 
helper class for simple command line parsing */ + class ParseStream : public Stream<std::string> + { + public: + ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {} + + ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false) + : cin(new StringStream(cin,seps,endl,multiLine)) {} + + public: + ParseLocation location() { return cin->loc(); } + std::string next() { return cin->get(); } + + void force(const std::string& next) { + std::string token = getString(); + if (token != next) + THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); + } + + std::string getString() { + return get(); + } + + FileName getFileName() { + return FileName(get()); + } + + int getInt () { + return atoi(get().c_str()); + } + + Vec2i getVec2i() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + return Vec2i(x,y); + } + + Vec3ia getVec3ia() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + int z = atoi(get().c_str()); + return Vec3ia(x,y,z); + } + + float getFloat() { + return (float)atof(get().c_str()); + } + + Vec2f getVec2f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + return Vec2f(x,y); + } + + Vec3f getVec3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3f(x,y,z); + } + + Vec3fa getVec3fa() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3fa(x,y,z); + } + + Col3f getCol3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Col3f(x,y,z); + } + + Color getColor() { + float r = (float)atof(get().c_str()); + float g = (float)atof(get().c_str()); + float b = (float)atof(get().c_str()); + return Color(r,g,b); + } + + private: + Ref<Stream<std::string> > cin; + }; +} diff --git a/thirdparty/embree/common/lexers/stream.h b/thirdparty/embree/common/lexers/stream.h new file mode 100644 index 0000000000..a40c15f8eb --- /dev/null +++ b/thirdparty/embree/common/lexers/stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/ref.h" +#include "../sys/filename.h" +#include "../sys/string.h" + +#include <vector> +#include <iostream> +#include <cstdio> +#include <string.h> + +namespace embree +{ + /*! stores the location of a stream element in the source */ + class ParseLocation + { + public: + ParseLocation () : lineNumber(-1), colNumber(-1) {} + ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) + : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} + + std::string str() const + { + std::string str = "unknown"; + if (fileName) str = *fileName; + if (lineNumber >= 0) str += " line " + toString(lineNumber); + if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); + return str; + } + + private: + std::shared_ptr<std::string> fileName; /// name of the file (or stream) the token is from + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + }; + + /*! 
a stream class templated over the stream elements */ + template<typename T> class Stream : public RefCount + { + enum { BUF_SIZE = 1024 }; + + private: + virtual T next() = 0; + virtual ParseLocation location() = 0; + __forceinline std::pair<T,ParseLocation> nextHelper() { + ParseLocation l = location(); + T v = next(); + return std::pair<T,ParseLocation>(v,l); + } + __forceinline void push_back(const std::pair<T,ParseLocation>& v) { + if (past+future == BUF_SIZE) pop_front(); + size_t end = (start+past+future++)%BUF_SIZE; + buffer[end] = v; + } + __forceinline void pop_front() { + if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); + start = (start+1)%BUF_SIZE; past--; + } + public: + Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} + virtual ~Stream() {} + + public: + + const ParseLocation& loc() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].second; + } + T get() { + if (future == 0) push_back(nextHelper()); + T t = buffer[(start+past)%BUF_SIZE].first; + past++; future--; + return t; + } + const T& peek() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].first; + } + const T& unget(size_t n = 1) { + if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); + past -= n; future += n; + return peek(); + } + void drop() { + if (future == 0) push_back(nextHelper()); + past++; future--; + } + private: + size_t start,past,future; + std::vector<std::pair<T,ParseLocation> > buffer; + }; + + /*! warps an iostream stream */ + class StdStream : public Stream<int> + { + public: + StdStream (std::istream& cin, const std::string& name = "std::stream") + : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} + ~StdStream() {} + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + int next() { + int c = cin.get(); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + private: + std::istream& cin; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; + + /*! creates a stream from a file */ + class FileStream : public Stream<int> + { + public: + + FileStream (FILE* file, const std::string& name = "file") + : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} + + FileStream (const FileName& fileName) + : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str()))) + { + file = fopen(fileName.c_str(),"r"); + if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); + } + ~FileStream() { if (file) fclose(file); } + + public: + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + + int next() { + int c = fgetc(file); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + FILE* file; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; + + /*! 
creates a stream from a string */ + class StrStream : public Stream<int> + { + public: + + StrStream (const char* str) + : str(str), lineNumber(1), colNumber(0), charNumber(0) {} + + public: + ParseLocation location() { + return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber); + } + + int next() { + int c = str[charNumber]; + if (c == 0) return EOF; + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + const char* str; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + }; + + /*! creates a character stream from a command line */ + class CommandLineStream : public Stream<int> + { + public: + CommandLineStream (int argc, char** argv, const std::string& name = "command line") + : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) + { + if (argc > 0) { + for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; + charNumber++; + } + for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]); + } + ~CommandLineStream() {} + public: + ParseLocation location() { + return ParseLocation(name,0,charNumber,charNumber); + } + int next() { + if (i == args.size()) return EOF; + if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; } + charNumber++; + return args[i][j++]; + } + private: + size_t i,j; + std::vector<std::string> args; + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; +} diff --git a/thirdparty/embree/common/lexers/streamfilters.h b/thirdparty/embree/common/lexers/streamfilters.h new file mode 100644 index 0000000000..3592b77b03 --- /dev/null +++ b/thirdparty/embree/common/lexers/streamfilters.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /* removes all line comments from a stream */ + class LineCommentFilter : public Stream<int> + { + public: + LineCommentFilter (const FileName& fileName, const std::string& lineComment) + : cin(new FileStream(fileName)), lineComment(lineComment) {} + LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment) + : cin(cin), lineComment(lineComment) {} + + ParseLocation location() { return cin->loc(); } + + int next() + { + /* look if the line comment starts here */ + for (size_t j=0; j<lineComment.size(); j++) { + if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; } + cin->get(); + } + /* eat all characters until the end of the line (or file) */ + while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); + + not_found: + return cin->get(); + } + + private: + Ref<Stream<int> > cin; + std::string lineComment; + }; +} diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp new file mode 100644 index 0000000000..a037869506 --- /dev/null +++ b/thirdparty/embree/common/lexers/stringstream.cpp @@ -0,0 +1,51 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stringstream.h" + +namespace embree +{ + static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = 
false; + for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; + } + + /* simple tokenizer */ + StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine) + : cin(cin), endl(endl), multiLine(multiLine) + { + createCharMap(isSepMap,seps); + createCharMap(isValidCharMap,stringChars); + } + + std::string StringStream::next() + { + /* skip separators */ + while (cin->peek() != EOF) { + if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } + if (multiLine && cin->peek() == '\\') { + cin->drop(); + if (cin->peek() == '\n') { cin->drop(); continue; } + cin->unget(); + } + if (!isSeparator(cin->peek())) break; + cin->drop(); + } + + /* parse everything until the next separator */ + std::vector<char> str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); + // -- GODOT start -- + // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); + if (!isValidChar(c)) abort(); + // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); + return std::string(str.data()); + } +} diff --git a/thirdparty/embree/common/lexers/stringstream.h b/thirdparty/embree/common/lexers/stringstream.h new file mode 100644 index 0000000000..6d9c27e3cd --- /dev/null +++ b/thirdparty/embree/common/lexers/stringstream.h @@ -0,0 +1,29 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /*! simple tokenizer that produces a string stream */ + class StringStream : public Stream<std::string> + { + public: + StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false); + public: + ParseLocation location() { return cin->loc(); } + std::string next(); + private: + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } + private: + Ref<Stream<int> > cin; /*! source character stream */ + bool isSepMap[256]; /*! map for fast classification of separators */ + bool isValidCharMap[256]; /*! map for valid characters */ + std::string endl; /*! the token of the end of line */ + bool multiLine; /*! 
whether to parse lines wrapped with \ */ + }; +} diff --git a/thirdparty/embree/common/lexers/tokenstream.cpp b/thirdparty/embree/common/lexers/tokenstream.cpp new file mode 100644 index 0000000000..6ed6f2045a --- /dev/null +++ b/thirdparty/embree/common/lexers/tokenstream.cpp @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "tokenstream.h" +#include "../math/math.h" + +namespace embree +{ + /* shorthands for common sets of characters */ + const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; + const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const std::string TokenStream::numbers = "0123456789"; + const std::string TokenStream::separators = "\n\t\r "; + const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; + } + + /* build full tokenizer that takes list of valid characters and keywords */ + TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector<std::string>& symbols) //< symbols + : cin(cin), symbols(symbols) + { + createCharMap(isAlphaMap,alpha); + createCharMap(isSepMap,seps); + createCharMap(isStringCharMap,stringChars); + } + + bool TokenStream::decDigits(std::string& str_o) + { + bool ok = false; + std::string str; + if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::decDigits1(std::string& str_o) + { + bool ok = false; + std::string str; + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; else cin->unget(str.size()); + return ok; + } + + bool TokenStream::trySymbol(const std::string& symbol) + { + size_t pos = 0; + while (pos < symbol.size()) { + if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } + cin->drop(); pos++; + } + return true; + } + + bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) + { + for (size_t i=0; i<symbols.size(); i++) { + if (!trySymbol(symbols[i])) continue; + token = Token(symbols[i],Token::TY_SYMBOL,loc); + return true; + } + return false; + } + + bool TokenStream::tryFloat(Token& token, const ParseLocation& loc) + { + bool ok = false; + std::string str; + if (trySymbol("nan")) { + token = Token(float(nan)); + return true; + } + if (trySymbol("+inf")) { + token = Token(float(pos_inf)); + return true; + } + if (trySymbol("-inf")) { + token = Token(float(neg_inf)); + return true; + } + + if (decDigits(str)) + { + if (cin->peek() == '.') { + str += (char)cin->get(); + decDigits(str); + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1.[2]E2 + } + else ok = true; // 1.[2] + } + else if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1E2 + } + } + else + { + if (cin->peek() == '.') { + str += (char)cin->get(); + if (decDigits(str)) { + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += 
(char)cin->get(); + if (decDigits(str)) ok = true; // .3E2 + } + else ok = true; // .3 + } + } + } + if (ok) { + token = Token((float)atof(str.c_str()),loc); + } + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { + std::string str; + if (decDigits(str)) { + token = Token(atoi(str.c_str()),loc); + return true; + } + return false; + } + + bool TokenStream::tryString(Token& token, const ParseLocation& loc) + { + std::string str; + if (cin->peek() != '\"') return false; + cin->drop(); + while (cin->peek() != '\"') { + const int c = cin->get(); + if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); + str += (char)c; + } + cin->drop(); + token = Token(str,Token::TY_STRING,loc); + return true; + } + + bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) + { + std::string str; + if (!isAlpha(cin->peek())) return false; + str += (char)cin->get(); + while (isAlphaNum(cin->peek())) str += (char)cin->get(); + token = Token(str,Token::TY_IDENTIFIER,loc); + return true; + } + + void TokenStream::skipSeparators() + { + /* skip separators */ + while (cin->peek() != EOF && isSeparator(cin->peek())) + cin->drop(); + } + + Token TokenStream::next() + { + Token token; + skipSeparators(); + ParseLocation loc = cin->loc(); + if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ + if (tryFloat (token,loc)) return token; /**< try to parse float */ + if (tryInt (token,loc)) return token; /**< try to parse integer */ + if (tryString (token,loc)) return token; /**< try to parse string */ + if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ + if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ + return Token((char)cin->get(),loc); /**< return invalid character token */ + } +} diff --git a/thirdparty/embree/common/lexers/tokenstream.h b/thirdparty/embree/common/lexers/tokenstream.h new file mode 100644 index 0000000000..6e49dd0b39 --- /dev/null +++ b/thirdparty/embree/common/lexers/tokenstream.h @@ -0,0 +1,164 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" +#include <string> +#include <vector> + +namespace embree +{ + /*! 
token class */ + class Token + { + public: + + enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; + + Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} + Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} + Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} + Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} + Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} + + static Token Eof() { return Token(); } + static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } + static Token Str(std::string str) { return Token(str,TY_STRING); } + static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } + + char Char() const { + if (ty == TY_CHAR) return c; + THROW_RUNTIME_ERROR(loc.str()+": character expected"); + } + + int Int() const { + if (ty == TY_INT) return i; + THROW_RUNTIME_ERROR(loc.str()+": integer expected"); + } + + float Float(bool cast = true) const { + if (ty == TY_FLOAT) return f; + if (ty == TY_INT && cast) return (float)i; + THROW_RUNTIME_ERROR(loc.str()+": float expected"); + } + + std::string Identifier() const { + if (ty == TY_IDENTIFIER) return str; + THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); + } + + std::string String() const { + if (ty == TY_STRING) return str; + THROW_RUNTIME_ERROR(loc.str()+": string expected"); + } + + std::string Symbol() const { + if (ty == TY_SYMBOL) return str; + THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); + } + + const ParseLocation& Location() const { return loc; } + + friend bool operator==(const Token& a, const Token& b) + { + if (a.ty != b.ty) return false; + if (a.ty == TY_CHAR) return a.c == b.c; + if (a.ty == TY_INT) return a.i == b.i; + if (a.ty == TY_FLOAT) return a.f == b.f; + if (a.ty == TY_IDENTIFIER) return a.str == b.str; + if (a.ty == TY_STRING) return a.str == b.str; + if (a.ty == TY_SYMBOL) return a.str == b.str; + return true; + } + + friend bool operator!=(const Token& a, const Token& b) { + return !(a == b); + } + + friend bool operator <( const Token& a, const Token& b ) { + if (a.ty != b.ty) return (int)a.ty < (int)b.ty; + if (a.ty == TY_CHAR) return a.c < b.c; + if (a.ty == TY_INT) return a.i < b.i; + if (a.ty == TY_FLOAT) return a.f < b.f; + if (a.ty == TY_IDENTIFIER) return a.str < b.str; + if (a.ty == TY_STRING) return a.str < b.str; + if (a.ty == TY_SYMBOL) return a.str < b.str; + return false; + } + + friend std::ostream& operator<<(std::ostream& cout, const Token& t) + { + if (t.ty == TY_EOF) return cout << "eof"; + if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; + if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; + if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; + if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; + if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; + if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; + return cout << "unknown"; + } + + private: + Type ty; //< the type of the token + union { + char c; //< data for char tokens + int i; //< data for int tokens + float f; //< data for float tokens + }; + std::string str; //< data for string and identifier tokens + ParseLocation loc; //< the location the token is from + }; + + /*! 
build full tokenizer that takes list of valid characters and keywords */ + class TokenStream : public Stream<Token> + { + public: + + /*! shorthands for common sets of characters */ + static const std::string alpha; + static const std::string ALPHA; + static const std::string numbers; + static const std::string separators; + static const std::string stringChars; + + public: + TokenStream(const Ref<Stream<int> >& cin, + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols + public: + ParseLocation location() { return cin->loc(); } + Token next(); + bool trySymbol(const std::string& symbol); + + private: + void skipSeparators(); + bool decDigits(std::string& str); + bool decDigits1(std::string& str); + bool trySymbols(Token& token, const ParseLocation& loc); + bool tryFloat(Token& token, const ParseLocation& loc); + bool tryInt(Token& token, const ParseLocation& loc); + bool tryString(Token& token, const ParseLocation& loc); + bool tryIdentifier(Token& token, const ParseLocation& loc); + + Ref<Stream<int> > cin; + bool isSepMap[256]; + bool isAlphaMap[256]; + bool isStringCharMap[256]; + std::vector<std::string> symbols; + + /*! checks if a character is a separator */ + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + + /*! checks if a character is a number */ + __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } + + /*! checks if a character is valid inside a string */ + __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } + + /*! checks if a character is legal for an identifier */ + __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } + __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } + }; +} diff --git a/thirdparty/embree/common/math/affinespace.h b/thirdparty/embree/common/math/affinespace.h new file mode 100644 index 0000000000..9d4a0f0846 --- /dev/null +++ b/thirdparty/embree/common/math/affinespace.h @@ -0,0 +1,361 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linearspace2.h" +#include "linearspace3.h" +#include "quaternion.h" +#include "bbox.h" +#include "vec4.h" + +namespace embree +{ + #define VectorT typename L::Vector + #define ScalarT typename L::Vector::Scalar + + //////////////////////////////////////////////////////////////////////////////// + // Affine Space + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> + struct AffineSpaceT + { + L l; /*< linear part of affine space */ + VectorT p; /*< affine part of affine space */ + + //////////////////////////////////////////////////////////////////////////////// + // Constructors, Assignment, Cast, Copy Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT ( ) { } + __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } + __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } + __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } + + __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} + __forceinline AffineSpaceT( 
const L& l, const VectorT& p ) : l(l), p(p) {} + + template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {} + + //////////////////////////////////////////////////////////////////////////////// + // Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} + __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} + + /*! return matrix for scaling */ + static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } + + /*! return matrix for translation */ + static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } + + /*! return matrix for rotation, only in 2D */ + static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } + + /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ + static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } + + /*! return matrix for rotation around arbitrary axis and point, only in 3D */ + static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } + + /*! return matrix for looking at given point, only in 3D */ + static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { + VectorT Z = normalize(point-eye); + VectorT U = normalize(cross(up,Z)); + VectorT V = normalize(cross(Z,U)); + return AffineSpaceT(L(U,V,Z),eye); + } + + }; + + // template specialization to get correct identity matrix for type AffineSpace3fa + template<> + __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); } + template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); } + template<typename L> __forceinline AffineSpaceT<L> rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); } + + template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); } + template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT & b ) { return a * rcp(b); } + + 
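// Explanatory note: rcp() above inverts the transform: if `a` maps x to
// a.l*x + a.p, then rcp(a) maps y to a.l^-1*y - a.l^-1*a.p, so `a / b`
// composes `a` with the inverse of `b`. The compound assignments below simply
// forward to these binary operators. Illustrative sketch (hypothetical usage,
// not part of the upstream sources):
//   AffineSpace3fa M = AffineSpace3fa::translate(Vec3fa(1.f,2.f,3.f));
//   Vec3fa q = xfmPoint(M, Vec3fa(0.f,0.f,0.f));   // q == (1,2,3)
//   Vec3fa p = xfmPoint(rcp(M), q);                 // p == (0,0,0)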
template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; } + template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a * b; } + template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; } + template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a / b; } + + template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } + template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); } + template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); } + + __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) + { + BBox3fa dst = empty; + const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); + const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); + const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); + const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); + const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); + const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); + const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); + const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); + return dst; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; } + template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) { + return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) { + return cout << "{ l = " << m.l << ", p = " << m.p << " }"; + } + + //////////////////////////////////////////////////////////////////////////////// + // Template Instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef AffineSpaceT<LinearSpace2f> AffineSpace2f; + typedef AffineSpaceT<LinearSpace3f> AffineSpace3f; + typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa; + typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx; + typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff; + typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f; + + 
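// Explanatory note: the aliases below are the SIMD variants of the affine
// space, with 4/8/16-wide float lanes per component for transforming packets
// of points. The Vec4-based AffineSpace3vff aliases additionally carry a
// quaternion in the .w lanes, which the slerp()/quaternionDecomposition()
// helpers further down interpret as a rotation interpolated separately from
// the scale/skew/translation part.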
template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>> AffineSpace3vf4; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>> AffineSpace3vf8; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16; + + template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>> AffineSpace3vfa4; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>> AffineSpace3vfa8; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template<typename T, typename R> + __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0, + const AffineSpaceT<T>& M1, + const R& t) + { + return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); + } + + // slerp interprets the 16 floats of the matrix M = D * R * S as components of + // three matrizes (D, R, S) that are interpolated individually. + template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>> + slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0, + const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1, + const T& t) + { + QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + QuaternionT<T> q = slerp(q0, q1, t); + + AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t); + AffineSpaceT<LinearSpace3<Vec3<T>>> D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q); + return D * R * S; + } + + // this is a specialized version for Vec3fa because that does + // not play along nicely with the other templated Vec3/Vec4 types + __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, + const AffineSpace3ff& M1, + const float& t) + { + Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + Quaternion3f q = slerp(q0, q1, t); + + AffineSpace3fa S = lerp(M0, M1, t); + AffineSpace3fa D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * S; + } + + __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) + { + // compute affine transform from quaternion decomposition + Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + AffineSpace3fa M = qd; + AffineSpace3fa D(one); + D.p.x = M.l.vx.y; + D.p.y = M.l.vx.z; + D.p.z = M.l.vy.z; + M.l.vx.y = 0; + M.l.vx.z = 0; + M.l.vy.z = 0; + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * M; + } + + __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) + { + q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + S = qd; + T.x = qd.l.vx.y; + T.y = qd.l.vx.z; + T.z = qd.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + } + + __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) + { + AffineSpace3ff M = S; + M.l.vx.w = q.i; + M.l.vy.w = q.j; + M.l.vz.w = q.k; + M.p.w = q.r; + M.l.vx.y = T.x; + M.l.vx.z = T.y; + M.l.vy.z = T.z; + return M; + } + + struct __aligned(16) QuaternionDecomposition + { + 
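// Explanatory note: the field defaults below encode the identity transform
// (unit scale, zero skew/shift/translation, identity quaternion). The
// quaternionDecomposition(AffineSpace3ff) helper that follows fills this
// struct from the packed representation used above: scale on the diagonal of
// the linear part, skew/shift/translation in its off-diagonal entries and in
// p.xyz, and the quaternion in the .w components.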
float scale_x = 1.f; + float scale_y = 1.f; + float scale_z = 1.f; + float skew_xy = 0.f; + float skew_xz = 0.f; + float skew_yz = 0.f; + float shift_x = 0.f; + float shift_y = 0.f; + float shift_z = 0.f; + float quaternion_r = 1.f; + float quaternion_i = 0.f; + float quaternion_j = 0.f; + float quaternion_k = 0.f; + float translation_x = 0.f; + float translation_y = 0.f; + float translation_z = 0.f; + }; + + __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) + { + QuaternionDecomposition qd; + qd.scale_x = M.l.vx.x; + qd.scale_y = M.l.vy.y; + qd.scale_z = M.l.vz.z; + qd.shift_x = M.p.x; + qd.shift_y = M.p.y; + qd.shift_z = M.p.z; + qd.translation_x = M.l.vx.y; + qd.translation_y = M.l.vx.z; + qd.translation_z = M.l.vy.z; + qd.skew_xy = M.l.vy.x; + qd.skew_xz = M.l.vz.x; + qd.skew_yz = M.l.vz.y; + qd.quaternion_r = M.p.w; + qd.quaternion_i = M.l.vx.w; + qd.quaternion_j = M.l.vy.w; + qd.quaternion_k = M.l.vz.w; + return qd; + } + + //////////////////////////////////////////////////////////////////////////////// + /* + * ! Template Specialization for 2D: return matrix for rotation around point + * (rotation around arbitrarty vector is not meaningful in 2D) + */ + template<> __forceinline + AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { + return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); + } + + //////////////////////////////////////////////////////////////////////////////// + // Similarity Transform + // + // checks, if M is a similarity transformation, i.e if there exists a factor D + // such that for all x,y: distance(Mx, My) = D * distance(x, y) + //////////////////////////////////////////////////////////////////////////////// + __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) + { + if (D) *D = 0.f; + if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; + if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; + if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; + + const float D_x = dot(M.l.vx, M.l.vx); + const float D_y = dot(M.l.vy, M.l.vy); + const float D_z = dot(M.l.vz, M.l.vz); + + if (abs(D_x - D_y) > 1e-5f || + abs(D_x - D_z) > 1e-5f || + abs(D_y - D_z) > 1e-5f) + return false; + + if (D) *D = sqrtf(D_x); + return true; + } + + __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) + { + Vec3fa::storeu(&ptr->l.vx, source.l.vx); + Vec3fa::storeu(&ptr->l.vy, source.l.vy); + Vec3fa::storeu(&ptr->l.vz, source.l.vz); + Vec3fa::storeu(&ptr->p, source.p); + } + + __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) + { + AffineSpace3fa space; + space.l.vx = Vec3fa::loadu(&ptr->l.vx); + space.l.vy = Vec3fa::loadu(&ptr->l.vy); + space.l.vz = Vec3fa::loadu(&ptr->l.vz); + space.p = Vec3fa::loadu(&ptr->p); + return space; + } + + #undef VectorT + #undef ScalarT +} diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h new file mode 100644 index 0000000000..bc43155358 --- /dev/null +++ b/thirdparty/embree/common/math/bbox.h @@ -0,0 +1,331 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" + +namespace embree +{ + namespace internal { + + template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); } + template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; } + template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; 
} + + } // namespace internal + template<typename T> + struct BBox + { + T lower, upper; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox ( ) { } + template<typename T1> + __forceinline BBox ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {} + __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline BBox ( const T& v ) : lower(v), upper(v) {} + __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Extending Bounds + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + /*! tests if box is empty */ + __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; } + + /*! computes the size of the box */ + __forceinline T size() const { return upper - lower; } + + /*! computes the center of the box */ + __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); } + + /*! computes twice the center of the box */ + __forceinline T center2() const { return lower+upper; } + + /*! merges two boxes */ + __forceinline static const BBox merge (const BBox& a, const BBox& b) { + return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); + } + + /*! enlarge box by some scaling factor */ + __forceinline BBox enlarge_by(const float a) const { + return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} + }; + + template<> __forceinline bool BBox<float>::empty() const { + return lower > upper; + } + +#if defined(__SSE__) + template<> __forceinline bool BBox<Vec3fa>::empty() const { + return !all(le_mask(lower,upper)); + } + template<> __forceinline bool BBox<Vec3fx>::empty() const { + return !all(le_mask(lower,upper)); + } +#endif + + /*! tests if box is finite */ + __forceinline bool isvalid( const BBox<Vec3fa>& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); + } + + /*! tests if box is finite and non-empty*/ + __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); + } + + /*! tests if box has finite entries */ + __forceinline bool is_finite( const BBox<Vec3fa>& b) { + return is_finite(b.lower) && is_finite(b.upper); + } + + /*! 
test if point contained in box */ + __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } + + /*! computes the center of the box */ + template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; } + template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); } + + /*! computes the volume of a bounding box */ + __forceinline float volume ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); } + __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); } + + /*! computes the volume of a bounding box */ + __forceinline float volume( const BBox<Vec3f>& b ) { return reduce_mul(b.size()); } + + /*! computes the surface area of a bounding box */ + template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; } + + template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); } + template<typename T> __forceinline const T area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); } + + __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); } + + __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); } + + template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); } + + template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) { + return halfArea(box); + } + + /*! merges bounding boxes and points */ + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const T& b ) { return BBox<T>(min(a.lower, b ), max(a.upper, b )); } + template<typename T> __forceinline const BBox<T> merge( const T& a, const BBox<T>& b ) { return BBox<T>(min(a , b.lower), max(a , b.upper)); } + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); } + + /*! Merges three boxes. */ + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); } + + /*! Merges four boxes. */ + template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! Comparison Operators */ + template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; } + template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; } + + /*! scaling */ + template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } + template<typename T> __forceinline BBox<T> operator *( const T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } + + /*! 
translations */ + template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); } + template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); } + template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower+b ,a.upper+b ); } + template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower-b ,a.upper-b ); } + + /*! extension */ + template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); } + + /*! intersect bounding boxes */ + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); } + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); } + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + /*! subtract bounds from each other */ + template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d) + { + c.lower = a.lower; + c.upper = min(a.upper,b.lower); + d.lower = max(a.lower,b.upper); + d.upper = a.upper; + } + + /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ + template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); } + template<typename T> __inline bool disjoint( const BBox<T>& a, const T& b ) { return disjoint(a,BBox<T>(b)); } + template<typename T> __inline bool disjoint( const T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); } + + /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */ + template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); } + template<typename T> __inline bool conjoint( const BBox<T>& a, const T& b ) { return conjoint(a,BBox<T>(b)); } + template<typename T> __inline bool conjoint( const T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); } + + /*! subset relation */ + template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b ) + { + for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; + for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; + return true; + } + + template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + /*! blending */ + template<typename T> + __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) { + return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); + } + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) { + return cout << "[" << box.lower << "; " << box.upper << "]"; + } + + /*! 
default template instantiations */ + typedef BBox<float> BBox1f; + typedef BBox<Vec2f> BBox2f; + typedef BBox<Vec2fa> BBox2fa; + typedef BBox<Vec3f> BBox3f; + typedef BBox<Vec3fa> BBox3fa; + typedef BBox<Vec3fx> BBox3fx; + typedef BBox<Vec3ff> BBox3ff; +} + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<int N> + __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds); + + template<> + __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds) + { + BBox<Vec3<vfloat4>> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } + +#if defined(__AVX__) + template<> + __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) + { + BBox<Vec3<vfloat8>> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + (vfloat4&)bounds[4].lower, + (vfloat4&)bounds[5].lower, + (vfloat4&)bounds[6].lower, + (vfloat4&)bounds[7].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + (vfloat4&)bounds[4].upper, + (vfloat4&)bounds[5].upper, + (vfloat4&)bounds[6].upper, + (vfloat4&)bounds[7].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } +#endif + + template<int N> + __forceinline BBox3fa merge(const BBox3fa* bounds); + + template<> + __forceinline BBox3fa merge<4>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), + min(bounds[2].lower,bounds[3].lower)); + const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), + max(bounds[2].upper,bounds[3].upper)); + return BBox3fa(lower,upper); + } + +#if defined(__AVX__) + template<> + __forceinline BBox3fa merge<8>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), + min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); + const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), + max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); + return BBox3fa(lower,upper); + } +#endif +} + diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h new file mode 100644 index 0000000000..3f50c04393 --- /dev/null +++ b/thirdparty/embree/common/math/col3.h @@ -0,0 +1,47 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Col3 + { + T r, g, b; + + 
//////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 ( ) { } + __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } + __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } + + __forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {} + __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} + __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} + __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} + __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} + }; + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } + + /*! default template instantiations */ + typedef Col3<unsigned char> Col3uc; + typedef Col3<float > Col3f; +} diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h new file mode 100644 index 0000000000..788508516b --- /dev/null +++ b/thirdparty/embree/common/math/col4.h @@ -0,0 +1,47 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Col4 + { + T r, g, b, a; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 ( ) { } + __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } + __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } + + __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} + __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} + __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} + __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} + __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} + }; + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; + } + + /*! 
default template instantiations */ + typedef Col4<unsigned char> Col4uc; + typedef Col4<float > Col4f; +} diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h new file mode 100644 index 0000000000..529584ea16 --- /dev/null +++ b/thirdparty/embree/common/math/color.h @@ -0,0 +1,241 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "constants.h" +#include "col3.h" +#include "col4.h" + +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color4 + { + union { + __m128 m128; + struct { float r,g,b,a; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4 () {} + __forceinline Color4 ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} + + __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } + __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } + + __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} + __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + d.a = (unsigned char)(s[3]); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color + { + union { + __m128 m128; + struct { float r,g,b; }; + }; + + 
//////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color () {} + __forceinline Color ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} + + __forceinline Color ( const Color& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } + + __forceinline Color ( const Color4& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + d.a = 255; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a ) { return a; } + __forceinline const Color operator -( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline const Color abs ( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline const Color rcp ( const Color& a ) + { +#if defined(__AVX512VL__) + const Color r = _mm_rcp14_ps(a.m128); +#else + const Color r = _mm_rcp_ps(a.m128); +#endif + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + } + __forceinline const Color rsqrt( const Color& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a, const Color& b ) { 
return _mm_add_ps(a.m128, b.m128); } + __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } + __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } + __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } + __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } + + __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } + __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } + __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } + __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } + __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } + __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } + __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + __forceinline bool operator < ( const Color& a, const Color& b ) { + if (a.r != b.r) return a.r < b.r; + if (a.g != b.g) return a.g < b.g; + if (a.b != b.b) return a.b < b.b; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color select( bool s, const Color& t, const Color& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Special Operators + //////////////////////////////////////////////////////////////////////////////// + + /*! computes luminance of a color */ + __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } + + /*! 
output operator */ + __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } +} diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp new file mode 100644 index 0000000000..03919ae20c --- /dev/null +++ b/thirdparty/embree/common/math/constants.cpp @@ -0,0 +1,27 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "constants.h" + +namespace embree +{ + TrueTy True; + FalseTy False; + ZeroTy zero; + OneTy one; + NegInfTy neg_inf; + PosInfTy inf; + PosInfTy pos_inf; + NaNTy nan; + UlpTy ulp; + PiTy pi; + OneOverPiTy one_over_pi; + TwoPiTy two_pi; + OneOverTwoPiTy one_over_two_pi; + FourPiTy four_pi; + OneOverFourPiTy one_over_four_pi; + StepTy step; + ReverseStepTy reverse_step; + EmptyTy empty; + UndefinedTy undefined; +} diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h new file mode 100644 index 0000000000..578473a8ab --- /dev/null +++ b/thirdparty/embree/common/math/constants.h @@ -0,0 +1,197 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +#include <limits> + +#define _USE_MATH_DEFINES +#include <math.h> // using cmath causes issues under Windows +#include <cfloat> +#include <climits> + +namespace embree +{ + static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; + static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail + + /* we consider floating point numbers in that range as valid input numbers */ + static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; + + struct TrueTy { + __forceinline operator bool( ) const { return true; } + }; + + extern MAYBE_UNUSED TrueTy True; + + struct FalseTy { + __forceinline operator bool( ) const { return false; } + }; + + extern MAYBE_UNUSED FalseTy False; + + struct ZeroTy + { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator char ( ) const { return 0; } + __forceinline operator unsigned char ( ) const { return 0; } + }; + + extern MAYBE_UNUSED ZeroTy zero; + + struct OneTy + { + __forceinline operator double ( ) const { return 1; } + __forceinline operator float ( ) const { return 1; } + __forceinline operator long long( ) const { return 1; } + __forceinline operator unsigned long long( ) const { return 1; } + __forceinline operator long ( ) const { return 1; } + __forceinline operator unsigned long ( ) const { return 1; } + __forceinline operator int ( ) const { return 1; } + __forceinline operator unsigned int ( ) const { return 1; } + __forceinline operator short ( ) const { return 1; } + __forceinline operator unsigned short ( ) const { return 1; } + __forceinline operator char ( ) const { return 1; } + __forceinline operator unsigned char ( ) const { return 1; } + }; + + extern MAYBE_UNUSED OneTy one; 
+ + struct NegInfTy + { + __forceinline operator double ( ) const { return -std::numeric_limits<double>::infinity(); } + __forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits<long long>::min(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); } + __forceinline operator long ( ) const { return std::numeric_limits<long>::min(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::min(); } + __forceinline operator int ( ) const { return std::numeric_limits<int>::min(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); } + __forceinline operator short ( ) const { return std::numeric_limits<short>::min(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); } + __forceinline operator char ( ) const { return std::numeric_limits<char>::min(); } + __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::min(); } + + }; + + extern MAYBE_UNUSED NegInfTy neg_inf; + + struct PosInfTy + { + __forceinline operator double ( ) const { return std::numeric_limits<double>::infinity(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits<long long>::max(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); } + __forceinline operator long ( ) const { return std::numeric_limits<long>::max(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::max(); } + __forceinline operator int ( ) const { return std::numeric_limits<int>::max(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::max(); } + __forceinline operator short ( ) const { return std::numeric_limits<short>::max(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); } + __forceinline operator char ( ) const { return std::numeric_limits<char>::max(); } + __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); } + }; + + extern MAYBE_UNUSED PosInfTy inf; + extern MAYBE_UNUSED PosInfTy pos_inf; + + struct NaNTy + { + __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); } + }; + + extern MAYBE_UNUSED NaNTy nan; + + struct UlpTy + { + __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); } + }; + + extern MAYBE_UNUSED UlpTy ulp; + + struct PiTy + { + __forceinline operator double( ) const { return double(M_PI); } + __forceinline operator float ( ) const { return float(M_PI); } + }; + + extern MAYBE_UNUSED PiTy pi; + + struct OneOverPiTy + { + __forceinline operator double( ) const { return double(M_1_PI); } + __forceinline operator float ( ) const { return float(M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverPiTy one_over_pi; + + struct TwoPiTy + { + __forceinline operator double( ) const { return double(2.0*M_PI); } + __forceinline operator float ( ) const { return float(2.0*M_PI); } + }; + + extern 
MAYBE_UNUSED TwoPiTy two_pi; + + struct OneOverTwoPiTy + { + __forceinline operator double( ) const { return double(0.5*M_1_PI); } + __forceinline operator float ( ) const { return float(0.5*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + + struct FourPiTy + { + __forceinline operator double( ) const { return double(4.0*M_PI); } + __forceinline operator float ( ) const { return float(4.0*M_PI); } + }; + + extern MAYBE_UNUSED FourPiTy four_pi; + + struct OneOverFourPiTy + { + __forceinline operator double( ) const { return double(0.25*M_1_PI); } + __forceinline operator float ( ) const { return float(0.25*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + + struct StepTy { + }; + + extern MAYBE_UNUSED StepTy step; + + struct ReverseStepTy { + }; + + extern MAYBE_UNUSED ReverseStepTy reverse_step; + + struct EmptyTy { + }; + + extern MAYBE_UNUSED EmptyTy empty; + + struct FullTy { + }; + + extern MAYBE_UNUSED FullTy full; + + struct UndefinedTy { + }; + + extern MAYBE_UNUSED UndefinedTy undefined; +} diff --git a/thirdparty/embree/common/math/interval.h b/thirdparty/embree/common/math/interval.h new file mode 100644 index 0000000000..310add2129 --- /dev/null +++ b/thirdparty/embree/common/math/interval.h @@ -0,0 +1,161 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" +#include "bbox.h" + +namespace embree +{ + template<typename V> + struct Interval + { + V lower, upper; + + __forceinline Interval() {} + __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } + __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline Interval(const V& a) : lower(a), upper(a) {} + __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} + __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {} + + /*! tests if box is empty */ + //__forceinline bool empty() const { return lower > upper; } + + /*! 
computes the size of the interval */ + __forceinline V size() const { return upper - lower; } + + __forceinline V center() const { return 0.5f*(lower+upper); } + + __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { + return Interval(a.lower+b.lower,a.upper+b.upper); + } + + __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { + return Interval(a.lower-b.upper,a.upper-b.lower); + } + + __forceinline friend Interval operator -( const Interval& a, const V& b ) { + return Interval(a.lower-b,a.upper-b); + } + + __forceinline friend Interval operator *( const Interval& a, const Interval& b ) + { + const V ll = a.lower*b.lower; + const V lu = a.lower*b.upper; + const V ul = a.upper*b.lower; + const V uu = a.upper*b.upper; + return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b) { + return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) { + return merge(merge(a,b),c); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! intersect bounding boxes */ + __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { + return cout << "[" << a.lower << ", " << a.upper << "]"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} + }; + + __forceinline bool isEmpty(const Interval<float>& v) { + return v.lower > v.upper; + } + + __forceinline vboolx isEmpty(const Interval<vfloatx>& v) { + return v.lower > v.upper; + } + + /*! 
subset relation */ + template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { + return (a.lower > b.lower) && (a.upper < b.upper); + } + + template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { + return subset(a.x,b.x) && subset(a.y,b.y); + } + + template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { + return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) { + return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) { + return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1) + { + float eps = 1E-4f; + bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; + bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; + return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); + } + + typedef Interval<float> Interval1f; + typedef Vec2<Interval<float>> Interval2f; + typedef Vec3<Interval<float>> Interval3f; + +inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } + +inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } + +#define TWO_PI (2.0*M_PI) +inline Interval1f sin(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float sinLower = sin(interval.lower); + float sinUpper = sin(interval.upper); + if (sinLower > sinUpper) swap(sinLower, sinUpper); + if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; + if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; + return Interval1f(sinLower, sinUpper); +} + +inline Interval1f cos(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float cosLower = cos(interval.lower); + float cosUpper = cos(interval.upper); + if (cosLower > cosUpper) swap(cosLower, cosUpper); + if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; + return Interval1f(cosLower, cosUpper); +} +#undef TWO_PI +} diff --git a/thirdparty/embree/common/math/lbbox.h b/thirdparty/embree/common/math/lbbox.h new file mode 100644 index 0000000000..2b397a05c8 --- /dev/null +++ b/thirdparty/embree/common/math/lbbox.h @@ -0,0 +1,289 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "range.h" + +namespace embree +{ + template<typename T> + __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt) + { + const 
float rcp_dt_size = float(1.0f)/dt.size(); + const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); + const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); + return std::make_pair(g0,g1); + } + + template<typename T> + struct LBBox + { + public: + __forceinline LBBox () {} + + template<typename T1> + __forceinline LBBox ( const LBBox<T1>& other ) + : bounds0(other.bounds0), bounds1(other.bounds1) {} + + __forceinline LBBox& operator= ( const LBBox& other ) { + bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; + } + + __forceinline LBBox (EmptyTy) + : bounds0(EmptyTy()), bounds1(EmptyTy()) {} + + __forceinline explicit LBBox ( const BBox<T>& bounds) + : bounds0(bounds), bounds1(bounds) { } + + __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) + : bounds0(bounds0), bounds1(bounds1) { } + + LBBox ( const avector<BBox<T>>& bounds ) + { + assert(bounds.size()); + BBox<T> b0 = bounds.front(); + BBox<T> b1 = bounds.back(); + for (size_t i=1; i<bounds.size()-1; i++) { + const float f = float(i)/float(bounds.size()-1); + const BBox<T> bt = lerp(b0,b1,f); + const T dlower = min(bounds[i].lower-bt.lower,T(zero)); + const T dupper = max(bounds[i].upper-bt.upper,T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) + { + const float lower = time_range.lower*numTimeSegments; + const float upper = time_range.upper*numTimeSegments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const int ilower = (int)ilowerf; + const int iupper = (int)iupperf; + + const BBox<T> blower0 = bounds(ilower); + const BBox<T> bupper1 = bounds(iupper); + + if (iupper-ilower == 1) { + bounds0 = lerp(blower0, bupper1, lower-ilowerf); + bounds1 = lerp(bupper1, blower0, iupperf-upper); + return; + } + + const BBox<T> blower1 = bounds(ilower+1); + const BBox<T> bupper0 = bounds(iupper-1); + BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf); + BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper); + + for (int i = ilower+1; i < iupper; i++) + { + const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! 
calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) + { + /* normalize global time_range_in to local geom_time_range */ + const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), + (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); + + const float lower = time_range.lower*geom_time_segments; + const float upper = time_range.upper*geom_time_segments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const float ilowerfc = max(0.0f,ilowerf); + const float iupperfc = min(iupperf,geom_time_segments); + const int ilowerc = (int)ilowerfc; + const int iupperc = (int)iupperfc; + assert(iupperc-ilowerc > 0); + + /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ + const int ilower_iter = max(-1,(int)ilowerf); + const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); + + const BBox<T> blower0 = bounds(ilowerc); + const BBox<T> bupper1 = bounds(iupperc); + if (iupper_iter-ilower_iter == 1) { + bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); + bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); + return; + } + + const BBox<T> blower1 = bounds(ilowerc+1); + const BBox<T> bupper0 = bounds(iupperc-1); + BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); + BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); + + for (int i = ilower_iter+1; i < iupper_iter; i++) + { + const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! 
calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments) + { + const int ilower = time_range.begin(); + const int iupper = time_range.end(); + + BBox<T> b0 = bounds(ilower); + BBox<T> b1 = bounds(iupper); + + if (iupper-ilower == 1) + { + bounds0 = b0; + bounds1 = b1; + return; + } + + for (int i = ilower+1; i<iupper; i++) + { + const float f = float(i - time_range.begin()) / float(time_range.size()); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + public: + + __forceinline bool empty() const { + return bounds().empty(); + } + + __forceinline BBox<T> bounds () const { + return merge(bounds0,bounds1); + } + + __forceinline BBox<T> interpolate( const float t ) const { + return lerp(bounds0,bounds1,t); + } + + __forceinline LBBox<T> interpolate( const BBox1f& dt ) const { + return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper)); + } + + __forceinline void extend( const LBBox& other ) { + bounds0.extend(other.bounds0); + bounds1.extend(other.bounds1); + } + + __forceinline float expectedHalfArea() const; + + __forceinline float expectedHalfArea(const BBox1f& dt) const { + return interpolate(dt).expectedHalfArea(); + } + + __forceinline float expectedApproxHalfArea() const { + return 0.5f*(halfArea(bounds0) + halfArea(bounds1)); + } + + /* calculates bounds for [0,1] time range from bounds in dt time range */ + __forceinline LBBox global(const BBox1f& dt) const + { + const float rcp_dt_size = 1.0f/dt.size(); + const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size); + const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); + return LBBox(b0,b1); + } + + /*! Comparison Operators */ + //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { + return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; + } + + public: + BBox<T> bounds0, bounds1; + }; + + /*! 
tests if box is finite */ + template<typename T> + __forceinline bool isvalid( const LBBox<T>& v ) { + return isvalid(v.bounds0) && isvalid(v.bounds1); + } + + template<typename T> + __forceinline bool isvalid_non_empty( const LBBox<T>& v ) { + return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); + } + + template<typename T> + __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) + { + const T da = a1-a0; + const T db = b1-b0; + return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); + } + + template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const + { + const Vec3fa d0 = bounds0.size(); + const Vec3fa d1 = bounds1.size(); + return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), + Vec3fa(d1.x,d1.y,d1.z), + Vec3fa(d0.y,d0.z,d0.x), + Vec3fa(d1.y,d1.z,d1.x))); + } + + template<typename T> + __forceinline float expectedApproxHalfArea(const LBBox<T>& box) { + return box.expectedApproxHalfArea(); + } + + template<typename T> + __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) { + return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); + } + + /*! subset relation */ + template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) { + return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); + } + + /*! default template instantiations */ + typedef LBBox<float> LBBox1f; + typedef LBBox<Vec2f> LBBox2f; + typedef LBBox<Vec3f> LBBox3f; + typedef LBBox<Vec3fa> LBBox3fa; + typedef LBBox<Vec3fx> LBBox3fx; +} diff --git a/thirdparty/embree/common/math/linearspace2.h b/thirdparty/embree/common/math/linearspace2.h new file mode 100644 index 0000000000..184ee695fb --- /dev/null +++ b/thirdparty/embree/common/math/linearspace2.h @@ -0,0 +1,148 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 2D Linear Transform (2x2 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct LinearSpace2 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace2 ( ) {} + __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } + __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } + + template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace2(const Vector& vx, const Vector& vy) + : vx(vx), vy(vy) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, + const Scalar& m10, const Scalar& m11) + : vx(m00,m10), vy(m01,m11) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } + + /*! 
returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} + __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace2 scale(const Vector& s) { + return LinearSpace2(s.x, 0, + 0 , s.y); + } + + /*! return matrix for rotation */ + static __forceinline LinearSpace2 rotate(const Scalar& r) { + Scalar s = sin(r), c = cos(r); + return LinearSpace2(c, -s, + s, c); + } + + /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ + LinearSpace2 orthogonal() const + { + LinearSpace2 m = *this; + + // mirrored? + Scalar mirror(one); + if (m.det() < Scalar(zero)) { + m.vx = -m.vx; + mirror = -mirror; + } + + // rotation + for (int i = 0; i < 99; i++) { + const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); + const LinearSpace2 d = m_next - m; + m = m_next; + // norm^2 of difference small enough? + if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) + break; + } + + // rotation * mirror_x + return LinearSpace2(mirror*m.vx, m.vy); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); } + template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); } + template<typename T> __forceinline LinearSpace2<T> rcp ( const LinearSpace2<T>& a ) { return a.inverse(); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); } + template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); } + + template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } + template<typename T> __forceinline T operator*(const LinearSpace2<T>& a, const T & b) { return b.x*a.vx + b.y*a.vy; } + template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } + + template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); } + template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); } + + template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; } + template<typename T> __forceinline 
LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; } + template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace2<Vec2f> LinearSpace2f; + typedef LinearSpace2<Vec2fa> LinearSpace2fa; +} diff --git a/thirdparty/embree/common/math/linearspace3.h b/thirdparty/embree/common/math/linearspace3.h new file mode 100644 index 0000000000..9eaa2cc2bb --- /dev/null +++ b/thirdparty/embree/common/math/linearspace3.h @@ -0,0 +1,213 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "quaternion.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 3D Linear Transform (3x3 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct LinearSpace3 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace3 ( ) {} + __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } + __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } + + template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) + : vx(vx), vy(vy), vz(vz) {} + + /*! construction from quaternion */ + __forceinline LinearSpace3( const QuaternionT<Scalar>& q ) + : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) + , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) + , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, + const Scalar& m10, const Scalar& m11, const Scalar& m12, + const Scalar& m20, const Scalar& m21, const Scalar& m22) + : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } + + /*! 
compute transposed matrix */ + __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } + + /*! returns third row of matrix */ + __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} + __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace3 scale(const Vector& s) { + return LinearSpace3(s.x, 0, 0, + 0 , s.y, 0, + 0 , 0, s.z); + } + + /*! return matrix for rotation around arbitrary axis */ + static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { + Vector u = normalize(_u); + Scalar s = sin(r), c = cos(r); + return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, + u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, + u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy,vz; + }; + + /*! compute transposed matrix */ + template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { + vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); + return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); + } + + template<typename T> + __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { + return xfm.transposed(); + } + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } + template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } + template<typename T> __forceinline LinearSpace3<T> rcp ( const LinearSpace3<T>& a ) { return a.inverse(); } + + /* constructs a coordinate frame form a normalized normal */ + template<typename T> __forceinline LinearSpace3<T> frame(const T& N) + { + const T dx0(0,N.z,-N.y); + const T dx1(-N.z,0,N.x); + const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3<T>(dx,dy,N); + } + + /* constructs a coordinate frame from a normal and approximate x-direction */ + template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi) + { + if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel + const T dx = normalize(cross(dxi,N)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3<T>(dx,dy,N); + } + + /* clamps linear space to range -1 to +1 */ + template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) { + return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)), + clamp(space.vy,T(-1.0f),T(1.0f)), + clamp(space.vz,T(-1.0f),T(1.0f))); + } + + 
//////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } + template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } + + template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } + template<typename T> __forceinline T operator*(const LinearSpace3<T>& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } + template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } + + template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } + template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } + + template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; } + template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; } + + template<typename T> __forceinline T xfmPoint (const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template<typename T> __forceinline T xfmVector(const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template<typename T> __forceinline T xfmNormal(const LinearSpace3<T>& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { + return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); + } + + /*! 
blending */ + template<typename T> + __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) + { + return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace3<Vec3f> LinearSpace3f; + typedef LinearSpace3<Vec3fa> LinearSpace3fa; + typedef LinearSpace3<Vec3fx> LinearSpace3fx; + typedef LinearSpace3<Vec3ff> LinearSpace3ff; + + template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>; + typedef LinearSpace3<Vec3<vfloat<4>>> LinearSpace3vf4; + typedef LinearSpace3<Vec3<vfloat<8>>> LinearSpace3vf8; + typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16; + + /*! blending */ + template<typename T, typename S> + __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, + const LinearSpace3<T>& l1, + const S& t) + { + return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + +} diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h new file mode 100644 index 0000000000..4bc54c1a6a --- /dev/null +++ b/thirdparty/embree/common/math/math.h @@ -0,0 +1,369 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "constants.h" +#include <cmath> + +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <emmintrin.h> +#include <xmmintrin.h> +#include <immintrin.h> +#endif + +#if defined(__WIN32__) +#if defined(_MSC_VER) && (_MSC_VER <= 1700) +namespace std +{ + __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } + __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } + __forceinline bool isfinite (const float x) { return _finite(x) != 0; } +} +#endif +#endif + +namespace embree +{ + __forceinline bool isvalid ( const float& v ) { + return (v > -FLT_LARGE) & (v < +FLT_LARGE); + } + + __forceinline int cast_f2i(float f) { + union { float f; int i; } v; v.f = f; return v.i; + } + + __forceinline float cast_i2f(int i) { + union { float f; int i; } v; v.i = i; return v.f; + } + + __forceinline int toInt (const float& a) { return int(a); } + __forceinline float toFloat(const int& a) { return float(a); } + +#if defined(__WIN32__) + __forceinline bool finite ( const float x ) { return _finite(x) != 0; } +#endif + + __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } + __forceinline float sqr ( const float x ) { return x*x; } + + __forceinline float rcp ( const float x ) + { + const __m128 a = _mm_set_ss(x); + +#if defined(__AVX512VL__) + const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rcp_ss(a); +#endif + +#if defined(__AVX2__) + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +#else + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +#endif + } + + __forceinline float signmsk ( const float x ) { + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); + } + __forceinline 
float xorf( const float x, const float y ) { + return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); + } + __forceinline float andf( const float x, const unsigned y ) { + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); + } + __forceinline float rsqrt( const float x ) + { + const __m128 a = _mm_set_ss(x); +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); +#else + __m128 r = _mm_rsqrt_ss(a); +#endif + r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); +#if defined(__ARM_NEON) + r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); +#endif + return _mm_cvtss_f32(r); + } + +#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) + __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } + __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } + __forceinline int roundf(float f) { return (int)(f + 0.5f); } +#else + __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } + __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } +#endif + + __forceinline float abs ( const float x ) { return ::fabsf(x); } + __forceinline float acos ( const float x ) { return ::acosf (x); } + __forceinline float asin ( const float x ) { return ::asinf (x); } + __forceinline float atan ( const float x ) { return ::atanf (x); } + __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } + __forceinline float cos ( const float x ) { return ::cosf (x); } + __forceinline float cosh ( const float x ) { return ::coshf (x); } + __forceinline float exp ( const float x ) { return ::expf (x); } + __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } + __forceinline float log ( const float x ) { return ::logf (x); } + __forceinline float log10( const float x ) { return ::log10f(x); } + __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } + __forceinline float sin ( const float x ) { return ::sinf (x); } + __forceinline float sinh ( const float x ) { return ::sinhf (x); } + __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } + __forceinline float tan ( const float x ) { return ::tanf (x); } + __forceinline float tanh ( const float x ) { return ::tanhf (x); } + __forceinline float floor( const float x ) { return ::floorf (x); } + __forceinline float ceil ( const float x ) { return ::ceilf (x); } + __forceinline float frac ( const float x ) { return x-floor(x); } + + __forceinline double abs ( const double x ) { return ::fabs(x); } + __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } + __forceinline double acos ( const double x ) { return ::acos (x); } + __forceinline double asin ( const double x ) { return ::asin (x); } + __forceinline double atan ( const double x ) { return ::atan (x); } + __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } + __forceinline double cos ( const double x ) { return ::cos (x); } + __forceinline double cosh ( const double x ) { return ::cosh (x); } + __forceinline double exp ( const double x ) { return ::exp (x); } + __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } + __forceinline double log ( const double x ) { return ::log (x); } + 
__forceinline double log10( const double x ) { return ::log10(x); } + __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } + __forceinline double rcp ( const double x ) { return 1.0/x; } + __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } + __forceinline double sin ( const double x ) { return ::sin (x); } + __forceinline double sinh ( const double x ) { return ::sinh (x); } + __forceinline double sqr ( const double x ) { return x*x; } + __forceinline double sqrt ( const double x ) { return ::sqrt (x); } + __forceinline double tan ( const double x ) { return ::tan (x); } + __forceinline double tanh ( const double x ) { return ::tanh (x); } + __forceinline double floor( const double x ) { return ::floor (x); } + __forceinline double ceil ( const double x ) { return ::ceil (x); } + +#if defined(__SSE4_1__) + __forceinline float mini(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_min_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + +#if defined(__SSE4_1__) + __forceinline float maxi(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_max_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + + template<typename T> + __forceinline T twice(const T& a) { return a+a; } + + __forceinline int min(int a, int b) { return a<b ? a:b; } + __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; } + __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } + __forceinline float min(float a, float b) { return a<b ? a:b; } + __forceinline double min(double a, double b) { return a<b ? a:b; } +#if defined(__64BIT__) + __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } +#endif + + template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } + template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } + template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } + + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } + + __forceinline int max(int a, int b) { return a<b ? b:a; } + __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; } + __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } + __forceinline float max(float a, float b) { return a<b ? b:a; } + __forceinline double max(double a, double b) { return a<b ? b:a; } +#if defined(__64BIT__) + __forceinline size_t max(size_t a, size_t b) { return a<b ? 
b:a; } +#endif + + template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } + template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } + template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } + + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } + +#if defined(__MACOSX__) + __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; } + __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; } +#endif + +#if defined(__MACOSX__) && !defined(__INTEL_COMPILER) + __forceinline void sincosf(float x, float *sin, float *cos) { + __sincosf(x,sin,cos); + } +#endif + +#if defined(__WIN32__) || defined(__FreeBSD__) + __forceinline void sincosf(float x, float *s, float *c) { + *s = sinf(x); *c = cosf(x); + } +#endif + + template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } + template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } + + template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } + template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } + template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } + template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } + +#if defined(__AVX2__) + __forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } +#else + __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } + __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } + __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} + __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } +#endif + + /*! 
random functions */ + template<typename T> T random() { return T(0); } +#if defined(_WIN32) + template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } +#else + template<> __forceinline int random() { return int(rand()); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } +#endif + template<> __forceinline float random() { return rand()/float(RAND_MAX); } + template<> __forceinline double random() { return rand()/double(RAND_MAX); } + +#if _WIN32 + __forceinline double drand48() { + return double(rand())/double(RAND_MAX); + } + + __forceinline void srand48(long seed) { + return srand(seed); + } +#endif + + /*! selects */ + __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } + __forceinline int select(bool s, int t, int f) { return s ? t : f; } + __forceinline float select(bool s, float t, float f) { return s ? t : f; } + + __forceinline bool all(bool s) { return s; } + + __forceinline float lerp(const float v0, const float v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + template<typename T> + __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { + return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); + } + + /*! exchange */ + template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } + + /* load/store */ + template<typename Ty> struct mem; + + template<> struct mem<float> { + static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } + static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } + + static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } + static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } + }; + + /*! bit reverse operation */ + template<class T> + __forceinline T bitReverse(const T& vin) + { + T v = vin; + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + v = ( v >> 16 ) | ( v << 16); + return v; + } + + /*! 
bit interleave operation */ + template<class T> + __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) + { + T x = xin, y = yin, z = zin; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); + } + +#if defined(__AVX2__) + + template<> + __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) + { + const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); + const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); + const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); + return xx | yy | zz; + } + +#endif + + /*! bit interleave operation for 64bit data types*/ + template<class T> + __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ + T x = xin & 0x1fffff; + T y = yin & 0x1fffff; + T z = zin & 0x1fffff; + + x = (x | x << 32) & 0x1f00000000ffff; + x = (x | x << 16) & 0x1f0000ff0000ff; + x = (x | x << 8) & 0x100f00f00f00f00f; + x = (x | x << 4) & 0x10c30c30c30c30c3; + x = (x | x << 2) & 0x1249249249249249; + + y = (y | y << 32) & 0x1f00000000ffff; + y = (y | y << 16) & 0x1f0000ff0000ff; + y = (y | y << 8) & 0x100f00f00f00f00f; + y = (y | y << 4) & 0x10c30c30c30c30c3; + y = (y | y << 2) & 0x1249249249249249; + + z = (z | z << 32) & 0x1f00000000ffff; + z = (z | z << 16) & 0x1f0000ff0000ff; + z = (z | z << 8) & 0x100f00f00f00f00f; + z = (z | z << 4) & 0x10c30c30c30c30c3; + z = (z | z << 2) & 0x1249249249249249; + + return x | (y << 1) | (z << 2); + } +} diff --git a/thirdparty/embree/common/math/obbox.h b/thirdparty/embree/common/math/obbox.h new file mode 100644 index 0000000000..2fe8bbf071 --- /dev/null +++ b/thirdparty/embree/common/math/obbox.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "linearspace3.h" + +namespace embree +{ + /*! 
Oriented bounding box */ + template<typename T> + struct OBBox + { + public: + + __forceinline OBBox () {} + + __forceinline OBBox (EmptyTy) + : space(one), bounds(empty) {} + + __forceinline OBBox (const BBox<T>& bounds) + : space(one), bounds(bounds) {} + + __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) + : space(space), bounds(bounds) {} + + friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { + return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; + } + + public: + LinearSpace3<T> space; //!< orthonormal transformation + BBox<T> bounds; //!< bounds in transformed space + }; + + typedef OBBox<Vec3f> OBBox3f; + typedef OBBox<Vec3fa> OBBox3fa; +} diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h new file mode 100644 index 0000000000..080800efcd --- /dev/null +++ b/thirdparty/embree/common/math/quaternion.h @@ -0,0 +1,254 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "vec4.h" + +#include "transcendental.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////// + // Quaternion Struct + //////////////////////////////////////////////////////////////// + + template<typename T> + struct QuaternionT + { + typedef Vec3<T> Vector; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT () { } + __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } + __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } + + __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} + __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} + __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} + __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} + __forceinline QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {} + + __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ); + __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} + __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} + + /*! return quaternion for rotation around arbitrary axis */ + static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) { + return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); + } + + /*! 
returns the rotation axis of the quaternion as a vector */ + __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); } + + public: + T r, i, j, k; + }; + + template<typename T> __forceinline QuaternionT<T> operator *( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); } + template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); } + + //////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////// + + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); } + template<typename T> __forceinline QuaternionT<T> conj ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); } + template<typename T> __forceinline T abs ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template<typename T> __forceinline QuaternionT<T> rcp ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + + // evaluates a*q-p + template<typename T> __forceinline QuaternionT<T> + msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(msub(a, q.r, p.r), + msub(a, q.i, p.i), + msub(a, q.j, p.j), + msub(a, q.k, p.k)); + } + // evaluates a*q+p + template<typename T> __forceinline QuaternionT<T> + madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(madd(a, q.r, p.r), + madd(a, q.i, p.i), + madd(a, q.j, p.j), + madd(a, q.k, p.k)); + } + + //////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////// + + template<typename T> __forceinline QuaternionT<T> operator +( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r, b.i, b.j, b.k); } + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); } + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } + + template<typename T> __forceinline Vec3<T> operator *( const QuaternionT<T>& a, const Vec3<T> & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) { + return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, + a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, + a.r*b.j - a.i*b.k + 
a.j*b.r + a.k*b.i, + a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); + } + template<typename T> __forceinline QuaternionT<T> operator /( const T & a, const QuaternionT<T>& b ) { return a*rcp(b); } + template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T & b ) { return a*rcp(b); } + template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); } + + template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T & b ) { return a = a+b; } + template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; } + template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T & b ) { return a = a-b; } + template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; } + template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T & b ) { return a = a*b; } + template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; } + template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T & b ) { return a = a*rcp(b); } + template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); } + + template<typename T, typename M> __forceinline QuaternionT<T> + select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(select(m, q.r, p.r), + select(m, q.i, p.i), + select(m, q.j, p.j), + select(m, q.k, p.k)); + } + + + template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + + template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } + template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Orientation Functions + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ) + { + if ( vx.x + vy.y + vz.z >= T(zero) ) + { + const T t = T(one) + (vx.x + vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = t*s; + i = (vy.z - vz.y)*s; + j = (vz.x - vx.z)*s; + k = (vx.y - vy.x)*s; + } + else if ( vx.x >= max(vy.y, vz.z) ) + { + const T t = (T(one) + vx.x) - (vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = (vy.z - vz.y)*s; + i = t*s; + j = (vx.y + vy.x)*s; + k = (vz.x 
+ vx.z)*s; + } + else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) + { + const T t = (T(one) + vy.y) - (vz.z + vx.x); + const T s = rsqrt(t)*T(0.5f); + r = (vz.x - vx.z)*s; + i = (vx.y + vy.x)*s; + j = t*s; + k = (vy.z + vz.y)*s; + } + else //if ( vz.z >= max(vy.y, vx.x) ) + { + const T t = (T(one) + vz.z) - (vx.x + vy.y); + const T s = rsqrt(t)*T(0.5f); + r = (vx.y - vy.x)*s; + i = (vz.x + vx.z)*s; + j = (vy.z + vz.y)*s; + k = t*s; + } + } + + template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll ) + { + const T cya = cos(yaw *T(0.5f)); + const T cpi = cos(pitch*T(0.5f)); + const T cro = cos(roll *T(0.5f)); + const T sya = sin(yaw *T(0.5f)); + const T spi = sin(pitch*T(0.5f)); + const T sro = sin(roll *T(0.5f)); + r = cro*cya*cpi + sro*sya*spi; + i = cro*cya*spi + sro*sya*cpi; + j = cro*sya*cpi - sro*cya*spi; + k = sro*cya*cpi - cro*sya*spi; + } + + ////////////////////////////////////////////////////////////////////////////// + /// Output Operators + ////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) { + return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; + } + + /*! default template instantiations */ + typedef QuaternionT<float> Quaternion3f; + typedef QuaternionT<double> Quaternion3d; + + template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>; + typedef QuaternionT<vfloat<4>> Quaternion3vf4; + typedef QuaternionT<vfloat<8>> Quaternion3vf8; + typedef QuaternionT<vfloat<16>> Quaternion3vf16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template<typename T> + __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0, + const QuaternionT<T>& q1, + const T& factor) + { + QuaternionT<T> q; + q.r = lerp(q0.r, q1.r, factor); + q.i = lerp(q0.i, q1.i, factor); + q.j = lerp(q0.j, q1.j, factor); + q.k = lerp(q0.k, q1.k, factor); + return q; + } + + template<typename T> + __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0, + const QuaternionT<T>& q1_, + const T& t) + { + T cosTheta = dot(q0, q1_); + QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); + cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); + if (unlikely(all(cosTheta > 0.9995f))) { + return normalize(lerp(q0, q1, t)); + } + const T phi = t * fastapprox::acos(cosTheta); + T sinPhi, cosPhi; + fastapprox::sincos(phi, sinPhi, cosPhi); + QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); + return msub(cosPhi, q0, qperp); + } +} diff --git a/thirdparty/embree/common/math/range.h b/thirdparty/embree/common/math/range.h new file mode 100644 index 0000000000..909fadb995 --- /dev/null +++ b/thirdparty/embree/common/math/range.h @@ -0,0 +1,137 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../math/math.h" + +namespace embree +{ + template<typename Ty> + struct range + { + __forceinline range() {} + + __forceinline range(const Ty& begin) + : _begin(begin), _end(begin+1) {} + + __forceinline range(const Ty& begin, const Ty& end) + : _begin(begin), _end(end) {} + + __forceinline range(const range& other) + : _begin(other._begin), _end(other._end) {} + + template<typename T1> + __forceinline range(const range<T1>& other) + : _begin(Ty(other._begin)), 
_end(Ty(other._end)) {} + + template<typename T1> + __forceinline range& operator =(const range<T1>& other) { + _begin = other._begin; + _end = other._end; + return *this; + } + + __forceinline Ty begin() const { + return _begin; + } + + __forceinline Ty end() const { + return _end; + } + + __forceinline range intersect(const range& r) const { + return range (max(_begin,r._begin),min(_end,r._end)); + } + + __forceinline Ty size() const { + return _end - _begin; + } + + __forceinline bool empty() const { + return _end <= _begin; + } + + __forceinline Ty center() const { + return (_begin + _end)/2; + } + + __forceinline std::pair<range,range> split() const + { + const Ty _center = center(); + return std::make_pair(range(_begin,_center),range(_center,_end)); + } + + __forceinline void split(range& left_o, range& right_o) const + { + const Ty _center = center(); + left_o = range(_begin,_center); + right_o = range(_center,_end); + } + + __forceinline friend bool operator< (const range& r0, const range& r1) { + return r0.size() < r1.size(); + } + + friend embree_ostream operator<<(embree_ostream cout, const range& r) { + return cout << "range [" << r.begin() << ", " << r.end() << "]"; + } + + Ty _begin, _end; + }; + + template<typename Ty> + range<Ty> make_range(const Ty& begin, const Ty& end) { + return range<Ty>(begin,end); + } + + template<typename Ty> + struct extended_range : public range<Ty> + { + __forceinline extended_range () {} + + __forceinline extended_range (const Ty& begin) + : range<Ty>(begin), _ext_end(begin+1) {} + + __forceinline extended_range (const Ty& begin, const Ty& end) + : range<Ty>(begin,end), _ext_end(end) {} + + __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) + : range<Ty>(begin,end), _ext_end(ext_end) {} + + __forceinline Ty ext_end() const { + return _ext_end; + } + + __forceinline Ty ext_size() const { + return _ext_end - range<Ty>::_begin; + } + + __forceinline Ty ext_range_size() const { + return _ext_end - range<Ty>::_end; + } + + __forceinline bool has_ext_range() const { + assert(_ext_end >= range<Ty>::_end); + return (_ext_end - range<Ty>::_end) > 0; + } + + __forceinline void set_ext_range(const size_t ext_end){ + assert(ext_end >= range<Ty>::_end); + _ext_end = ext_end; + } + + __forceinline void move_right(const size_t plus){ + range<Ty>::_begin += plus; + range<Ty>::_end += plus; + _ext_end += plus; + } + + friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { + return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; + } + + Ty _ext_end; + }; +} diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h new file mode 100644 index 0000000000..fd16c26e81 --- /dev/null +++ b/thirdparty/embree/common/math/transcendental.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Transcendental functions from "ispc": https://github.com/ispc/ispc/ +// Most of the transcendental implementations in ispc code come from +// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ + +#include "../simd/simd.h" + +namespace embree +{ + +namespace fastapprox +{ + +template <typename T> +__forceinline T sin(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range 
version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto flipSign = (kMod4 > 1); + + // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, + // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); + static const float sinC2 = -0.16666667163372039794921875; + static const float sinC4 = +8.333347737789154052734375e-3; + static const float sinC6 = -1.9842604524455964565277099609375e-4; + static const float sinC8 = +2.760012648650445044040679931640625e-6; + static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + static const float cosC2 = -0.5; + static const float cosC4 = +4.166664183139801025390625e-2; + static const float cosC6 = -1.388833043165504932403564453125e-3; + static const float cosC8 = +2.47562347794882953166961669921875e-5; + static const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(sinUseCos, 1., x); + auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); + auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); + auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); + auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); + auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template <typename T> +__forceinline T cos(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + + auto kMod4 = k & 3; + auto cosUseCos = (kMod4 == 0 | kMod4 == 2); + auto flipSign = (kMod4 == 1 | kMod4 == 2); + + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(cosUseCos, 1., x); + auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); + auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); + auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); + auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); + auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template <typename T> +__forceinline void sincos(const T &v, T &sinResult, T &cosResult) +{ + const float piOverTwoVec = 1.57079637050628662109375; + const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); + 
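  // Illustrative check of the quadrant logic (values worked out for exposition,
  // not part of the upstream computation): for v = 2.0f, scaled ~= 1.273, so
  // kReal = 1, k = 1, kMod4 = 1 and x = 2.0f - pi/2 ~= 0.4292. The masks around
  // this point then select sinUseCos = true, sinFlipSign = false,
  // cosUseCos = false, cosFlipSign = true, i.e. sin(2.0) is read from the cosine
  // polynomial at x (~0.9093) and cos(2.0) from the negated sine polynomial at x
  // (~-0.4161), matching sin(t) = cos(t - pi/2) and cos(t) = -sin(t - pi/2).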
auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); + auto sinFlipSign = (kMod4 > 1); + auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); + + const float oneVec = +1.; + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto x2 = x * x; + + auto sinFormula = x2 * sinC10 + sinC8; + auto cosFormula = x2 * cosC10 + cosC8; + sinFormula = x2 * sinFormula + sinC6; + cosFormula = x2 * cosFormula + cosC6; + + sinFormula = x2 * sinFormula + sinC4; + cosFormula = x2 * cosFormula + cosC4; + + sinFormula = x2 * sinFormula + sinC2; + cosFormula = x2 * cosFormula + cosC2; + + sinFormula = x2 * sinFormula + oneVec; + cosFormula = x2 * cosFormula + oneVec; + + sinFormula *= x; + + sinResult = select(sinUseCos, cosFormula, sinFormula); + cosResult = select(cosUseCos, cosFormula, sinFormula); + + sinResult = select(sinFlipSign, -sinResult, sinResult); + cosResult = select(cosFlipSign, -cosResult, cosResult); +} + +template <typename T> +__forceinline T tan(const T &v) +{ + const float piOverFourVec = 0.785398185253143310546875; + const float fourOverPiVec = 1.27323949337005615234375; + + auto xLt0 = v < 0.; + auto y = select(xLt0, -v, v); + auto scaled = y * fourOverPiVec; + + auto kReal = floor(scaled); + auto k = toInt(kReal); + + auto x = y - kReal * piOverFourVec; + + // If k & 1, x -= Pi/4 + auto needOffset = (k & 1) != 0; + x = select(needOffset, x - piOverFourVec, x); + + // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... + auto kMod4 = k & 3; + auto useCotan = (kMod4 == 1) | (kMod4 == 2); + + const float oneVec = 1.0; + + const float tanC2 = +0.33333075046539306640625; + const float tanC4 = +0.13339905440807342529296875; + const float tanC6 = +5.3348250687122344970703125e-2; + const float tanC8 = +2.46033705770969390869140625e-2; + const float tanC10 = +2.892402000725269317626953125e-3; + const float tanC12 = +9.500005282461643218994140625e-3; + + const float cotC2 = -0.3333333432674407958984375; + const float cotC4 = -2.222204394638538360595703125e-2; + const float cotC6 = -2.11752182804048061370849609375e-3; + const float cotC8 = -2.0846328698098659515380859375e-4; + const float cotC10 = -2.548247357481159269809722900390625e-5; + const float cotC12 = -3.5257363606433500535786151885986328125e-7; + + auto x2 = x * x; + T z; + if (any(useCotan)) + { + auto cotVal = x2 * cotC12 + cotC10; + cotVal = x2 * cotVal + cotC8; + cotVal = x2 * cotVal + cotC6; + cotVal = x2 * cotVal + cotC4; + cotVal = x2 * cotVal + cotC2; + cotVal = x2 * cotVal + oneVec; + // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. 
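  // (When kMod4 is 1 or 2, y sits an odd multiple of pi/2 away from x, and
  // tan(x + odd*pi/2) = -cot(x); the polynomial just computed approximates
  // x*cot(x), so dividing it by -x on the next line yields tan(y).)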
+ cotVal /= -x; + z = cotVal; + } + auto useTan = !useCotan; + if (any(useTan)) + { + auto tanVal = x2 * tanC12 + tanC10; + tanVal = x2 * tanVal + tanC8; + tanVal = x2 * tanVal + tanC6; + tanVal = x2 * tanVal + tanC4; + tanVal = x2 * tanVal + tanC2; + tanVal = x2 * tanVal + oneVec; + // Equation was for tan(x)/x + tanVal *= x; + z = select(useTan, tanVal, z); + } + return select(xLt0, -z, z); +} + +template <typename T> +__forceinline T asin(const T &x0) +{ + auto isneg = (x0 < 0.f); + auto x = abs(x0); + auto isnan = (x > 1.f); + + // sollya + // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], + // [1e-20;.9999999999999999]); + // avg error: 1.1105439e-06, max error 1.3187528e-06 + auto v = 1.57079517841339111328125f + + x * (-0.21450997889041900634765625f + + x * (8.78556668758392333984375e-2f + + x * (-4.489909112453460693359375e-2f + + x * (1.928029954433441162109375e-2f + + x * (-4.3095736764371395111083984375e-3f))))); + + v *= -sqrt(1.f - x); + v = v + 1.57079637050628662109375f; + + v = select(v < 0.f, T(0.f), v); + v = select(isneg, -v, v); + v = select(isnan, T(cast_i2f(0x7fc00000)), v); + + return v; +} + +template <typename T> +__forceinline T acos(const T &v) +{ + return 1.57079637050628662109375f - asin(v); +} + +template <typename T> +__forceinline T atan(const T &v) +{ + const float piOverTwoVec = 1.57079637050628662109375; + // atan(-x) = -atan(x) (so flip from negative to positive first) + // If x > 1 -> atan(x) = Pi/2 - atan(1/x) + auto xNeg = v < 0.f; + auto xFlipped = select(xNeg, -v, v); + + auto xGt1 = xFlipped > 1.; + auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); + + // These coefficients approximate atan(x)/x + const float atanC0 = +0.99999988079071044921875; + const float atanC2 = -0.3333191573619842529296875; + const float atanC4 = +0.199689209461212158203125; + const float atanC6 = -0.14015688002109527587890625; + const float atanC8 = +9.905083477497100830078125e-2; + const float atanC10 = -5.93664981424808502197265625e-2; + const float atanC12 = +2.417283318936824798583984375e-2; + const float atanC14 = -4.6721356920897960662841796875e-3; + + auto x2 = x * x; + auto result = x2 * atanC14 + atanC12; + result = x2 * result + atanC10; + result = x2 * result + atanC8; + result = x2 * result + atanC6; + result = x2 * result + atanC4; + result = x2 * result + atanC2; + result = x2 * result + atanC0; + result *= x; + + result = select(xGt1, piOverTwoVec - result, result); + result = select(xNeg, -result, result); + return result; +} + +template <typename T> +__forceinline T atan2(const T &y, const T &x) +{ + const float piVec = 3.1415926536; + // atan2(y, x) = + // + // atan2(y > 0, x = +-0) -> Pi/2 + // atan2(y < 0, x = +-0) -> -Pi/2 + // atan2(y = +-0, x < +0) -> +-Pi + // atan2(y = +-0, x >= +0) -> +-0 + // + // atan2(y >= 0, x < 0) -> Pi + atan(y/x) + // atan2(y < 0, x < 0) -> -Pi + atan(y/x) + // atan2(y, x > 0) -> atan(y/x) + // + // and then a bunch of code for dealing with infinities. 
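  // Worked example of the offset handling below: atan2(+1.0f, -1.0f) evaluates
  // atan(-1) = -pi/4; since x < 0 and y >= 0 the offset adds +pi, giving the
  // expected +3*pi/4.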
+ auto yOverX = y * rcpSafe(x); + auto atanArg = atan(yOverX); + auto xLt0 = x < 0.f; + auto yLt0 = y < 0.f; + auto offset = select(xLt0, + select(yLt0, T(-piVec), T(piVec)), 0.f); + return offset + atanArg; +} + +template <typename T> +__forceinline T exp(const T &v) +{ + const float ln2Part1 = 0.6931457519; + const float ln2Part2 = 1.4286067653e-6; + const float oneOverLn2 = 1.44269502162933349609375; + + auto scaled = v * oneOverLn2; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * ln2Part1; + x -= kReal * ln2Part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.; + const float c2 = 0.4999999105930328369140625; + const float c3 = 0.166668415069580078125; + const float c4 = 4.16539050638675689697265625e-2; + const float c5 = 8.378830738365650177001953125e-3; + const float c6 = 1.304379315115511417388916015625e-3; + const float c7 = 2.7555381529964506626129150390625e-4; + + auto result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + auto biasedN = k + fpbias; + auto overflow = kReal > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. + auto underflow = kReal <= -fpbias; + const int infBits = 0x7f800000; + biasedN <<= 23; + // Reinterpret this thing as float + auto twoToTheN = asFloat(biasedN); + // Handle both doubles and floats (hopefully eliding the copy for float) + auto elemtype2n = twoToTheN; + result *= elemtype2n; + result = select(overflow, cast_i2f(infBits), result); + result = select(underflow, 0., result); + return result; +} + +// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n +// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). +template <typename T, typename R> +__forceinline void __rangeReduceLog(const T &input, + T &reduced, + R &exponent) +{ + auto intVersion = asInt(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponentMask(0x7F800000) + static const int nonexponentMask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is + // -1 + 127 after biasing or 126 + static const int exponentNeg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). 
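  // Worked example (numbers for illustration only): for input 6.0f = 2^2 * 1.5f
  // the raw exponent field is 129, so biasedExponent = 129, offsetExponent = 130
  // and exponent = 3; forcing the exponent field to 126 gives reduced = 0.75f,
  // and indeed 6.0 = 2^3 * 0.75, hence log(6) = 3*ln(2) + log(0.75).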
+ auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] + + auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + exponent = offsetExponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + auto blended = (intVersion & nonexponentMask) | (exponentNeg1); + reduced = asFloat(blended); +} + +template <typename T> struct ExponentType { }; +template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; }; +template <> struct ExponentType<float> { typedef int Ty; }; + +template <typename T> +__forceinline T log(const T &v) +{ + T reduced; + typename ExponentType<T>::Ty exponent; + + const int nanBits = 0x7fc00000; + const int negInfBits = 0xFF800000; + const float nan = cast_i2f(nanBits); + const float negInf = cast_i2f(negInfBits); + auto useNan = v < 0.; + auto useInf = v == 0.; + auto exceptional = useNan | useInf; + const float one = 1.0; + + auto patched = select(exceptional, one, v); + __rangeReduceLog(patched, reduced, exponent); + + const float ln2 = 0.693147182464599609375; + + auto x1 = one - reduced; + const float c1 = +0.50000095367431640625; + const float c2 = +0.33326041698455810546875; + const float c3 = +0.2519190013408660888671875; + const float c4 = +0.17541764676570892333984375; + const float c5 = +0.3424419462680816650390625; + const float c6 = -0.599632322788238525390625; + const float c7 = +1.98442304134368896484375; + const float c8 = -2.4899270534515380859375; + const float c9 = +1.7491014003753662109375; + + auto result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += toFloat(exponent) * ln2; + + return select(exceptional, + select(useNan, T(nan), T(negInf)), + result); +} + +template <typename T> +__forceinline T pow(const T &x, const T &y) +{ + auto x1 = abs(x); + auto z = exp(y * log(x1)); + + // Handle special cases + const float twoOver23 = 8388608.0f; + auto yInt = y == round(y); + auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit + + // x == 0 + z = select(x == 0.0f, + select(y < 0.0f, T(inf) | signmsk(x), + select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); + + // x < 0 + auto xNegative = x < 0.0f; + if (any(xNegative)) + { + auto z1 = z | asFloat(yOddInt); + z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); + z = select(xNegative, z1, z); + } + + auto xFinite = isfinite(x); + auto yFinite = isfinite(y); + if (all(xFinite & yFinite)) + return z; + + // x finite and y infinite + z = select(andn(xFinite, yFinite), + select(x1 == 1.0f, 1.0f, + select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); + + // x infinite + z = select(xFinite, z, + select(y == 0.0f, 1.0f, + select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); + + return z; +} + +template <typename T> +__forceinline T pow(const T &x, float y) +{ + return pow(x, T(y)); +} + +} // namespace fastapprox + +} // namespace embree diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h new file mode 100644 index 0000000000..d62aef51f3 --- /dev/null +++ b/thirdparty/embree/common/math/vec2.h @@ -0,0 +1,235 @@ +// Copyright 2009-2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec2fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 2D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec2 + { + enum { N = 2 }; + union { + struct { T x, y; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ) {} + __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} + __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} + + __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2( const Vec2fa& other ); + + template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {} + template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} + __forceinline Vec2( OneTy ) : x(one), y(one) {} + __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} + __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} + +#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } + __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); } + template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); } + template<typename T> __forceinline Vec2<T> abs ( const Vec2<T>& a ) { return Vec2<T>(abs (a.x), abs (a.y)); } + template<typename T> __forceinline Vec2<T> rcp ( const Vec2<T>& a ) { return Vec2<T>(rcp (a.x), rcp (a.y)); } + template<typename T> __forceinline Vec2<T> rsqrt ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); } + template<typename T> __forceinline Vec2<T> sqrt ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); } + template<typename T> __forceinline Vec2<T> frac ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); } + 
template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x + b , a.y + b ); } + template<typename T> __forceinline Vec2<T> operator +( const T& a, const Vec2<T>& b ) { return Vec2<T>(a + b.x, a + b.y); } + template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); } + template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x - b , a.y - b ); } + template<typename T> __forceinline Vec2<T> operator -( const T& a, const Vec2<T>& b ) { return Vec2<T>(a - b.x, a - b.y); } + template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); } + template<typename T> __forceinline Vec2<T> operator *( const T& a, const Vec2<T>& b ) { return Vec2<T>(a * b.x, a * b.y); } + template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x * b , a.y * b ); } + template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); } + template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x / b , a.y / b ); } + template<typename T> __forceinline Vec2<T> operator /( const T& a, const Vec2<T>& b ) { return Vec2<T>(a / b.x, a / b.y); } + + template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); } + template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> madd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> msub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); } + + template<typename T> __forceinline Vec2<T> madd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> msub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; } + template<typename T> __forceinline Vec2<T>& 
operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; } + template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const T& b ) { a.x *= b ; a.y *= b ; return a; } + template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; } + template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; } + template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); } + template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; } + template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; } + template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) { + return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); } + template<typename T> __forceinline Vec2<T> cross ( const Vec2<T>& a ) { return Vec2<T>(-a.y,a.x); } + template<typename T> __forceinline T length ( const Vec2<T>& a ) { return sqrt(dot(a,a)); } + template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a ) { return a*rsqrt(dot(a,a)); } + template<typename T> __forceinline T distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); } + template<typename T> __forceinline T det ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; } + + template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) { + const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); + } + + template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) { + return 
Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template<typename T> + __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) { + return madd(Vec2<T>(T(1.0f)-t),v0,t*v1); + } + + template<typename T> __forceinline int maxDim ( const Vec2<T>& a ) + { + const Vec2<T> b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec2<bool > Vec2b; + typedef Vec2<int > Vec2i; + typedef Vec2<float> Vec2f; +} + +#include "vec2fa.h" + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} + +#if defined(__SSE__) + template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX__) + template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif +} diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h new file mode 100644 index 0000000000..a51fb68fd0 --- /dev/null +++ b/thirdparty/embree/common/math/vec2fa.h @@ -0,0 +1,301 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec2fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec2fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 2 }; + union { + __m128 m128; + struct { float x,y,az,aw; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ) {} + __forceinline Vec2fa( const __m128 a ) : m128(a) {} + + __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; } + __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } + __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} + + __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads 
and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec2fa load( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline Vec2fa loadu( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { + _mm_storeu_ps((float*)ptr,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } + __forceinline Vec2fa operator -( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec2fa abs ( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec2fa sign ( const Vec2fa& a ) { + return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); + } + + __forceinline Vec2fa rcp ( const Vec2fa& a ) + { +#if defined(__AVX512VL__) + const Vec2fa r = _mm_rcp14_ps(a.m128); +#else + const Vec2fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } + + __forceinline Vec2fa rsqrt( const Vec2fa& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec2fa zero_fix(const Vec2fa& a) { + return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec2fa rcp_safe(const Vec2fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec2fa log ( const Vec2fa& a ) { + return Vec2fa(logf(a.x),logf(a.y)); + } + + __forceinline Vec2fa exp ( const Vec2fa& a ) { + return Vec2fa(expf(a.x),expf(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } + __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } + __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { + return Vec2fa(powf(a.x,b),powf(a.y,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } +#endif + + __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } + __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } + __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } + __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa& operator +=( Vec2fa& a, const 
Vec2fa& b ) { return a = a + b; } + __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } + __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } + __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } + __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } + __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); + } +#else + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec2fa cross ( const Vec2fa& a ) { + return Vec2fa(-a.y,a.x); + } + + __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { + __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec2fa& a ) + { + const Vec2fa b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } + __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } + __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } +#elif defined (__SSE4_1__) + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +#else + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + typedef Vec2fa Vec2fa_t; +} diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h new file mode 100644 index 0000000000..ce94eff327 --- /dev/null +++ b/thirdparty/embree/common/math/vec3.h @@ -0,0 +1,337 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec3fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 3D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec3 + { + enum { N = 3 }; + + union { + struct { + T x, y, z; + }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ) {} + __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} + __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {} + + __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } + __forceinline Vec3( const Vec3fa& other ); + + template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} + template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; } + + __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {} + __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} + __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} + __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } + __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); } + template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); } + template<typename T> __forceinline Vec3<T> abs ( const Vec3<T>& a ) { return Vec3<T>(abs (a.x), abs (a.y), abs (a.z)); } + template<typename T> __forceinline Vec3<T> rcp ( const Vec3<T>& a ) { return Vec3<T>(rcp (a.x), rcp (a.y), rcp (a.z)); } + template<typename T> __forceinline Vec3<T> rsqrt ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } + template<typename T> __forceinline Vec3<T> sqrt ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } + + template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a ) + { + return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x), + select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y), + select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z)); + } + template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); } + template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); } + template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); } + template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) { return Vec3<T>(a * b.x, a * b.y, a * b.z); } + template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x * b , a.y * b , a.z * b ); } + template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x / b , a.y / b , a.z / b ); } + template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) { return Vec3<T>(a / b.x, a / b.y, a / b.z); } + template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y 
/ b.y, a.z / b.z); } + + template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } + template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } + + template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); } + template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} + template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } + + template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} + template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } + template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } + template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } + template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; } + template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; } + template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; } + template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) 
{ return min(a.x, a.y, a.z); } + template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } + template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } + template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) { + return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); + } + + template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template<typename T> + __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) { + return madd(Vec3<T>(T(1.0f)-t),v0,t*v1); + } + + template<typename T> __forceinline int maxDim ( const Vec3<T>& a ) + { + const Vec3<T> b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); } + template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); } + template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); } + template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); } + template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); } + template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + 
//////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); } + template<typename T> __forceinline T dot ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } + template<typename T> __forceinline T length ( const Vec3<T>& a ) { return sqrt(sqr(a)); } + template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); } + template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); } + template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); } + template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); } + + template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c ) + { + const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; + const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; + const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); + const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); + const auto sx = abs(ab_x) < abs(bc_x); + const auto sy = abs(ab_y) < abs(bc_y); + const auto sz = abs(ab_z) < abs(bc_z); + return Vec3<T>(select(sx,cross_ab.x,cross_bc.x), + select(sy,cross_ab.y,cross_bc.y), + select(sz,cross_ab.z,cross_bc.z)); + } + + template<typename T> __forceinline T sum ( const Vec3<T>& a ) { return a.x+a.y+a.z; } + + template<typename T> __forceinline T halfArea ( const Vec3<T>& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + template<typename T> __forceinline T area ( const Vec3<T>& d ) { return 2.0f*halfArea(d); } + + template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) { + const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); + } + + template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1) + { + const Vec3<T> N = cross(P-Q0,Q1-Q0); + const Vec3<T> D = Q1-Q0; + return dot(N,N)*rcp(dot(D,D)); + } + + template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0) + { + const Vec3<T> N = cross(PmQ0,Q1mQ0); + const Vec3<T> D = Q1mQ0; + return dot(N,N)*rcp(dot(D,D)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3<bool > Vec3b; + typedef Vec3<int > Vec3i; + typedef Vec3<float> Vec3f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<typename Out, typename In> + __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) { + return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); + } 
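The generic `Vec3<T>` helpers above (`dot`, `cross`, `length`, `normalize_safe`) compose like ordinary scalar math once a concrete `Vec3f` is instantiated. A minimal sketch, assuming the header path from this diff is usable as an include; `triangle_area` and `unit_normal` are illustrative names, not embree API:

```cpp
// Illustrative sketch only, exercising the generic Vec3<float> helpers defined above.
#include "thirdparty/embree/common/math/vec3.h"

using embree::Vec3f;

inline float triangle_area(const Vec3f& a, const Vec3f& b, const Vec3f& c) {
  // cross() of the two edge vectors has a length equal to twice the triangle area.
  return 0.5f * embree::length(embree::cross(b - a, c - a));
}

inline Vec3f unit_normal(const Vec3f& a, const Vec3f& b, const Vec3f& c) {
  // normalize_safe() returns its input unchanged when the cross product is zero,
  // so degenerate triangles do not divide by zero.
  return embree::normalize_safe(embree::cross(b - a, c - a));
}
```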
+ + template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } + +#if defined(__AVX__) + template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } +#elif defined(__SSE__) + template<> + __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); + } +#endif + +#if defined(__SSE__) + template<> + __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { + return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template<int i0, int i1, int i2, int i3> + __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) { + return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); + } +#endif + +#if defined(__AVX__) + template<> + __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } + + template<> + __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { + return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + template<> + __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) { + return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template<int i0, int i1, int i2, int i3> + __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) { + return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} +#endif +} diff --git a/thirdparty/embree/common/math/vec3ba.h b/thirdparty/embree/common/math/vec3ba.h new file mode 100644 index 0000000000..a021b522dc --- /dev/null +++ b/thirdparty/embree/common/math/vec3ba.h @@ -0,0 +1,120 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ba Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ba + { + ALIGNED_STRUCT_(16); + + union { + __m128 m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( ) {} + __forceinline Vec3ba( const __m128 input ) : m128(input) {} + __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} + __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ba( bool a ) + : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline Vec3ba( bool a, bool b, bool c) + : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + 
/// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } + __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } + __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } + __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } + __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; + } + __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; + } + __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } + __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } + + __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } + __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } + __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } + + __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } + + 
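The `Vec3ba` reductions above (`all`, `any`, `none`, `movemask`) are how per-axis mask results collapse into scalar branches. A small sketch, assuming the `vec3ba.h` path from this diff; `all_axes_pass` is an illustrative name:

```cpp
// Illustrative sketch only: combining per-axis masks and reducing them to a bool.
#include "thirdparty/embree/common/math/vec3ba.h"

using embree::Vec3ba;

inline bool all_axes_pass(const Vec3ba& near_ok, const Vec3ba& far_ok) {
  const Vec3ba both = near_ok & far_ok;  // per-axis logical AND
  // movemask() packs the x/y/z lanes into the low three bits; all() tests for 0x7.
  return embree::all(both);
}
```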
//////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { + return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")"; + } +} diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h new file mode 100644 index 0000000000..586039741d --- /dev/null +++ b/thirdparty/embree/common/math/vec3fa.h @@ -0,0 +1,727 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ) {} + __forceinline Vec3fa( const __m128 a ) : m128(a) {} + + __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } + __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fa load( const void* const a ) { + return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fa loadu( const void* const a ) { + return Vec3fa(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + 
__forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } + __forceinline Vec3fa operator -( const Vec3fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fa abs ( const Vec3fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fa sign ( const Vec3fa& a ) { + return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); + } + + __forceinline Vec3fa rcp ( const Vec3fa& a ) + { +#if defined(__AVX512VL__) + const Vec3fa r = _mm_rcp14_ps(a.m128); +#else + const Vec3fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fa rsqrt( const Vec3fa& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fa zero_fix(const Vec3fa& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fa rcp_safe(const Vec3fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fa log ( const Vec3fa& a ) { + return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fa exp ( const Vec3fa& a ) { + return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } + __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } + __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } 
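`rcp()` above pairs the hardware reciprocal approximation with one Newton-Raphson refinement step (`r' = r*(2 - a*r)`), and `rcp_safe()` first routes tiny components through `zero_fix()`, replacing them with `min_rcp_input`. A hedged sketch of the typical use, computing a reciprocal direction vector; `reciprocal_direction` is an illustrative name:

```cpp
// Illustrative sketch only: zero-guarded reciprocal of a direction vector.
#include "thirdparty/embree/common/math/vec3fa.h"

inline embree::Vec3fa reciprocal_direction(const embree::Vec3fa& dir) {
  // rcp_safe() = rcp(zero_fix(dir)): components with magnitude below min_rcp_input
  // are clamped first, so axis-aligned directions give a large finite value, not Inf.
  return embree::rcp_safe(dir);
}
```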
+ __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { + return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } +#endif + + __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } + __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } + __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } + __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } + __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fa& v) { + const vfloat4 
a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fa& v ) { + return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fa& a ) { + return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fa& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fa& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { + const float d = dot(a,a); 
if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fa& a ) + { + const Vec3fa b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE4_1__) + __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3fa Vec3fa_t; + + + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fx Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fx + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; union { int a; unsigned u; float w; }; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ) {} + __forceinline Vec3fx( const __m128 a ) : m128(a) {} + + __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} + __forceinline operator Vec3fa () const { return Vec3fa(m128); } + + __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } + + 
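`dnormalize()` above is the quotient-rule derivative of `p/|p|`, i.e. `(|p|^2*dp - (p.dp)*p) / |p|^3`, which the code factors as `(pp*dp - pdp*p)*rcp(pp)*rsqrt(pp)`. A small central-difference sanity check, assuming the `vec3fa.h` path from this diff; `check_dnormalize` and the step size `h` are illustrative:

```cpp
// Illustrative sketch only: central-difference check of dnormalize().
#include <cstdio>
#include "thirdparty/embree/common/math/vec3fa.h"

using embree::Vec3fa;

inline void check_dnormalize(const Vec3fa& p, const Vec3fa& dp) {
  const float h = 1e-3f;
  const Vec3fa analytic = embree::dnormalize(p, dp);
  const Vec3fa numeric  =
      (embree::normalize(p + h * dp) - embree::normalize(p - h * dp)) / (2.0f * h);
  // Both should agree to roughly the accuracy of the rsqrt approximation plus O(h^2).
  std::printf("analytic (%g %g %g) vs numeric (%g %g %g)\n",
              analytic.x, analytic.y, analytic.z, numeric.x, numeric.y, numeric.z);
}
```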
__forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } + __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__SSE4_1__) + m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); +#else + const vint4 mask(-1,-1,-1,0); + m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); +#endif + } + //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! + //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! + __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} + + //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fx load( const void* const a ) { + return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fx loadu( const void* const a ) { + return Vec3fx(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } + __forceinline Vec3fx operator -( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + 
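`Vec3fx` differs from `Vec3fa` mainly in exposing the fourth SSE lane as an extra payload (the `a`/`u`/`w` union) through the `(Vec3fa, int/unsigned/float)` constructors shown above. A brief sketch of that pattern; `tag_position`, `primitive_id_of` and `primitive_id` are illustrative names:

```cpp
// Illustrative sketch only: stashing an integer id in the fourth lane of Vec3fx.
#include "thirdparty/embree/common/math/vec3fa.h"

using embree::Vec3fa;
using embree::Vec3fx;

inline Vec3fx tag_position(const Vec3fa& pos, int primitive_id) {
  // The (Vec3fa, int) constructor copies x/y/z and writes the id into the
  // union member 'a' that aliases the otherwise unused fourth float lane.
  return Vec3fx(pos, primitive_id);
}

inline int primitive_id_of(const Vec3fx& v) {
  return v.a;  // read the payload back from the aliased lane
}
```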
__forceinline Vec3fx abs ( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fx sign ( const Vec3fx& a ) { + return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); + } + + __forceinline Vec3fx rcp ( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + const Vec3fx r = _mm_rcp14_ps(a.m128); +#else + const Vec3fx r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fx rsqrt( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fx zero_fix(const Vec3fx& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fx rcp_safe(const Vec3fx& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fx log ( const Vec3fx& a ) { + return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fx exp ( const Vec3fx& a ) { + return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } + __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } + __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fx pow ( const Vec3fx& 
a, const float& b ) { + return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } +#endif + + __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } + __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } + __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } + __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } + __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fx& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { 
return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fx& v ) { + return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fx& a ) { + return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fx& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fx& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { + __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fx& a ) + { + const Vec3fx b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); } + __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); } +#elif defined (__SSE4_1__) + __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + + typedef Vec3fx Vec3ff; +} diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h new file mode 100644 index 0000000000..694804c40d --- /dev/null +++ b/thirdparty/embree/common/math/vec3ia.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ia Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ia + { + ALIGNED_STRUCT_(16); + + union { + __m128i m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ) {} + __forceinline Vec3ia( const __m128i a ) : m128(a) {} + __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} + __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} + __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} + __forceinline explicit Vec3ia( const __m128 a ) : 
m128(_mm_cvtps_epi32(a)) {} + + __forceinline operator const __m128i&() const { return m128; } + __forceinline operator __m128i&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} + __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} + __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} + __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } + __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } +#if defined(__SSSE3__) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } + __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } + + __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } + __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } + +#if defined(__SSE4_1__) + __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } + __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } +#endif + + __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } + __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } + __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } + + __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } + __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } + __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } + + __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } + __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } + __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } + + __forceinline Vec3ia operator <<( const Vec3ia& a, const int n 
) { return _mm_slli_epi32(a.m128, n); } + __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } + + __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } + __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } + __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } + __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } + + __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } + __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } + +#if defined(__SSE4_1__) + __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } + __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } +#endif + + __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } + __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } + + __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } + __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } + + __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } + __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } + __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } + __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } + __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia select( const Vec3ba& m, const 
Vec3ia& t, const Vec3ia& f ) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + +#if defined(__SSE4_1__) + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } +#else + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } +} diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h new file mode 100644 index 0000000000..0ed107928a --- /dev/null +++ b/thirdparty/embree/common/math/vec4.h @@ -0,0 +1,243 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" +#include "vec3.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Generic 4D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec4 + { + enum { N = 4 }; + union { + struct { T x, y, z, w; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ) {} + __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} + __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} + __forceinline Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + + __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } + __forceinline Vec4( const Vec3fx& other ); + + template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} + template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} + __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} + __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} + __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + 
__forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Swizzles + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); } + template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); } + template<typename T> __forceinline Vec4<T> abs ( const Vec4<T>& a ) { return Vec4<T>(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } + template<typename T> __forceinline Vec4<T> rcp ( const Vec4<T>& a ) { return Vec4<T>(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } + template<typename T> __forceinline Vec4<T> rsqrt ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } + template<typename T> __forceinline Vec4<T> sqrt ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + template<typename T> __forceinline Vec4<T> operator *( const T& a, const Vec4<T>& b ) { return Vec4<T>(a * b.x, a * b.y, a * b.z, a * b.w); } + template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x * b , a.y * b , a.z * b , a.w * b ); } + template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } + template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x / b , a.y / b , a.z / b , a.w / b ); } + template<typename T> __forceinline Vec4<T> operator /( const T& a, const Vec4<T>& b ) { return Vec4<T>(a / b.x, a / b.y, a / b.z, a / b.w); } + + template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } + template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> madd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> msub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } + + template<typename T> __forceinline Vec4<T> madd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> msub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } + template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } + template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } + template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; } + template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; } + template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); } + template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } + template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || 
a.w != b.w; } + template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + if (a.w != b.w) return a.w < b.w; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) { + return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } + + template<typename T> __forceinline T length ( const Vec4<T>& a ) { return sqrt(dot(a,a)); } + template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) { return a*rsqrt(dot(a,a)); } + template<typename T> __forceinline T distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); + } + + template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template<typename T> + __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) { + return madd(Vec4<T>(T(1.0f)-t),v0,t*v1); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec4<bool > Vec4b; + typedef Vec4<unsigned char> Vec4uc; + typedef Vec4<int > Vec4i; + typedef Vec4<float > Vec4f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined __AVX512F__ +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = 
a.w; } + +#if defined(__AVX__) + template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#elif defined(__SSE__) + template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); + } +#endif + +#if defined(__AVX__) + template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} +#endif +} diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h new file mode 100644 index 0000000000..1c3875fb27 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/emulation.h @@ -0,0 +1,50 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* Make precision match SSE, at the cost of some performance */ +#if !defined(__aarch64__) +# define SSE2NEON_PRECISE_DIV 1 +# define SSE2NEON_PRECISE_SQRT 1 +#endif + +#include "sse2neon.h" + +__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { + __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c))); + return _mm_fmadd_ps(a, b, neg_c); +} + +__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_sub_ps(c, _mm_mul_ps(a, b)); +#endif +} + +__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { + return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c)))); +} + + +/* Dummy defines for floating point control */ +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_DIV_ZERO 0x200 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_MASK_DENORM 0x100 +#define _MM_SET_EXCEPTION_MASK(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) + +__forceinline int _mm_getcsr() +{ + return 0; +} + +__forceinline void _mm_mfence() +{ + __sync_synchronize(); +} diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h new file mode 100644 index 0000000000..7eb25cf2c5 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/sse2neon.h @@ -0,0 +1,6996 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. +// +// Contributors to this work are: +// John W. 
Ratcliff <jratcliffscarab@gmail.com> +// Brandon Rowlett <browlett@nvidia.com> +// Ken Fast <kfast@gdeb.com> +// Eric van Beurden <evanbeurden@nvidia.com> +// Alexander Potylitsin <apotylitsin@nvidia.com> +// Hasindu Gamaarachchi <hasindu2008@gmail.com> +// Jim Huang <jserv@biilabs.io> +// Mark Cheng <marktwtn@biilabs.io> +// Malcolm James MacLeod <malcolm@gulden.com> +// Devin Hussey (easyaspi314) <husseydevin@gmail.com> +// Sebastian Pop <spop@amazon.com> +// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> +// Danila Kutenin <danilak@google.com> +// François Turban (JishinMaster) <francois.turban@gmail.com> +// Pei-Hsuan Hung <afcidk@gmail.com> +// Yang-Hao Yuan <yanghau@biilabs.io> +// Syoyo Fujita <syoyo@lighttransport.com> +// Brecht Van Lommel <brecht@blender.org> + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min_ps and _mm_max_ps */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +#ifndef SSE2NEON_PRECISE_RSQRT +#define SSE2NEON_PRECISE_RSQRT (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#error "Macro name collisions may happen with unsupported compiler." 
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+#ifndef likely
+#define likely(x) (x)
+#endif
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+
+/* Rounding functions require either Aarch64 instructions or libm fallback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if __GNUC__ <= 9
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define 
vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* Backwards compatibility for compilers that lack specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) && \
+    ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
+     (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \
+     (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ * // Set packed 16-bit integers.
128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Set/get methods */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. 
+FORCE_INLINE void _mm_pause() +{ + __asm__ __volatile__("isb\n"); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 +FORCE_INLINE int _mm_cvtss_si64(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64( + vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int64_t) ceil(data); + if (unlikely(diff == 0.5)) { + int64_t f = (int64_t) floor(data); + int64_t c = (int64_t) ceil(data); + return c & 1 ? f : c; + } + return (int64_t) floor(data); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return vgetq_lane_s64( + vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Return vector of type __m128d with all elements set to zero. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... 
+// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. 
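+// A minimal usage sketch, assuming a caller that wants bytes laid out in
+// memory order (the byte values are illustrative only):
+//
+//   // b0 is the lowest byte of the result, b15 the highest.
+//   __m128i ramp = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+//                                8, 9, 10, 11, 12, 13, 14, 15);
+//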
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 4 signed 32-bit integer values to i. +// +// r0 := i +// r1 := i +// r2 := i +// r3 := I +// +// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Sets the 4 signed 32-bit integer values. +// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x((int64_t) i1, (int64_t) i2); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. 
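+// A minimal usage sketch (the values and variable names are illustrative
+// only):
+//
+//   double lo;
+//   __m128d v = _mm_set_sd(3.5);
+//   _mm_storel_pd(&lo, v); // lo == 3.5; the upper element of v is 0.0
+//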
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ + return _mm_set_pd(0, a); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[127:96] +// MEM[mem_addr+63:mem_addr+32] := a[95:64] +// MEM[mem_addr+95:mem_addr+64] := a[63:32] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores the lower single - precision, floating - point value. 
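+// A minimal usage sketch (variable names are illustrative only):
+//
+//   float lane0;
+//   _mm_store_ss(&lane0, _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
+//   // lane0 == 1.0f, since _mm_set_ps places its last argument in the
+//   // lowest lane.
+//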
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// MEM[mem_addr+127:mem_addr+64] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. 
mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. 
+// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from 16-byte aligned memory, floating-point +// values. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from unaligned memory, floating-point values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Loads an single - precision, floating - point value into the low word and +// clears the upper three words. +// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load 64-bit integer from memory into the first element of dst. 
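+// A minimal usage sketch, assuming 8 readable bytes at the source address
+// (the variable names are illustrative only):
+//
+//   int64_t src = 0x0123456789abcdefLL;
+//   __m128i v = _mm_loadl_epi64((const __m128i *) &src);
+//   // The lower 64 bits of v hold src; the upper 64 bits are zero.
+//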
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Return vector of type __m128 with undefined elements. 
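+// A minimal usage sketch: the contents must not be read before every lane is
+// overwritten (variable names are illustrative only):
+//
+//   __m128 acc = _mm_undefined_ps(); // placeholder, no defined value
+//   acc = _mm_set1_ps(0.0f);         // fully overwritten before any use
+//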
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +/* Logic/Binary operations */ + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. +// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. +// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. 
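+// A minimal usage sketch: XOR with a sign-bit mask negates all four lanes
+// (the values and variable names are illustrative only):
+//
+//   __m128 v = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f);
+//   __m128 neg = _mm_xor_ps(v, _mm_set1_ps(-0.0f));
+//   // neg holds {1.0f, -2.0f, 3.0f, -4.0f} from lowest to highest lane.
+//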
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if (__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
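+// A minimal usage sketch (the values are illustrative only):
+//
+//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // lanes {1, 2, 3, 4}
+//   __m128 d = _mm_moveldup_ps(v);                  // lanes {1, 1, 3, 3}
+//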
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. +// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. 
+// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// +// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +// dst[127:0] := tmp[127:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) >= 32)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + uint8x16_t tmp_low, tmp_high; \ + if (imm >= 16) { \ + const int idx = imm - 16; \ + tmp_low = vreinterpretq_u8_m128i(a); \ + tmp_high = vdupq_n_u8(0); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpretq_u8_m128i(b); \ + tmp_high = vreinterpretq_u8_m128i(a); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// +// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +// dst[63:0] := tmp[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + __extension__({ \ + __m64 ret; \ + if (unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low, tmp_high; \ + if (imm >= 8) { \ + const int idx = imm - 8; \ + tmp_low = vreinterpret_u8_m64(a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpret_u8_m64(b); \ + tmp_high = vreinterpret_u8_m64(a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. 
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + 
return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = 
_mm_shuffle_ps_0011((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps((b), (a)); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032((a), (b)); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default((a), (b), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. +FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most signficant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least signficant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = 
vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
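+// A minimal usage sketch: imm is built with _MM_SHUFFLE(z, y, x, w), where w
+// selects the source lane for destination lane 0 and z for destination lane 3
+// (the values are illustrative only):
+//
+//   __m128i v = _mm_setr_epi32(10, 20, 30, 40); // lanes {10, 20, 30, 40}
+//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r holds {40, 30, 20, 10}: lane 0 takes source lane 3, and so on.
+//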
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = __builtin_shufflevector( \ + _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032((a)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301((a)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321((a)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101((a)); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122((a)); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332((a)); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat((a), 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat((a), 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat((a), 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat((a), 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default((a), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) +// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflelo_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified +// by imm. 
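+// A minimal usage sketch: only lanes 4-7 are reordered, lanes 0-3 pass
+// through unchanged (the values are illustrative only):
+//
+//   __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+//   __m128i r = _mm_shufflehi_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r holds {0, 1, 2, 3, 7, 6, 5, 4} from lowest to highest lane.
+//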
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. 
+// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +/* Shifts */ + + +// Shift packed 16-bit integers in a right by imm while shifting in sign +// bits, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx +#define _mm_slli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm)) <= 0) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s16( \ + vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. : +// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely(imm) == 0) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 16)) { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
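// Usage sketch (illustrative, hypothetical example name): _mm_blendv_epi8
// keys on the sign bit of each mask byte, so a NEON byte compare (all-ones or
// all-zeros per byte) can drive it directly; this computes a signed per-byte
// maximum.
FORCE_INLINE __m128i sse2neon_example_blendv_epi8(__m128i a, __m128i b)
{
    uint8x16_t b_gt_a =
        vcgtq_s8(vreinterpretq_s8_m128i(b), vreinterpretq_s8_m128i(a));
    // Where b > a take b, otherwise keep a.
    return _mm_blendv_epi8(a, b, vreinterpretq_m128i_u8(b_gt_a));
}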
+// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 64)) { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ + } \ + ret; \ + }) + +// Shifts the 128 - bit value in a right by imm bytes while shifting in +// zeros.imm must be an immediate. +// +// r := srl(a, imm*8) +// +// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) <= 0)) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm +// must be an immediate. +// +// r := a << (imm * 8) +// +// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_slli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) <= 0)) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8(vextq_s8( \ + vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ + } \ + ret; \ + }) + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. 
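// Usage sketch (illustrative, hypothetical example name): the *_si128 shifts
// move whole bytes across the 128-bit value, which is useful for realigning
// packed data.
FORCE_INLINE void sse2neon_example_byte_shifts(void)
{
    const int32_t in[4] = {1, 2, 3, 4}; // lane 0 holds 1
    __m128i x = vreinterpretq_m128i_s32(vld1q_s32(in));
    __m128i r = _mm_srli_si128(x, 4); // drops lane 0: {2, 3, 4, 0}
    __m128i l = _mm_slli_si128(x, 4); // shifts in a zero lane: {0, 1, 2, 3}
    (void) r;
    (void) l;
}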
+// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// r2 := a2 << count +// r3 := a3 << count +// +// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// +// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// ... +// r7 := srl(a7, count) +// +// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// r2 := srl(a2, count) +// r3 := srl(a3, count) +// +// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// +// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. 
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four +// single-precision, floating-point values. 
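// Usage sketch (illustrative, hypothetical example name): the classic pairing
// of a byte compare with _mm_movemask_epi8 to locate a byte within a 16-byte
// block; __builtin_ctz is the GCC/Clang builtin.
FORCE_INLINE int sse2neon_example_find_byte(const uint8_t *p, uint8_t c)
{
    uint8x16_t eq = vceqq_u8(vld1q_u8(p), vdupq_n_u8(c));
    int mask = _mm_movemask_epi8(vreinterpretq_m128i_u8(eq));
    // Bit i of mask is set exactly when p[i] == c.
    return mask ? __builtin_ctz(mask) : -1;
}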
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) + static const int32x4_t shift = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 + : 1; +} + +/* Math operations */ + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. +// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
+// integers of a and saturates.
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+//   r0 := UnsignedSaturate(a0 - b0)
+//   r1 := UnsignedSaturate(a1 - b1)
+//   ...
+//   r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
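// Usage sketch (illustrative, hypothetical example name): wrapping versus
// saturating subtraction on unsigned bytes.
FORCE_INLINE void sse2neon_example_saturating_sub(void)
{
    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(10));
    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(20));
    __m128i wrap = _mm_sub_epi8(a, b); // every byte wraps to 0xF6 (-10)
    __m128i sat = _mm_subs_epu8(a, b); // every byte saturates to 0
    (void) wrap;
    (void) sat;
}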
+// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] - db[0]; + c[1] = da[1] - db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..15 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..7 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 
0xFFFF : 0 +#if defined(__aarch64__) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..3 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 3 +// i := j*16 +// IF b[i+15:i] < 0 +// dst[i+15:i] := -(a[i+15:i]) +// ELSE IF b[i+15:i] == 0 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. 
+// +// FOR j := 0 to 1 +// i := j*32 +// IF b[i+31:i] < 0 +// dst[i+31:i] := -(a[i+31:i]) +// ELSE IF b[i+31:i] == 0 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := a[i+31:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// +// FOR j := 0 to 7 +// i := j*8 +// IF b[i+7:i] < 0 +// dst[i+7:i] := -(a[i+7:i]) +// ELSE IF b[i+7:i] == 0 +// dst[i+7:i] := 0 +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. 
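// Usage sketch (illustrative, hypothetical example name): the pu8/pu16
// averages round upwards, e.g. (1 + 2 + 1) >> 1 == 2 for every byte here.
FORCE_INLINE __m64 sse2neon_example_avg_rounding(void)
{
    __m64 a = vreinterpret_m64_u8(vdup_n_u8(1));
    __m64 b = vreinterpret_m64_u8(vdup_n_u8(2));
    return _mm_avg_pu8(a, b); // every byte is 2
}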
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Adds the four single-precision, floating-point values of a and b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. 
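// Usage sketch (illustrative, hypothetical example name): summing two float
// arrays four lanes at a time with _mm_add_ps; n is assumed to be a multiple
// of 4 to keep the sketch short.
FORCE_INLINE void sse2neon_example_add_arrays(float *dst,
                                              const float *a,
                                              const float *b,
                                              int n)
{
    for (int i = 0; i < n; i += 4) {
        __m128 va = vreinterpretq_m128_f32(vld1q_f32(a + i));
        __m128 vb = vreinterpretq_m128_f32(vld1q_f32(b + i));
        vst1q_f32(dst + i, vreinterpretq_f32_m128(_mm_add_ps(va, vb)));
    }
}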
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1] + db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// adds the scalar single-precision floating point values of a and b. +// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of <a>. + return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or +// unsigned 16-bit integers in b. +// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or +// unsigned 8-bit integers in b. +// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b +// and saturates. +// +// r0 := SignedSaturate(a0 + b0) +// r1 := SignedSaturate(a1 + b1) +// ... 
+// r7 := SignedSaturate(a7 + b7) +// +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in +// b and saturates.. +// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// tmp[31:0] := a[i+15:i] * b[i+15:i] +// dst[i+15:i] := tmp[31:16] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// +// dst[31:0] := a[31:0] * b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// +// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) +// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// +// dst[63:0] := a[31:0] * b[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// +// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 +// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0) + (a1 * b1) +// r1 := (a2 * b2) + (a3 * b3) +// r2 := (a4 * b4) + (a5 * b5) +// r3 := (a6 * b6) + (a7 * b7) +// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// +// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) +// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) +// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) +// ... 
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + +// a[i+7:i]*b[i+7:i] ) +// ENDFOR +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Computes the fused multiple add product of 32-bit floating point numbers. +// +// Return Value +// Multiplies A and B, and adds C to the temporary result before returning it. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd +FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. 
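// Usage sketches (illustrative, hypothetical example names). First,
// _mm_madd_epi16 as the building block of a 16-bit dot product (n assumed to
// be a multiple of 8); second, _mm_fmadd_ps evaluating a small polynomial in
// Horner form.
FORCE_INLINE int32_t sse2neon_example_dot_s16(const int16_t *a,
                                              const int16_t *b,
                                              int n)
{
    __m128i acc = _mm_setzero_si128();
    for (int i = 0; i < n; i += 8) {
        __m128i va = vreinterpretq_m128i_s16(vld1q_s16(a + i));
        __m128i vb = vreinterpretq_m128i_s16(vld1q_s16(b + i));
        // Each madd lane holds a0*b0 + a1*b1, a2*b2 + a3*b3, ...
        acc = _mm_add_epi32(acc, _mm_madd_epi16(va, vb));
    }
    // Horizontal sum of the four 32-bit partial sums.
    int32x4_t v = vreinterpretq_s32_m128i(acc);
    int32x2_t s = vadd_s32(vget_low_s32(v), vget_high_s32(v));
    return vget_lane_s32(vpadd_s32(s, s), 0);
}

FORCE_INLINE __m128 sse2neon_example_poly2(__m128 x, float c2, float c1, float c0)
{
    // y = (c2 * x + c1) * x + c0, one fused multiply-add per step.
    __m128 y = vreinterpretq_m128_f32(vdupq_n_f32(c2));
    y = _mm_fmadd_ps(y, x, vreinterpretq_m128_f32(vdupq_n_f32(c1)));
    return _mm_fmadd_ps(y, x, vreinterpretq_m128_f32(vdupq_n_f32(c0)));
}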
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i) vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint16x4_t t = + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// +// FOR j := 0 to 7 +// i := j*8 +// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +// ENDFOR +// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + +// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Divides the four single-precision, floating-point values of a and b. +// +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 +// +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#endif + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. 
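// Usage sketch (illustrative, hypothetical example name): a 16-byte sum of
// absolute differences, the core of simple block-matching metrics. The two
// partial sums produced by _mm_sad_epu8 live in 16-bit lanes 0 and 4.
FORCE_INLINE uint32_t sse2neon_example_sad16(const uint8_t *a, const uint8_t *b)
{
    __m128i sad = _mm_sad_epu8(vreinterpretq_m128i_u8(vld1q_u8(a)),
                               vreinterpretq_m128i_u8(vld1q_u8(b)));
    uint16x8_t v = vreinterpretq_u16_m128i(sad);
    return (uint32_t) vgetq_lane_u16(v, 0) + vgetq_lane_u16(v, 4);
}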
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// dst[i+63:i] := a[i+63:i] / b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] / db[0]; + c[1] = da[1] / db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// +// dst[31:0] := (1.0 / a[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Computes the approximations of square roots of the four single-precision, +// floating-point values of a. First computes reciprocal square roots and then +// reciprocals of the four values. +// +// r0 := sqrt(a0) +// r1 := sqrt(a1) +// r2 := sqrt(a2) +// r3 := sqrt(a3) +// +// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if SSE2NEON_PRECISE_SQRT + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. 
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#elif defined(__aarch64__) + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t sq = vrecpeq_f32(recipsq); + return vreinterpretq_m128_f32(sq); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision +// floating point value of in. +// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Computes the approximations of the reciprocal square roots of the four +// single-precision floating point values of in. +// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); +#if SSE2NEON_PRECISE_RSQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Computes the maximums of the four single-precision, floating-point values of +// a and b. 
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. 
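// Usage sketch (illustrative, hypothetical example name): clamping four floats
// to [lo, hi] with _mm_max_ps/_mm_min_ps.
FORCE_INLINE __m128 sse2neon_example_clamp_ps(__m128 x, float lo, float hi)
{
    x = _mm_max_ps(x, vreinterpretq_m128_f32(vdupq_n_f32(lo)));
    return _mm_min_ps(x, vreinterpretq_m128_f32(vdupq_n_f32(hi)));
}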
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. 
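+//
+// Worked lane example (illustrative only): for lanes a0 = -5 and b0 = 3 the
+// 16-bit maximum below stores 3 in lane 0 of dst.
+//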
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// epi versions of min/max
+// Computes the pairwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 > b0) ? a0 : b0
+//   r1 := (a1 > b1) ? a1 : b1
+//   r2 := (a2 > b2) ? a2 : b2
+//   r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 < b0) ? a0 : b0
+//   r1 := (a1 < b1) ? a1 : b1
+//   r2 := (a2 < b2) ? a2 : b2
+//   r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0)[31:16]
+//   r1 := (a1 * b1)[31:16]
+//   ...
+//   r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
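+//
+// Worked example for the horizontal subtraction below (illustrative only):
+// for a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}, the result is
+// {a0 - a1, a2 - a3, b0 - b1, b2 - b3}.
+//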
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsubq_f32( + vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), + vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); +#else + float32x4x2_t c = + vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Computes pairwise difference of each argument as a 16-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Subtract + return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); +} + +// Computes saturated pairwise sub of each argument as a 16-bit signed +// integer values a and b. +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Computes saturated pairwise difference of each argument as a 16-bit signed +// integer values a and b. 
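+//
+// Worked lane example (illustrative only): if a0 = -32768 (INT16_MIN) and
+// a1 = 1, the exact difference -32769 does not fit in 16 bits, so the
+// saturating subtraction below stores -32768.
+//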
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated subtract + return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); +#endif +} + +// Computes pairwise add of each argument as a 32-bit signed or unsigned integer +// values a and b. +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +} + +// Computes pairwise difference of each argument as a 32-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int64x2_t a = vreinterpretq_s64_m128i(_a); + int64x2_t b = vreinterpretq_s64_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|b0|b2] + // [a1|a2|b1|b3] + int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); + int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); + // Subtract + return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); +} + +// Kahan summation for accurate summation of floating-point numbers. +// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) +{ + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. + */ + if (imm & (1 << 4)) + _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? 
s : 0, + }; + return vreinterpretq_m128_f32(res); +} + +/* Compare operations */ + +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compares for greater than. +// +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compares for greater than or equal. +// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for less than or equal. +// +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. 
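+//
+// Illustrative sketch, not part of upstream sse2neon (`select_lt` is a
+// hypothetical helper): the packed compares return all-ones (0xFFFFFFFF) in
+// each true lane, so the result can drive a blend directly.
+//
+//   static inline __m128 select_lt(__m128 a, __m128 b, __m128 x, __m128 y)
+//   {
+//       // x where a < b, y elsewhere.
+//       return _mm_blendv_ps(y, x, _mm_cmplt_ps(a, b));
+//   }
+//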
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. +// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. 
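+//
+// Worked lane example (illustrative only): for lanes a0 = 7 and b0 = 7 the
+// comparison below stores 0xFFFF in lane 0; for a1 = 7 and b1 = 8 it stores
+// 0x0000 in lane 1.
+//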
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for lesser than. +// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. 
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + // ARMv7 lacks vcgtq_s64. + // This is based off of Clang's SSE2 polyfill: + // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) + + // Mask the sign bit out since we need a signed AND an unsigned comparison + // and it is ugly to try and split them. + int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); + int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); + int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); + // Check if a > b + int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi > b_hi + int64x2_t gt_hi = vshrq_n_s64(greater, 63); + // Copy lower mask to upper mask + // a_lo > b_lo + int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); + // Compare for equality + int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi == b_hi + int64x2_t eq_hi = vshrq_n_s64(equal, 63); + // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) + int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); + return vreinterpretq_m128i_s64(ret); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compares for unordered. 
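+//
+// Worked lane example for the ordered/unordered compares above (illustrative
+// only): if lane 0 of a is NaN, _mm_cmpord_ps stores 0x00000000 there while
+// _mm_cmpunord_ps stores 0xFFFFFFFF; lanes where both inputs are ordinary
+// numbers behave the other way around.
+//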
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important +// note!! The documentation on MSDN is incorrect! If either of the values is a +// NAN the docs say you will get a one, but in fact, it will return a zero!! +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. : +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. 
: +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. : +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_neq_b = vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; +} + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* Conversions */ + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. 
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int32_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int32_t f = (int32_t) floor(data);
+        int32_t c = (int32_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int32_t) floor(data);
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     m := j*32
+//     dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point elements, and store the results in
+// the upper 2 elements of dst.
+// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. 
+//
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
+// unsigned 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the eight signed 8-bit integers in the lower 64 bits to eight
+// signed 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four signed 8-bit integers in the lower 32 bits to four
+// signed 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 16 bits to two
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits to two signed
+// 64-bit integers.
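+//
+// Worked lane example for the widening converters above (illustrative only):
+// widening the byte 0x80 gives 0xFF80 through _mm_cvtepi8_epi16 (sign
+// extension of -128) but 0x0080 through _mm_cvtepu8_epi16 (zero extension of
+// 128).
+//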
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four unsigned 16-bit integers in the lower 64 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Converts the two unsigned 16-bit integers in the lower 32 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the two unsigned 32-bit integers in the lower 64 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Converts the two signed 32-bit integers in the lower 64 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values. +// +// r0 := (int) a0 +// r1 := (int) a1 +// r2 := (int) a2 +// r3 := (int) a3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Copy the lower 32-bit integer in a to dst. 
+// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Loads 128-bit value. : +// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. 
+// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Loads 128-bit value. : +// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[MAX:32] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) +// ENDFOR +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float) ((double *) &a)[0]; + float a1 = (float) ((double *) &a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. 
+// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
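+//
+// Note (illustrative only): like _mm_blendv_ps above, the double-precision
+// variant below keys off the sign bit of each mask lane, so an all-ones
+// compare result selects the corresponding lane of b.
+//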
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t res2; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
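// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. It shows the FPCR/FPSCR-based rounding-mode control above; only the
// rounding bits are emulated, so other MXCSR flags passed to _mm_setcsr are
// ignored by this polyfill.]
static inline void demo_truncating_section(void)
{
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    // ... code whose _MM_FROUND_CUR_DIRECTION rounding should truncate ...
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
}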
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + __m128 zero, neg_inf, pos_inf; + + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return (__m128){floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])}; + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), + ceilf(v_float[3])}; + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); + neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])); + pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), + ceilf(v_float[2]), ceilf(v_float[3])); + return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); + default: //_MM_FROUND_CUR_DIRECTION + return (__m128){roundf(v_float[0]), roundf(v_float[1]), + roundf(v_float[2]), roundf(v_float[3])}; + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); +#else + return vreinterpret_m64_s32( + vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( + _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
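// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. It exercises the rounding helpers above; note the ARMv7 fallback of
// _mm_round_ps works lane by lane in scalar code, unlike the AArch64 vrnd*
// path. _mm_storeu_ps is assumed to be defined earlier in the header.]
static inline void demo_round_modes(__m128 v, float out[8])
{
    __m128 up   = _mm_ceil_ps(v);                       // round toward +inf
    __m128 near = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    _mm_storeu_ps(out, up);
    _mm_storeu_ps(out + 4, near);
}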
+// +// dst[31:0] := CEIL(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// +// dst[31:0] := FLOOR(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +/* Miscellaneous Operations */ + +// Shifts the 8 signed 16-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// ... +// r7 := a7 >> count +// +// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (unlikely(c > 15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); +} + +// Shifts the 4 signed 32-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// r2 := a2 >> count +// r3 := a3 >> count +// +// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (unlikely(c > 31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); +} + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and +// saturates. +// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// ... +// r7 := UnsignedSaturate(a7) +// r8 := UnsignedSaturate(b0) +// r9 := UnsignedSaturate(b1) +// ... 
+// r15 := UnsignedSaturate(b7) +// +// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers +// and saturates. +// +// r0 := SignedSaturate(a0) +// r1 := SignedSaturate(a1) +// r2 := SignedSaturate(a2) +// r3 := SignedSaturate(a3) +// r4 := SignedSaturate(b0) +// r5 := SignedSaturate(b1) +// r6 := SignedSaturate(b2) +// r7 := SignedSaturate(b3) +// +// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// r2 := UnsignedSaturate(a2) +// r3 := UnsignedSaturate(a3) +// r4 := UnsignedSaturate(b0) +// r5 := UnsignedSaturate(b1) +// r6 := UnsignedSaturate(b2) +// r7 := UnsignedSaturate(b3) +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// ... +// r14 := a7 +// r15 := b7 +// +// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the +// lower 4 signed or unsigned 16-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// r4 := a2 +// r5 := b2 +// r6 := a3 +// r7 := b3 +// +// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the +// lower 2 signed or unsigned 32 - bit integers in b. 
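// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. It chains the saturating pack helpers above to narrow eight signed
// 32-bit values down to unsigned bytes in two steps.]
static inline __m128i demo_pack_s32_to_u8(__m128i lo, __m128i hi)
{
    __m128i words = _mm_packs_epi32(lo, hi); // 8 x signed 16-bit, saturated
    return _mm_packus_epi16(words, words);   // low 8 bytes carry the result
}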
+// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. 
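// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. A 2x2 transpose of double-precision rows using the pd unpack helpers
// above.]
static inline void demo_transpose2x2_pd(__m128d *r0, __m128d *r1)
{
    __m128d c0 = _mm_unpacklo_pd(*r0, *r1); // { r0[0], r1[0] }
    __m128d c1 = _mm_unpackhi_pd(*r0, *r1); // { r0[1], r1[1] }
    *r0 = c0;
    *r1 = c1;
}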
+// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... +// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
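// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. Interleaving two vectors of 16-bit samples (e.g. left/right audio
// channels) with the unpack helpers above.]
static inline void demo_interleave_s16(__m128i left, __m128i right, __m128i out[2])
{
    out[0] = _mm_unpacklo_epi16(left, right); // L0 R0 L1 R1 L2 R2 L3 R3
    out[1] = _mm_unpackhi_epi16(left, right); // L4 R4 L5 R5 L6 R6 L7 R7
}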
+// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. 
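// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. Reading back the result of the _mm_minpos_epu16 emulation above: the
// minimum sits in lane 0 and its index in lane 1 of the returned vector.]
static inline void demo_minpos(__m128i v, unsigned *min_val, unsigned *min_idx)
{
    uint16x8_t r = vreinterpretq_u16_m128i(_mm_minpos_epu16(v));
    *min_val = vgetq_lane_u16(r, 0);
    *min_idx = vgetq_lane_u16(r, 1);
}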
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. +// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ + }) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + __extension__({ \ + vreinterpret_m64_s16( \ + vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
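// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. Accumulating a population count over a word array with the
// _mm_popcnt_u32 polyfill above; on AArch64 it reduces to the compiler
// builtin when available, otherwise to vcnt_u8 plus pairwise adds.]
static inline int demo_count_set_bits(const uint32_t *words, int n)
{
    int total = 0;
    for (int k = 0; k < n; k++)
        total += _mm_popcnt_u32(words[k]);
    return total;
}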
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +/* Crypto Extensions */ + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
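// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. In-place transpose of a row-major 4x4 float matrix with the
// _MM_TRANSPOSE4_PS macro above; the unaligned load/store wrappers are
// assumed to be defined earlier in the header.]
static inline void demo_transpose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m + 0, r0);
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}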
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +#if !defined(__ARM_FEATURE_CRYPTO) +/* clang-format off */ +#define SSE2NEON_AES_DATA(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), 
w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = 
_mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. 
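// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. One middle round plus the final round of an AES encryption flow using
// the aesenc helpers this header provides (either the table-based fallback
// above or the AESE/AESMC path below); key expansion and the full round count
// are application-specific and omitted here.]
static inline __m128i demo_aes_two_rounds(__m128i state,
                                          __m128i round_key,
                                          __m128i last_key)
{
    state = _mm_aesenc_si128(state, round_key);   // ShiftRows/SubBytes/MixColumns + key
    return _mm_aesenclast_si128(state, last_key); // final round, no MixColumns
}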
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. 
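// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. Filling a buffer through the streaming-store wrappers above; without
// __builtin_nontemporal_store they degrade to ordinary NEON stores, and
// _mm_sfence is a full __sync_synchronize barrier.]
static inline void demo_stream_fill(float *dst, size_t n, __m128 value)
{
    for (size_t k = 0; k + 4 <= n; k += 4)
        _mm_stream_ps(dst + k, value);
    _mm_sfence(); // make the streamed stores globally visible
}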
+// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. 
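// [Editorial usage sketch -- not part of the upstream sse2neon.h added by this
// diff. One conventional way to CRC a byte buffer with the _mm_crc32_u8
// helper above (initial value and final inversion as used by CRC-32C); the
// hardware crc32cb path is taken only when __ARM_FEATURE_CRC32 is defined.]
static inline uint32_t demo_crc32c(const uint8_t *data, size_t len)
{
    uint32_t crc = 0xFFFFFFFFu;
    for (size_t k = 0; k < len; k++)
        crc = _mm_crc32_u8(crc, data[k]);
    return crc ^ 0xFFFFFFFFu;
}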
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/thirdparty/embree/common/simd/avx.h b/thirdparty/embree/common/simd/avx.h new file mode 100644 index 0000000000..d3100306ee --- /dev/null +++ b/thirdparty/embree/common/simd/avx.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "sse.h" + +#if defined(__AVX512VL__) +#include "vboolf8_avx512.h" +#include "vboold4_avx512.h" +#else +#include "vboolf8_avx.h" +#include "vboold4_avx.h" +#endif + +#if defined(__AVX2__) +#include "vint8_avx2.h" +#include "vuint8_avx2.h" +#if defined(__X86_64__) +#include "vllong4_avx2.h" +#endif +#else +#include "vint8_avx.h" +#include "vuint8_avx.h" +#endif +#include "vfloat8_avx.h" +#if defined(__X86_64__) +#include "vdouble4_avx.h" +#endif + +#if defined(__AVX512F__) +#include "avx512.h" +#endif + diff --git a/thirdparty/embree/common/simd/avx512.h b/thirdparty/embree/common/simd/avx512.h new file mode 100644 index 0000000000..d43bbacea1 --- /dev/null +++ b/thirdparty/embree/common/simd/avx512.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../math/constants.h" +#include "../sys/alloc.h" +#include "varying.h" + +#include "vboolf16_avx512.h" +#include "vint16_avx512.h" +#include "vuint16_avx512.h" +#include "vfloat16_avx512.h" + +#include "vboold8_avx512.h" +#include "vllong8_avx512.h" +#include "vdouble8_avx512.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Prefetching + //////////////////////////////////////////////////////////////////////////////// + +#define PFHINT_L1 0 +#define PFHINT_L2 1 +#define PFHINT_NT 2 + + template<const unsigned int mode> + __forceinline void prefetch(const void * __restrict__ const m) + { + if (mode == PFHINT_L1) + _mm_prefetch((const char*)m,_MM_HINT_T0); + else if (mode == PFHINT_L2) + _mm_prefetch((const char*)m,_MM_HINT_T1); + else if (mode == PFHINT_NT) + _mm_prefetch((const char*)m,_MM_HINT_NTA); + } +} diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h new file mode 100644 index 0000000000..195506b530 --- /dev/null +++ b/thirdparty/embree/common/simd/simd.h @@ -0,0 +1,110 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +/* include SSE wrapper classes */ +#if defined(__SSE__) +# include "sse.h" +#endif + +/* include AVX wrapper classes */ +#if defined(__AVX__) +# include "avx.h" +#endif + +/* include AVX512 wrapper classes */ +#if defined (__AVX512F__) +# include "avx512.h" +#endif + +namespace embree +{ + template <int N> + __forceinline vbool<N> isfinite(const vfloat<N>& v) + { + return (v >= vfloat<N>(-std::numeric_limits<float>::max())) + & (v <= vfloat<N>( 
std::numeric_limits<float>::max())); + } + + /* foreach unique */ + template<typename vbool, typename vint, typename Closure> + __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i); + } + } + + /* returns the next unique value i in vi and the corresponding valid_i mask */ + template<typename vbool, typename vint> + __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return i; + } + + /* foreach unique index */ + template<typename vbool, typename vint, typename Closure> + __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i, j); + } + } + + /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ + template<typename vbool, typename vint> + __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return j; + } + + template<typename Closure> + __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) + { + __aligned(64) int U[2*VSIZEX]; + __aligned(64) int V[2*VSIZEX]; + int index = 0; + for (int y=y0; y<y1; y++) { + const bool lasty = y+1>=y1; + const vintx vy = y; + for (int x=x0; x<x1; ) { //x+=VSIZEX) { + const bool lastx = x+VSIZEX >= x1; + vintx vx = x+vintx(step); + vintx::storeu(&U[index], vx); + vintx::storeu(&V[index], vy); + const int dx = min(x1-x,VSIZEX); + index += dx; + x += dx; + if (index >= VSIZEX || (lastx && lasty)) { + const vboolx valid = vintx(step) < vintx(index); + closure(valid, vintx::load(U), vintx::load(V)); + x-= max(0, index-VSIZEX); + index = 0; + } + } + } + } +} diff --git a/thirdparty/embree/common/simd/sse.cpp b/thirdparty/embree/common/simd/sse.cpp new file mode 100644 index 0000000000..535d6943d8 --- /dev/null +++ b/thirdparty/embree/common/simd/sse.cpp @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sse.h" + +namespace embree +{ + const __m128 mm_lookupmask_ps[16] = { + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) + }; + + const __m128d 
mm_lookupmask_pd[4] = { + _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) + }; + +} diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h new file mode 100644 index 0000000000..1465fb4fb0 --- /dev/null +++ b/thirdparty/embree/common/simd/sse.h @@ -0,0 +1,35 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../sys/alloc.h" +#include "../math/constants.h" +#include "varying.h" + +namespace embree +{ +#if defined(__SSE4_1__) + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_blendv_ps(f,t,mask); + } +#else + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); + } +#endif + + extern const __m128 mm_lookupmask_ps[16]; + extern const __m128d mm_lookupmask_pd[4]; +} + +#if defined(__AVX512VL__) +#include "vboolf4_avx512.h" +#else +#include "vboolf4_sse2.h" +#endif +#include "vint4_sse2.h" +#include "vuint4_sse2.h" +#include "vfloat4_sse2.h" diff --git a/thirdparty/embree/common/simd/varying.h b/thirdparty/embree/common/simd/varying.h new file mode 100644 index 0000000000..9b98d326be --- /dev/null +++ b/thirdparty/embree/common/simd/varying.h @@ -0,0 +1,145 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +namespace embree +{ + /* Varying numeric types */ + template<int N> + struct vfloat_impl + { + union { float f[N]; int i[N]; }; + __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template<int N> + struct vdouble_impl + { + union { double f[N]; long long i[N]; }; + __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template<int N> + struct vint_impl + { + int i[N]; + __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template<int N> + struct vuint_impl + { + unsigned int i[N]; + __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template<int N> + struct vllong_impl + { + long long i[N]; + __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + /* Varying bool types */ + template<int N> struct vboolf_impl { int i[N]; }; // for float/int + template<int N> struct vboold_impl { long long i[N]; }; // for double/long long + + /* Varying size constants */ +#if defined(__AVX512VL__) // SKX + const int VSIZEX = 8; // default size + const int VSIZEL = 16; // large size +#elif defined(__AVX__) + const int VSIZEX = 8; + const int VSIZEL = 8; +#else + const int VSIZEX = 4; + const int VSIZEL = 4; +#endif + + template<int N> + struct vtypes { + using vbool = vboolf_impl<N>; + using vboolf = 
vboolf_impl<N>; + using vboold = vboold_impl<N>; + using vint = vint_impl<N>; + using vuint = vuint_impl<N>; + using vllong = vllong_impl<N>; + using vfloat = vfloat_impl<N>; + using vdouble = vdouble_impl<N>; + }; + + template<> + struct vtypes<1> { + using vbool = bool; + using vboolf = bool; + using vboold = bool; + using vint = int; + using vuint = unsigned int; + using vllong = long long; + using vfloat = float; + using vdouble = double; + }; + + /* Aliases to default types */ + template<int N> using vbool = typename vtypes<N>::vbool; + template<int N> using vboolf = typename vtypes<N>::vboolf; + template<int N> using vboold = typename vtypes<N>::vboold; + template<int N> using vint = typename vtypes<N>::vint; + template<int N> using vuint = typename vtypes<N>::vuint; + template<int N> using vllong = typename vtypes<N>::vllong; + template<int N> using vreal = typename vtypes<N>::vfloat; + template<int N> using vfloat = typename vtypes<N>::vfloat; + template<int N> using vdouble = typename vtypes<N>::vdouble; + + /* 4-wide shortcuts */ + typedef vfloat<4> vfloat4; + typedef vdouble<4> vdouble4; + typedef vreal<4> vreal4; + typedef vint<4> vint4; + typedef vuint<4> vuint4; + typedef vllong<4> vllong4; + typedef vbool<4> vbool4; + typedef vboolf<4> vboolf4; + typedef vboold<4> vboold4; + + /* 8-wide shortcuts */ + typedef vfloat<8> vfloat8; + typedef vdouble<8> vdouble8; + typedef vreal<8> vreal8; + typedef vint<8> vint8; + typedef vuint<8> vuint8; + typedef vllong<8> vllong8; + typedef vbool<8> vbool8; + typedef vboolf<8> vboolf8; + typedef vboold<8> vboold8; + + /* 16-wide shortcuts */ + typedef vfloat<16> vfloat16; + typedef vdouble<16> vdouble16; + typedef vreal<16> vreal16; + typedef vint<16> vint16; + typedef vuint<16> vuint16; + typedef vllong<16> vllong16; + typedef vbool<16> vbool16; + typedef vboolf<16> vboolf16; + typedef vboold<16> vboold16; + + /* Default shortcuts */ + typedef vfloat<VSIZEX> vfloatx; + typedef vdouble<VSIZEX> vdoublex; + typedef vreal<VSIZEX> vrealx; + typedef vint<VSIZEX> vintx; + typedef vuint<VSIZEX> vuintx; + typedef vllong<VSIZEX> vllongx; + typedef vbool<VSIZEX> vboolx; + typedef vboolf<VSIZEX> vboolfx; + typedef vboold<VSIZEX> vbooldx; +} diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h new file mode 100644 index 0000000000..7db0d1c5c1 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold4_avx.h @@ -0,0 +1,169 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX bool type for 64bit data types*/ + template<> + struct vboold<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + struct { __m128d vl,vh; }; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& a) { v = a.v; } + __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } + + __forceinline vboold(__m256d a) : v(a) {} + __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} + + __forceinline operator 
const __m256() const { return _mm256_castpd_ps(v); } + __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } + __forceinline operator const __m256d() const { return v; } + + __forceinline vboold(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi64x(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); +#else + vl = mm_lookupmask_pd[a & 0x3]; + vh = mm_lookupmask_pd[a >> 2]; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} + __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } + + __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { + return _mm256_blendv_pd(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } + __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } + + +#if defined(__AVX2__) + 
template<int i0, int i1, int i2, int i3> + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i> + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } + + __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } + __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } + + __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } + __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboold4_avx512.h b/thirdparty/embree/common/simd/vboold4_avx512.h new file mode 100644 index 0000000000..ceaad7bba5 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold4_avx512.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboold<4> + { + typedef vboold4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& t) { v = t.v; } + __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8 &t) 
{ v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xf : 0x0; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x0) {} + __forceinline vboold(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold4& a) { return a.v == 0xf; } + __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold4& valid, const vboold4& b) { return 
all((!valid) | b); } + __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboold8_avx512.h b/thirdparty/embree/common/simd/vboold8_avx512.h new file mode 100644 index 0000000000..66d2054872 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold8_avx512.h @@ -0,0 +1,151 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboold<8> + { + typedef vboold8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold8& t) { v = t.v; } + __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8& t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 
0xff : 0x00; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x00) {} + __forceinline vboold(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } + __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } + __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + + __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } + __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } + __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold8& a) { return a.v == 0xff; } + __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } + __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } + __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } + + __forceinline size_t 
movemask(const vboold8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h new file mode 100644 index 0000000000..19841dcea8 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf16_avx512.h @@ -0,0 +1,153 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 bool type */ + template<> + struct vboolf<16> + { + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + __mmask16 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf16& t) { v = t.v; } + __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask16& t) { v = t; } + __forceinline operator __mmask16() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 
0xFFFF : 0x0000; } + __forceinline vboolf(int t) { v = (__mmask16)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m512i mask32() const { + return _mm512_movm_epi32(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0000) {} + __forceinline vboolf(TrueTy) : v(0xffff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 16); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } + __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } + __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } + + __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } + __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } + __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } + __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { + return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } + __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } + __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } + + __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } + __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } + __forceinline int none(const vboolf16& valid, const vboolf16& b) { 
return none(valid & b); } + + __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Convertion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } + __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } + __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) + { + cout << "<"; + for (size_t i=0; i<16; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf4_avx512.h b/thirdparty/embree/common/simd/vboolf4_avx512.h new file mode 100644 index 0000000000..e65f66b025 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf4_avx512.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboolf<4> + { + typedef vboolf4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& t) { v = t.v; } + __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 
0xf : 0x0; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0) {} + __forceinline vboolf(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf4& a) { return a.v == 0xf; } + __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf4& valid, const 
vboolf4& b) { return all((!valid) | b); } + __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h new file mode 100644 index 0000000000..fa84b1b6ee --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf4_sse2.h @@ -0,0 +1,189 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE bool type */ + template<> + struct vboolf<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& other) { v = other.v; } + __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } + + __forceinline vboolf(__m128 input) : v(input) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator const __m128i() const { return _mm_castps_si128(v); } + __forceinline operator const __m128d() const { return _mm_castps_pd(v); } + + __forceinline vboolf(bool a) + : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b) + : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) 
{} + __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_castps_si128(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + + __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { +#if defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 
shuffle(const vboolf4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return shuffle<i0,i0,i0,i0>(v); + } + +#if defined(__SSE3__) + template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } +#endif + +#if defined(__SSE4_1__) + template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } + template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } + __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } + + __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } + __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } + __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } + + __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } +#if defined(__SSE4_2__) + __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } +#else + __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } + __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h new file mode 100644 index 0000000000..ba77cc3c5e --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf8_avx.h @@ -0,0 
+1,202 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX bool type */ + template<> + struct vboolf<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256 v; + struct { __m128 vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& a) { v = a.v; } + __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } + + __forceinline vboolf(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } + __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } + + __forceinline vboolf(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi32(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); +#else + vl = mm_lookupmask_ps[a & 0xF]; + vh = mm_lookupmask_ps[a >> 4]; +#endif + } + + __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} + + __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} + __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_castps_si256(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } + + __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) { + return _mm256_blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } + + template<int i> + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1> + __forceinline vboolf8 shuffle4(const vboolf8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } + template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + + __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } + + __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } + __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf8_avx512.h b/thirdparty/embree/common/simd/vboolf8_avx512.h new file mode 100644 index 0000000000..73ff5666e1 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf8_avx512.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboolf<8> + { + typedef vboolf8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& t) { v = t.v; } + __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 
0xff : 0x00; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) + : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x00) {} + __forceinline vboolf(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf8& a) { return a.v == 0xff; } + __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf8& a) { return 
_mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h new file mode 100644 index 0000000000..55326de7dd --- /dev/null +++ b/thirdparty/embree/common/simd/vdouble4_avx.h @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX 64-bit double type */ + template<> + struct vdouble<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + double i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble4& t) { v = t.v; } + __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m256d& t) { v = t; } + __forceinline operator __m256d() const { return v; } + + __forceinline vdouble(double i) { + v = _mm256_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm256_set_pd(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} + __forceinline vdouble(StepTy) : 
v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { + _mm256_stream_pd(ptr, a); + } + + static __forceinline vdouble4 loadu(const double* addr) { + return _mm256_loadu_pd(addr); + } + + static __forceinline vdouble4 load(const vdouble4* addr) { + return _mm256_load_pd((double*)addr); + } + + static __forceinline vdouble4 load(const double* addr) { + return _mm256_load_pd(addr); + } + + static __forceinline void store(double* ptr, const vdouble4& v) { + _mm256_store_pd(ptr, v); + } + + static __forceinline void storeu(double* ptr, const vdouble4& v) { + _mm256_storeu_pd(ptr, v); + } + + static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } + __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } +#endif + + __forceinline vdouble4 operator +(const vdouble4& a) { return a; } + __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } + __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } + __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } + + __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } + __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } + __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } + + __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } + __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } + __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } + + __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); } + __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } + __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } + + __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } + __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } 
+ __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } + + __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } + __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } + __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } + + __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } + __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } + __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } + + __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } + __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); } + __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__FMA__) + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } +#else + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } + __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } + + __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } + __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } + + __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } + __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } + + __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } + __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } + + __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } + __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } + __forceinline 
vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#endif + + __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } + __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } + + __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } + __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } + + __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } + __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } + + __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } + __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } + + __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } + __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } + + __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } + __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } + + __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } + __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } + __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } + __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } + __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } + __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return 
_mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); } +#endif + + __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { +#if defined(__AVX512VL__) + return _mm256_mask_blend_pd(m, f, t); +#else + return _mm256_blendv_pd(f, t, m); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vdouble4 shuffle(const vdouble4& v) { + return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template<int i> + __forceinline vdouble4 shuffle(const vdouble4& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1> + __forceinline vdouble4 shuffle2(const vdouble4& v) { + return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); + } + + __forceinline double toScalar(const vdouble4& v) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); } + __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } + __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); } + + 
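+  // The horizontal reductions above use a log-step pattern: vreduce_*2 combines
+  // neighbouring elements within each 128-bit lane (shuffle<1,0>), and vreduce_*
+  // then folds the two lanes together (shuffle2<1,0>); for example,
+  // reduce_add(vdouble4(1.0, 2.0, 3.0, 4.0)) yields 10.0.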
//////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vdouble8_avx512.h b/thirdparty/embree/common/simd/vdouble8_avx512.h new file mode 100644 index 0000000000..98d21bfe4a --- /dev/null +++ b/thirdparty/embree/common/simd/vdouble8_avx512.h @@ -0,0 +1,351 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 64-bit double type */ + template<> + struct vdouble<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512d v; + double i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble8& t) { v = t.v; } + __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m512d& t) { v = t; } + __forceinline operator __m512d() const { return v; } + __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } + + __forceinline vdouble(double i) { + v = _mm512_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm512_set4_pd(d,c,b,a); + } + + __forceinline vdouble(double a0, double a1, double a2, double a3, + double a4, double a5, double a6, double a7) + { + v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { + _mm512_stream_pd((double*)ptr, a); + } + + static __forceinline vdouble8 loadu(const void* addr) { + return _mm512_loadu_pd((double*)addr); + } + + static __forceinline vdouble8 load(const vdouble8* addr) { + return _mm512_load_pd((double*)addr); + } + + static __forceinline vdouble8 load(const double* addr) { + return _mm512_load_pd(addr); + } + + static __forceinline void 
store(void* ptr, const vdouble8& v) { + _mm512_store_pd(ptr, v); + } + + static __forceinline void storeu(void* ptr, const vdouble8& v) { + _mm512_storeu_pd(ptr, v); + } + + static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { + _mm512_mask_storeu_pd(ptr, mask, f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { + _mm512_mask_store_pd(addr, mask, v2); + } + + static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { + return _mm512_mask_compress_pd(a, mask, b); + } + + static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); } + __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } + + __forceinline vdouble8 operator +(const vdouble8& a) { return a; } + __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } + __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); } + __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } + + __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } + __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } + __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; } + + __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); } + __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); } + __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; } + + __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); } + __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); } + __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; } + + __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); } + __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); } + __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; } + + __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); } + __forceinline vdouble8 operator ^(const 
vdouble8& a, double b) { return a ^ vdouble8(b); } + __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; } + + __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } + + __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); } + __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); } + __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); } + + __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); } + __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); } + __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); } + + __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); } + __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); } + + __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); } + __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } + __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } + __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } + __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; } + __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; } + + __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; } + __forceinline vdouble8& operator 
-=(vdouble8& a, double b) { return a = a - b; } + + __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; } + __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; } + + __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; } + __forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; } + + __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; } + __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; } + + __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; } + __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); } + __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; } + + __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); } + __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; } + + __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); } + __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; } + + __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); } + __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; } + + __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); } + __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; } + + __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); } + __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; } + + __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + + 
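+  // The masked variants below evaluate the comparison only for lanes enabled in
+  // `mask`; lanes whose mask bit is clear always report false in the returned vboold8.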
__forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { + return _mm512_mask_or_pd(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template<int i> + __forceinline vdouble8 shuffle(const vdouble8& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1> + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template<int i> + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return shuffle4<i, i>(v); + } + + template<int i> + __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) { + return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i)); + } + + __forceinline double toScalar(const vdouble8& v) { + return _mm_cvtsd_f64(_mm512_castpd512_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); } + __forceinline double reduce_min(const vdouble8& v) { return 
toScalar(vreduce_min(v)); } + __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { + return _mm512_permutexvar_pd(index, v); + } + + __forceinline vdouble8 reverse(const vdouble8& a) { + return permute(a, vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h new file mode 100644 index 0000000000..9f1e2459c4 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat16_avx512.h @@ -0,0 +1,615 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 float type */ + template<> + struct vfloat<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512 v; + float f[16]; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat16& t) { v = t; } + __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } + + __forceinline vfloat(const __m512& t) { v = t; } + __forceinline operator __m512() const { return v; } + __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } + __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } + + __forceinline vfloat(float f) { + v = _mm512_set1_ps(f); + } + + __forceinline vfloat(float a, float b, float c, float d) { + v = _mm512_set4_ps(a, b, c, d); + } + + __forceinline vfloat(const vfloat4& i) { + v = _mm512_broadcast_f32x4(i); + } + + __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { + v = _mm512_castps128_ps512(a); + v = _mm512_insertf32x4(v, b, 1); + v = _mm512_insertf32x4(v, c, 2); + v = _mm512_insertf32x4(v, d, 3); + } + + __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { + v = _mm512_broadcast_f32x4(a); + v = _mm512_mask_broadcast_f32x4(v,mask,b); + } + + __forceinline vfloat(const vfloat8& i) { + v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); + } + + __forceinline vfloat(const vfloat8& a, const vfloat8& b) { + v = _mm512_castps256_ps512(a); +#if defined(__AVX512DQ__) + v = _mm512_insertf32x8(v, b, 1); +#else + v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), 
_mm256_castps_pd(b), 1)); +#endif + } + + /* WARNING: due to f64x4 the mask is considered as an 8bit mask */ + /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { + __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); + aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); + v = _mm512_castpd_ps(aa); + }*/ + + __forceinline explicit vfloat(const vint16& a) { + v = _mm512_cvtepi32_ps(a); + } + + __forceinline explicit vfloat(const vuint16& a) { + v = _mm512_cvtepu32_ps(a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } + static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } + + static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { + _mm512_stream_ps((float*)ptr,a); + } + + static __forceinline vfloat16 broadcast(const float* f) { + return _mm512_set1_ps(*f); + } + + template<int scale = 4> + static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { + return _mm512_i32gather_ps(index, ptr, scale); + } + + template<int scale = 4> + static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { + vfloat16 r = zero; + return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); + } + + template<int scale = 4> + static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { + _mm512_i32scatter_ps(ptr, index, v, scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { + _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float& operator [](size_t index) { assert(index < 
16); return f[index]; } + __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } + __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } + __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } + + __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } + __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } + + __forceinline vfloat16 operator +(const vfloat16& a) { return a; } + __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } + + __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } + __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } + + __forceinline vfloat16 rcp(const vfloat16& a) { + const vfloat16 r = _mm512_rcp14_ps(a); + return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); + } + + __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } + __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } + + __forceinline vfloat16 rsqrt(const vfloat16& a) + { + const vfloat16 r = _mm512_rsqrt14_ps(a); + return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, + _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } + __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } + __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } + + __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } + __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } + __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } + + __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } + __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } + __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } + + __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); } + __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } + __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } + + __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } + __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } + __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); + } + + __forceinline vfloat16 
min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); } + __forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a,vfloat16(b)); } + __forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); } + + __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); } + __forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a,vfloat16(b)); } + __forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); } + + __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) { + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_min_epi32(ai,bi); + return _mm512_castsi512_ps(ci); + } + + __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) { + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_max_epi32(ai,bi); + return _mm512_castsi512_ps(ci); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } + __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } + __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } + __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } + + __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } + __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } + + __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } + __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } + + __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } + __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } + __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } + + __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } + __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; } + + __forceinline vboolf16 operator < (const vfloat16& a, const 
vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); } + __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; } + + __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); } + __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; } + + __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); } + __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; } + + __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); } + __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; } + + __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { + return _mm512_mask_blend_ps(s, f, t); + } + + __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { + return madd(t,b-a,a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 floor(const vfloat16& a) { + return _mm512_floor_ps(a); + } + __forceinline vfloat16 ceil (const vfloat16& a) { + return _mm512_ceil_ps(a); + } + __forceinline vfloat16 round (const vfloat16& a) { + return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + } + __forceinline 
vint16 floori (const vfloat16& a) { + return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } + __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } + + template<int i> + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i> + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 permute(vfloat16 v, __m512i index) { + return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); + } + + __forceinline vfloat16 reverse(const vfloat16& v) { + return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + } + + template<int i> + __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + template<int i> + __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + __forceinline vfloat16 shift_left_1(const vfloat16& a) { + vfloat16 z = zero; + return mask_align_shift_right<15>(0xfffe,z,a,a); + } + + __forceinline vfloat16 shift_right_1(const vfloat16& x) { + return align_shift_right<1>(zero,x); + } + + __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } + + + template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } + + template<int N, int i> + vfloat<N> extractN(const vfloat16& v); + + template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } + template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } + template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } + + template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { 
return _mm512_extractf32x8_ps(v, 1); } + + template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + + template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } + template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + vfloat16 a0a2_b0b2 = unpacklo(r0, r2); + vfloat16 c0c2_d0d2 = unpackhi(r0, r2); + vfloat16 a1a3_b1b3 = unpacklo(r1, r3); + vfloat16 c1c3_d1d3 = unpackhi(r1, r3); + + c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); + c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); + c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); + c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, + const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, + const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), + c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; + transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); + + vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; + transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); + + c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, + const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, + const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), + vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), 
vfloat16(r7, r15), + c0, c1, c2, c3, c4, c5, c6, c7); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } + __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } + + __forceinline size_t select_min(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_max(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(pos_inf)); + const vbool16 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(neg_inf)); + const vbool16 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? 
valid_max : valid)); + } + + __forceinline vfloat16 prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + __forceinline vfloat16 prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<16-1>(v,z)); + v = min(v,align_shift_right<16-2>(v,z)); + v = min(v,align_shift_right<16-4>(v,z)); + v = min(v,align_shift_right<16-8>(v,z)); + return v; + } + + __forceinline vfloat16 prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<16-1>(v,z)); + v = max(v,align_shift_right<16-2>(v,z)); + v = max(v,align_shift_right<16-4>(v,z)); + v = max(v,align_shift_right<16-8>(v,z)); + return v; + } + + + __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<1>(z,v)); + v = min(v,align_shift_right<2>(z,v)); + v = min(v,align_shift_right<4>(z,v)); + v = min(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<1>(z,v)); + v = max(v,align_shift_right<2>(z,v)); + v = max(v,align_shift_right<4>(z,v)); + v = max(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 rcp_safe(const vfloat16& a) { + return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h new file mode 100644 index 0000000000..5215bf9730 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat4_sse2.h @@ -0,0 +1,722 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE float type */ + template<> + struct vfloat<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; float f[4]; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat4& other) { v = other.v; } + __forceinline vfloat4& operator =(const 
vfloat4& other) { v = other.v; return *this; } + + __forceinline vfloat(__m128 a) : v(a) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator __m128&() { return v; } + + __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} + + __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} + __forceinline explicit vfloat(const vuint4& x) { + const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); + const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128 af = _mm_cvtepi32_ps(a); + const __m128 bf = _mm_castsi128_ps(b); + v = _mm_add_ps(af,bf); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } + static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } + + static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } +#else + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { 
storeu(ptr,select(mask,v,loadu(ptr))); } +#endif + +#if defined(__AVX__) + static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } +#else + static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } +#endif + + static __forceinline vfloat4 load_nt (const float* ptr) { +#if defined (__SSE4_1__) + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); +#else + return _mm_load_ps(ptr); +#endif + } + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const char* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const char* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const unsigned char* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const unsigned char* ptr) { + //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const short* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const short* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + + static __forceinline vfloat4 load(const unsigned short* ptr) { + return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); + } + + static __forceinline void store_nt(void* ptr, const vfloat4& v) + { +#if defined (__SSE4_1__) + _mm_stream_ps((float*)ptr,v); +#else + _mm_store_ps((float*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_ps(ptr, index, scale); +#else + return vfloat4( + *(float*)(((char*)ptr)+scale*index[0]), + *(float*)(((char*)ptr)+scale*index[1]), + *(float*)(((char*)ptr)+scale*index[2]), + *(float*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { + vfloat4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_ps((float*)ptr, index, v, scale); +#else + *(float*)(((char*)ptr)+scale*index[0]) = v[0]; + *(float*)(((char*)ptr)+scale*index[1]) = v[1]; + *(float*)(((char*)ptr)+scale*index[2]) = v[2]; + *(float*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); +#else + if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1]; + if 
(likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } + + friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_ps(m, f, t); +#elif defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Load/Store + //////////////////////////////////////////////////////////////////////////////// + + template<> struct mem<vfloat4> + { + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return vfloat4::load (mask,ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return vfloat4::loadu(mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::store (mask,ptr,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::storeu(mask,ptr,v); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } + __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } + __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } + + __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } + __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } + + __forceinline vfloat4 operator +(const vfloat4& a) { return a; } + __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#if defined(__AVX512VL__) + __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } +#else + __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } +#endif + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline vfloat4 rcp(const vfloat4& a) + { +#if defined(__AVX512VL__) + const vfloat4 r = _mm_rcp14_ps(a); +#else + const vfloat4 r = _mm_rcp_ps(a); +#endif + +#if defined(__AVX2__) + return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); +#endif + } + __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } + 
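// Note: rcp() above and rsqrt() below refine the low-precision hardware estimates (_mm_rcp_ps / _mm_rsqrt_ps, about 12 bits) with one Newton-Raphson step, r' = r*(2 - a*r) for rcp and r' = r*(1.5 - 0.5*a*r*r) for rsqrt, which roughly doubles the number of correct bits. + 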
__forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } + + __forceinline vfloat4 rsqrt(const vfloat4& a) + { +#if defined(__AVX512VL__) + vfloat4 r = _mm_rsqrt14_ps(a); +#else + vfloat4 r = _mm_rsqrt_ps(a); +#endif + +#if defined(__ARM_NEON) + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#elif defined(__AVX2__) + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#else + r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + return r; + } + + __forceinline vboolf4 isnan(const vfloat4& a) { + const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); +#if defined(__AVX512VL__) + return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); +#else + return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } + __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } + __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } + + __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } + __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } + __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } + + __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } + __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } + __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } + + __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } + __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } + __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } + + __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } + __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } + + __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } + __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } + __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } + + __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } + __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } + __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } + +#if defined(__SSE4_1__) + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + const 
vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } +#else + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + return min(a,b); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + return max(a,b); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) || defined(__ARM_NEON) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } + __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } + + __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } + __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } + + __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } + __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } + + __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } + __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const 
vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } +#endif + + __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } + __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } + + __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } + __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } + + __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } + __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } + + __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } + __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } + + __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } + __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } + + __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } + __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } + + __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } + __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } + __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } + __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } + __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } + __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, 
_MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) + { +#if defined(__SSE4_1__) + return _mm_blend_ps(f, t, mask); +#else + return select(vboolf4(mask), t, f); +#endif + } + + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid(const vfloat4& v) { + return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite(const vfloat4& a) { + return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { + return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } +#elif defined (__SSE4_1__) + __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } + __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } +#endif + __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } + + __forceinline vint4 floori(const vfloat4& a) { +#if defined(__SSE4_1__) + return vint4(floor(a)); +#else + return vint4(a-vfloat4(0.5f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } + + template<int i0, int i1, 
int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + +#if defined(__SSE3__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } +#endif + + template<int i> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return shuffle<i,i,i,i>(v); + } + + template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } + +#if defined (__SSE4_1__) + template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } + template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } +#else + template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; } + template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; } +#endif + + __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } + + __forceinline vfloat4 shift_right_1(const vfloat4& x) { + return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); + } + +#if defined (__AVX2__) + __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { + return _mm_permutevar_ps(a,index); + } + + __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } + +#endif + +#if defined(__AVX512VL__) + template<int i> + __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { + return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting Network + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 sort_ascending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = min(a0,b0); + const vfloat4 d0 = max(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = min(a1,b1); + const vfloat4 d1 = max(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = min(a2,b2); + const vfloat4 d2 = max(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vfloat4 sort_descending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = max(a0,b0); + const vfloat4 d0 = min(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = max(a1,b1); + 
const vfloat4 d1 = min(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = max(a2,b2); + const vfloat4 d2 = min(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } + + __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(pos_inf)); + const vbool4 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(neg_inf)); + const vbool4 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? 
valid_max : valid)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float dot(const vfloat4& a, const vfloat4& b) { + return reduce_add(a*b); + } + + __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b) + { + const vfloat4 a0 = a; + const vfloat4 b0 = shuffle<1,2,0,3>(b); + const vfloat4 a1 = shuffle<1,2,0,3>(a); + const vfloat4 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } + +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h new file mode 100644 index 0000000000..13446454e8 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat8_avx.h @@ -0,0 +1,758 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX float type */ + template<> + struct vfloat<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { __m256 v; float f[8]; int i[8]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat8& other) { v = other.v; } + __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } + + __forceinline vfloat(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator __m256&() { return v; } + + __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + + __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {} + __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} + __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : 
v(_mm256_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat8 broadcast(const void* a) { + return _mm256_broadcast_ss((float*)a); + } + + static __forceinline vfloat8 load(const char* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const unsigned char* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const short* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } + static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } +#else + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } +#endif + +#if defined(__AVX2__) + static __forceinline vfloat8 load_nt(void* ptr) { + return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); + } +#endif + + static __forceinline void store_nt(void* ptr, const vfloat8& v) { + _mm256_stream_ps((float*)ptr,v); + } + + template<int scale = 4> + static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { +#if defined(__AVX2__) + return _mm256_i32gather_ps(ptr, index ,scale); +#else + return vfloat8( + *(float*)(((char*)ptr)+scale*index[0]), + *(float*)(((char*)ptr)+scale*index[1]), + *(float*)(((char*)ptr)+scale*index[2]), + *(float*)(((char*)ptr)+scale*index[3]), + 
*(float*)(((char*)ptr)+scale*index[4]), + *(float*)(((char*)ptr)+scale*index[5]), + *(float*)(((char*)ptr)+scale*index[6]), + *(float*)(((char*)ptr)+scale*index[7])); +#endif + } + + template<int scale = 4> + static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { + vfloat8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]); + return r; + #endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); +#else + *(float*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(float*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(float*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(float*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(float*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(float*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(float*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(float*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); } + + __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); } + __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } + + __forceinline vfloat8 operator +(const vfloat8& a) { return a; } + __forceinline vfloat8 operator -(const vfloat8& a) { + const __m256 mask = 
_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); + } + __forceinline vfloat8 abs(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); + return _mm256_and_ps(a, mask); + } + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } + __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } + + + static __forceinline vfloat8 rcp(const vfloat8& a) + { +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rcp14_ps(a); +#else + const vfloat8 r = _mm256_rcp_ps(a); +#endif + +#if defined(__AVX2__) + return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); +#else + return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); +#endif + } + __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } + __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } + + static __forceinline vfloat8 rsqrt(const vfloat8& a) + { +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rsqrt14_ps(a); +#else + const vfloat8 r = _mm256_rsqrt_ps(a); +#endif + +#if defined(__AVX2__) + return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#else + return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } + __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } + __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; } + + __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } + __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } + __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } + + __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } + __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } + __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } + + __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } + __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } + __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } + + __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } + __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } + + __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } + __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } + __forceinline 
vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } + + __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } + __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } + __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } + + /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ +#if defined(__AVX2__) + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + +#else + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + return asFloat(min(asInt(a),asInt(b))); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + return asFloat(max(asInt(a),asInt(b))); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } +#else + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;} + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } + __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } + + __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } + __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } + + __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } + 
__forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } + + __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } + __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_mask_blend_ps(m, f, t); + } +#else + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } +#endif + + template<int mask> + __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { + return _mm256_blend_ps(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } + __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } + + __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } + __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } + + __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } + __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } + + __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } + __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } + + __forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); } + __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } + + __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } + __forceinline vboolf8 operator 
<=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; } + + __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } + __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } + __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } + __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } + __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } + __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } +#endif + + __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid (const vfloat8& v) { + return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); + } + + __forceinline bool is_finite (const vfloat8& a) { + return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { + return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling 
Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } + + template<int i> + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1> + __forceinline vfloat8 shuffle4(const vfloat8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } + template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } + template<size_t i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); } + + __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } + +#if defined (__AVX2__) + static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { + return _mm256_permutevar8x32_ps(a, index); + } +#endif + +#if defined(__AVX512VL__) + template<int i> + static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { + return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); + } +#endif + +#if defined (__AVX_I__) + template<const int mode> + static __forceinline vint4 convert_to_hf16(const vfloat8& a) { + return _mm256_cvtps_ph(a, mode); + } + + static __forceinline vfloat8 convert_from_hf16(const vint4& a) { + return _mm256_cvtph_ps(a); + } +#endif + +#if defined(__AVX512VL__) + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + return align_shift_right<1>(zero,x); + } +#else + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + const vfloat8 t0 = shuffle<1,2,3,0>(x); + const vfloat8 t1 = shuffle4<1,0>(t0); + return _mm256_blend_ps(t0,t1,0x88); + } +#endif + + __forceinline vint8 floori(const vfloat8& a) { + return vint8(floor(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 
h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) + { + vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); + vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7); + c0 = shuffle4<0,2>(h0,h4); + c1 = shuffle4<0,2>(h1,h5); + c2 = shuffle4<0,2>(h2,h6); + c3 = shuffle4<0,2>(h3,h7); + c4 = shuffle4<1,3>(h0,h4); + c5 = shuffle4<1,3>(h1,h5); + c6 = shuffle4<1,3>(h2,h6); + c7 = shuffle4<1,3>(h3,h7); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(pos_inf)); + const vbool8 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? 
valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(neg_inf)); + const vbool8 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators (pairs of Vec3fa's) + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + // return vreduce_add4(a*b); + //} + + __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + return _mm256_dp_ps(a,b,0x7F); + } + + __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) + { + const vfloat8 a0 = a; + const vfloat8 b0 = shuffle<1,2,0,3>(b); + const vfloat8 a1 = shuffle<1,2,0,3>(a); + const vfloat8 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } + //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } + //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } + //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } + __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } + //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } + //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } + //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } + //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } + + //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { + // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + //} + + //////////////////////////////////////////////////////////////////////////////// + /// In Register Sorting + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 sort_ascending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = min(a0,b0); + const vfloat8 d0 = max(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = min(a1,b1); + const vfloat8 d1 = max(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = min(a2,b2); + const vfloat8 d2 = max(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = min(a3,b3); + const vfloat8 d3 = max(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = min(a4,b4); + const vfloat8 d4 = max(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = min(a5,b5); + const vfloat8 d5 = max(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vfloat8 sort_descending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = max(a0,b0); + const vfloat8 d0 = min(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = max(a1,b1); + 
const vfloat8 d1 = min(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = max(a2,b2); + const vfloat8 d2 = min(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = max(a3,b3); + const vfloat8 d3 = min(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = max(a4,b4); + const vfloat8 d4 = min(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = max(a5,b5); + const vfloat8 d5 = min(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint16_avx512.h b/thirdparty/embree/common/simd/vint16_avx512.h new file mode 100644 index 0000000000..3720c3c9d6 --- /dev/null +++ b/thirdparty/embree/common/simd/vint16_avx512.h @@ -0,0 +1,472 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 integer type */ + template<> + struct vint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint16& t) { v = t.v; } + __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } + + __forceinline vint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vint(int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vint(int a, int b, int c, int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vint(int a0 , int a1 , int a2 , int a3, + int a4 , int a5 , int a6 , int a7, + int a8 , int a9 , int a10, int a11, + int a12, int a13, int a14, int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vint(const vint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { + v = _mm512_castsi128_si512(a); + v = _mm512_inserti32x4(v, b, 1); + v = _mm512_inserti32x4(v, c, 2); + v = _mm512_inserti32x4(v, d, 3); + } + + __forceinline vint(const vint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); 
+ } + + __forceinline vint(const vint8& a, const vint8& b) { + v = _mm512_castsi256_si512(a); + v = _mm512_inserti64x4(v, b, 1); + } + + __forceinline explicit vint(const __m512& f) { + v = _mm512_cvtps_epi32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } + + static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } + + static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } + static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } + + static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } + static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } + + static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template<int scale = 4> + static __forceinline vint16 gather(const int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { + return 
_mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vint16 operator +(const vint16& a) { return a; } + __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } + __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } + + __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } + __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } + + __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } + __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } + __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } + + __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); } + __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } + + __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } + __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } + + __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } + __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } + + __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } + + __forceinline vint16 operator <<(const 
vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } + + __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } + __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } + __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } + __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } + __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } + + __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } + __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } + __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } + + __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } + + __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } + __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } + + __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } + __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } + + __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } + __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } + + __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } + __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } + + __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } + __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } + + __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } + __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } + __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } + + __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 
operator !=(const vint16& a, int b) { return a != vint16(b); } + __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } + + __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } + __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } + + __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } + __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } + + __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } + __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } + + __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } + __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } + + __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + + + __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) 
{ + return _mm512_mask_or_epi32(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } + __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline int toScalar(const vint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } + + template<int N, int i> + vint<N> extractN(const vint16& v); + + template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } + template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } + template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } + template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } + + template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } + template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } + + template<int i> __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } + template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } + + template<int i> __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } + template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, 
shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } + __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } + __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 conflict(const vint16& index) + { + return _mm512_conflict_epi32(index); + } + + __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) + { + return _mm512_mask_conflict_epi32(dest,mask,index); + } + + __forceinline vint16 convert_uint32_t(const __m512& f) { + return _mm512_cvtps_epu32(f); + } + + __forceinline vint16 permute(vint16 v, vint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vint16 reverse(const vint16 &a) { + return permute(a,vint16(reverse_step)); + } + + __forceinline vint16 prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vint16 reverse_prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + /* this should use a vbool8 and a vint8_64...*/ + template<int scale = 1, int hint = _MM_HINT_T0> + __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) + { +#if defined(__AVX512PF__) + _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); +#endif + } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h new file mode 100644 index 0000000000..9814d5c71c --- /dev/null +++ b/thirdparty/embree/common/simd/vint4_sse2.h @@ -0,0 +1,598 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint4& a) { v = a.v; } + __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } + + __forceinline vint(__m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} + + __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} + __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} + __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} + + __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vint4& v) { 
_mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { + return _mm_mask_compress_epi32(v, mask, v); + } + static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { + return _mm_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + + +#if defined(__SSE4_1__) + static __forceinline vint4 load(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vint4 loadu(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } +#else + + static __forceinline vint4 load(const unsigned char* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + + static __forceinline vint4 loadu(const unsigned char* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + +#endif + + static __forceinline vint4 load(const unsigned short* ptr) { +#if defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store(unsigned char* ptr, const vint4& v) { +#if defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + *(int*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned char)v[i]; +#endif + } + + static __forceinline void store(unsigned short* ptr, const vint4& v) { + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; + } + + static __forceinline vint4 load_nt(void* ptr) { +#if defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vint4& v) { +#if defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, 
_mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vint4 gather(const int* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_epi32(ptr, index, scale); +#else + return vint4( + *(int*)(((char*)ptr)+scale*index[0]), + *(int*)(((char*)ptr)+scale*index[1]), + *(int*)(((char*)ptr)+scale*index[2]), + *(int*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { + vint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_epi32((int*)ptr, index, v, scale); +#else + *(int*)(((char*)ptr)+scale*index[0]) = v[0]; + *(int*)(((char*)ptr)+scale*index[1]) = v[1]; + *(int*)(((char*)ptr)+scale*index[2]) = v[2]; + *(int*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); +#else + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + +#if defined(__x86_64__) + static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vint4 operator +(const vint4& a) { return a; } + __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } +#if defined(__SSSE3__) + __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } +#endif + + 
//////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } + __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } + __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } + + __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } + __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } + +#if defined(__SSE4_1__) + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } +#else + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +#endif + __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } + __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } + + __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } + __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } + __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } + + __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } + __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } + __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } + + __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } + __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } + __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } + + __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); } + + __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } + __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } + __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } + __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } + + __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } + __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } + +#if defined(__SSE4_1__) + __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } + __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } +#endif + + __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } + __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } + + __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } + __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } + + __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } + __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } + __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } + + __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } + __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; } + + __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } + __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } + + __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } + __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } + + __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } + __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } + + __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } + __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } + + __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } + __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } + __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } + __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } + __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } + __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline 
vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vint4 select(const vint4& t, const vint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +#if defined(__SSE4_1__) + __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } + + __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } + __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); } + __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } + __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } + __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const vint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + +#if defined(__SSE3__) + template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); 
} +#endif + + template<int i> + __forceinline vint4 shuffle(const vint4& v) { + return shuffle<i,i,i,i>(v); + } + +#if defined(__SSE4_1__) + template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } + template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } +#else + template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; } + template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } +#endif + + + template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } + + __forceinline size_t toSizeT(const vint4& v) { +#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround + return toScalar(v); +#else + return _mm_cvtsi128_si64(v); +#endif + } + +#if defined(__AVX512VL__) + + __forceinline vint4 permute(const vint4 &a, const vint4 &index) { + return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); + } + + template<int i> + __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { + return _mm_alignr_epi32(a, b, i); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umin(a0,b0); + const vint4 d0 = umax(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umin(a1,b1); + const vint4 d1 = umax(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = 
shuffle<0,2,1,3>(a2); + const vint4 c2 = umin(a2,b2); + const vint4 d2 = umax(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umax(a0,b0); + const vint4 d0 = umin(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umax(a1,b1); + const vint4 d1 = umin(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umax(a2,b2); + const vint4 d2 = umin(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + +#else + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = min(a0,b0); + const vint4 d0 = max(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = min(a1,b1); + const vint4 d1 = max(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = min(a2,b2); + const vint4 d2 = max(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = max(a0,b0); + const vint4 d0 = min(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = max(a1,b1); + const vint4 d1 = min(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = max(a2,b2); + const vint4 d2 = min(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h new file mode 100644 index 0000000000..f43e9a8c22 --- /dev/null +++ b/thirdparty/embree/common/simd/vint8_avx.h @@ -0,0 +1,470 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + 
__forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vint8 load(const unsigned char* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned char* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return 
vint8(il,ih); + } + + static __forceinline vint8 load(const unsigned short* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned short* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline void store(unsigned char* ptr, const vint8& i) { + vint4 il(i.vl); + vint4 ih(i.vh); + vint4::store(ptr + 0,il); + vint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vint8 gather(const int* ptr, const vint8& index) { + return vint8( + *(int*)(((char*)ptr)+scale*index[0]), + *(int*)(((char*)ptr)+scale*index[1]), + *(int*)(((char*)ptr)+scale*index[2]), + *(int*)(((char*)ptr)+scale*index[3]), + *(int*)(((char*)ptr)+scale*index[4]), + *(int*)(((char*)ptr)+scale*index[5]), + *(int*)(((char*)ptr)+scale*index[6]), + *(int*)(((char*)ptr)+scale*index[7])); + } + + template<int scale = 4> + static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { + vint8 r = zero; + if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]); + return r; + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { + *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } + __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } + + __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), 
_mm_max_epi32(a.vh, b.vh)); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); } + __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } + + __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } + __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } + __forceinline 
vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } + + __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template<int i> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return 
_mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 
b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h new file mode 100644 index 0000000000..e04737ffbe --- /dev/null +++ b/thirdparty/embree/common/simd/vint8_avx2.h @@ -0,0 +1,518 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : 
v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return 
_mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(unsigned char* ptr, const vint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vint8 gather(const int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32(ptr, index, scale); + } + + template<int scale = 4> + static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { + vint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 asBool(const vint8& a) 
{ return _mm256_movepi32_mask(a); } +#else + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } + __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } + + __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } + + __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, 
const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template<int mask> + __forceinline vint8 select(const vint8& t, const vint8& f) { + return 
_mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vint8 unpackhi(const vint8& a, const 
vint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + + template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + __forceinline vint8 permute(const vint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template<int i> + static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } 
+ __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << 
", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vllong4_avx2.h b/thirdparty/embree/common/simd/vllong4_avx2.h new file mode 100644 index 0000000000..6c86845877 --- /dev/null +++ b/thirdparty/embree/common/simd/vllong4_avx2.h @@ -0,0 +1,352 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX2 64-bit long long type */ + template<> + struct vllong<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256i v; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong4& t) { v = t.v; } + __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } + + __forceinline vllong(const __m256i& t) { v = t; } + __forceinline operator __m256i() const { return v; } + __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } + + + __forceinline vllong(long long i) { + v = _mm256_set1_epi64x(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm256_set_epi64x(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} + __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); + } + + static __forceinline vllong4 loadu(const void* addr) + { + return _mm256_loadu_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const vllong4* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const long long* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline void store(void* ptr, const vllong4& v) { + _mm256_store_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong4& v) { + _mm256_storeu_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_storeu_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_store_epi64(ptr,mask,f); +#else + 
_mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { + #if defined(__AVX512VL__) + return _mm256_mask_blend_epi64(m, f, t); + #else + return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); + #endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } +#else + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } +#endif + + __forceinline vllong4 operator +(const vllong4& a) { return a; } + __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } + __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } + __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } + + __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } + __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } + __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } + + /* only low 32bit part */ + __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } + __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } + __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } + + __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } + __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); } + __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } + + __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } + __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } + __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } + + __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } + __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } + __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } + + __forceinline vllong4 operator <<(const vllong4& a, long 
long n) { return _mm256_slli_epi64(a, (int)n); } + //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } + + __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } + //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } + //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); } + + __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } + + //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } + //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } + //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } + + //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } + //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } + //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } + +#if defined(__AVX512VL__) + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } +#else + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } + __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } + + __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } + __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } + + __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } + __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; } + + __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } + __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } + + __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } + __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } + + __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } + //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { 
return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } +#endif + + __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } + __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } + + __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } + __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } + + __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } + __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } + + __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); } + __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } + + __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } + __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } + + __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } + __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } + + __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } + __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } + __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } + __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } + __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } + __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { 
return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vllong4 shuffle(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template<int i> + __forceinline vllong4 shuffle(const vllong4& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1> + __forceinline vllong4 shuffle2(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); + } + + __forceinline long long toScalar(const vllong4& v) { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + +#if defined(__AVX512VL__) + __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { + // workaround for GCC 7.x +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) + return _mm256_permutex2var_epi64(a,index,a); +#else + return _mm256_permutexvar_epi64(index,a); +#endif + } + + __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { + return _mm256_permutex2var_epi64(a,index,b); + } + +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + + __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } + __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } + __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } + __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } + __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } + __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef 
vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vllong8_avx512.h b/thirdparty/embree/common/simd/vllong8_avx512.h new file mode 100644 index 0000000000..ee69411637 --- /dev/null +++ b/thirdparty/embree/common/simd/vllong8_avx512.h @@ -0,0 +1,358 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 64-bit long long type */ + template<> + struct vllong<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512i v; + long long i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong8& t) { v = t.v; } + __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } + + __forceinline vllong(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vllong(long long i) { + v = _mm512_set1_epi64(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm512_set4_epi64(d,c,b,a); + } + + __forceinline vllong(long long a0, long long a1, long long a2, long long a3, + long long a4, long long a5, long long a6, long long a7) + { + v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vllong(const vllong<4>& i) { + v = _mm512_broadcast_i64x4(i); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} + __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vllong8 loadu(const void* addr) { + return _mm512_loadu_si512(addr); + } + + static __forceinline vllong8 load(const vllong8* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const long long* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const unsigned char* ptr) { + return _mm512_cvtepu8_epi64(*(__m128i*)ptr); + } + + static __forceinline void store(void* ptr, const vllong8& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong8& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { + _mm512_mask_storeu_epi64(ptr,mask,f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { + 
_mm512_mask_store_epi64(addr,mask,v2); + } + + static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_compress_epi64(a,mask,b); + } + + static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_expand_epi64(b,mask,a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } + + __forceinline vllong8 operator +(const vllong8& a) { return a; } + __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } + __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } + __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } + + __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } + __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } + __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } + + __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } + __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } + __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } + + __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } + __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } + __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } + + __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } + __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } + __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; } + + __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } + __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } + __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } + + __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } + + __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return 
_mm512_sllv_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } + + __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } + __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } + __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } + + __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } + __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } + __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } + + __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } + __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } + __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } + + __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } + __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } + + __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); } + __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; } + __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } + + __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } + __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } + + __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } + __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } + + __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } + __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } + + __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } + __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } + + __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } + __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } + __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } + + __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != 
vllong8(b); } + __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } + + __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); } + __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } + + __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } + __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } + + __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } + __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } + + __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } + __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } + + __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { + return _mm512_mask_or_epi64(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + 
template<int i> + __forceinline vllong8 shuffle(const vllong8& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1> + __forceinline vllong8 shuffle4(const vllong8& v) { + return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template<int i> + __forceinline vllong8 shuffle4(const vllong8& v) { + return shuffle4<i, i>(v); + } + + template<int i> + __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { + return _mm512_alignr_epi64(a, b, i); + }; + + __forceinline long long toScalar(const vllong8& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } + __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } + __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } + __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } + __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { + return _mm512_permutexvar_epi64(index,v); + } + + __forceinline vllong8 reverse(const vllong8& a) { + return permute(a,vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint16_avx512.h b/thirdparty/embree/common/simd/vuint16_avx512.h new file mode 100644 index 0000000000..c9eb6682ff --- /dev/null +++ b/thirdparty/embree/common/simd/vuint16_avx512.h @@ -0,0 +1,424 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 unsigned integer type */ + template<> + struct vuint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vuint16 UInt; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + unsigned int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint16& t) { v = t.v; } + __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } + + __forceinline vuint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vuint(unsigned int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vuint(const vuint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vuint(const vuint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3, + unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, + unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, + unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline explicit vuint(const __m512& f) { + v = _mm512_cvtps_epu32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vuint16 loadu(const void* addr) + { + return _mm512_loadu_si512(addr); + } + + 
static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vuint16 load(const vuint16* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(const unsigned int* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } + + + static __forceinline void store(void* ptr, const vuint16& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vuint16& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { + _mm512_mask_storeu_epi32(ptr,mask,f); + } + + static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { + _mm512_mask_store_epi32(addr,mask,v2); + } + + static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vuint16 operator +(const vuint16& a) { return a; } + __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } + __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } + + __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } + __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } + + __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } + __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } + __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } + + __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } + __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } + + __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } + __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } + + __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } + __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } + + __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } + + __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } + + __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } + __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } + __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } + __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } + + __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } + __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } + __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } + + __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return 
_mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } + __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; } + + __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } + __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } + + __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } + __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } + + __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } + __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } + + __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } + __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } + + __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } + __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } + __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } + + __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } + __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } + + __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } + __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } + + __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } + __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } + + __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } + __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } + + __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + 
__forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } + __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } + + __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + + + __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i> + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline unsigned int toScalar(const vuint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min4(vuint16 x) { x = 
vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); } + __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } + __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 permute(vuint16 v, vuint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vuint16 reverse(const vuint16& a) { + return permute(a,vuint16(reverse_step)); + } + + __forceinline vuint16 prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vuint16 reverse_prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline 
embree_ostream operator <<(embree_ostream cout, const vuint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h new file mode 100644 index 0000000000..0601b9ab80 --- /dev/null +++ b/thirdparty/embree/common/simd/vuint4_sse2.h @@ -0,0 +1,426 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vuint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vuint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; unsigned int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint4& a) { v = a.v; } + __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } + + __forceinline vuint(const __m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + + __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} +#endif + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} + __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vuint4 loadu(const vboolf4& mask, const 
void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + +#if defined(__SSE4_1__) + static __forceinline vuint4 load(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vuint4 loadu(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + +#endif + + static __forceinline vuint4 load(const unsigned short* ptr) { +#if defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline vuint4 load_nt(void* ptr) { +#if defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vuint4& v) { +#if defined(__SSE4_1__) + _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_epi32((const int*)ptr, index, scale); +#else + return vuint4( + *(unsigned int*)(((char*)ptr)+scale*index[0]), + *(unsigned int*)(((char*)ptr)+scale*index[1]), + *(unsigned int*)(((char*)ptr)+scale*index[2]), + *(unsigned int*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { + vuint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// 
Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vuint4 operator +(const vuint4& a) { return a; } + __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } + __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } + __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } + + __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } + __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); } +//#else +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +//#endif +// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } +// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } + + __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } + __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } + __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } + + __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } + __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } + __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } + + __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } + __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } + __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } + + __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } 
+ + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } + __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } + + __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } + __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } +// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } +//#endif + + __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } + __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } + + __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } + __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } + + __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } + __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } + __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } + + __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } + __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return 
vuint4(a) != b; } + + //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } + //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } + + //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } + //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } + + //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); } + //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } + + //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } + //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } + + __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } + __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } + //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; } + //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } + //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } + //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vuint4 select(const vuint4& t, const vuint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +/*#if defined(__SSE4_1__) + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } 
+#endif + + __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } + __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } + __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } + __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + +#if defined(__SSE3__) + template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template<int i> + __forceinline vuint4 shuffle(const vuint4& v) { + return shuffle<i,i,i,i>(v); + } + +#if defined(__SSE4_1__) + template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } +#else + template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } +#endif + + + template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if 0 +#if defined(__SSE4_1__) + + __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const 
vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h new file mode 100644 index 0000000000..589cd9d731 --- /dev/null +++ b/thirdparty/embree/common/simd/vuint8_avx.h @@ -0,0 +1,386 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 
a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vuint8 load(const unsigned char* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned char* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 load(const unsigned short* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned short* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline void store(unsigned char* ptr, const vuint8& i) { + vuint4 il(i.vl); + vuint4 ih(i.vh); + vuint4::store(ptr + 0,il); + vuint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { + return vuint8( + *(unsigned int*)(((char*)ptr)+scale*index[0]), + *(unsigned int*)(((char*)ptr)+scale*index[1]), + *(unsigned int*)(((char*)ptr)+scale*index[2]), + *(unsigned int*)(((char*)ptr)+scale*index[3]), + *(unsigned int*)(((char*)ptr)+scale*index[4]), + *(unsigned int*)(((char*)ptr)+scale*index[5]), + *(unsigned int*)(((char*)ptr)+scale*index[6]), + *(unsigned int*)(((char*)ptr)+scale*index[7])); + } + + template<int scale = 4> + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, 
const vint8& index) { + vuint8 r = zero; + if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]); + return r; + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { + *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { + if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - 
vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } + + __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator 
&=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& 
a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template<int i> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const 
vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h new file mode 100644 index 0000000000..17b994522f --- /dev/null +++ b/thirdparty/embree/common/simd/vuint8_avx2.h @@ -0,0 +1,446 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} 
+ +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vuint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + 
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(unsigned char* ptr, const vuint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32((const int*) ptr, index, scale); + } + + template<int scale = 4> + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { + vuint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } +#else + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator 
+(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } + + __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } + + __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + 
b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template<int mask> + __forceinline vuint8 select(const vuint8& t, const vuint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, unsigned 
int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } + //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } + //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } + //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } + //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } + //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } + //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } 
+ + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template<int i> + __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { 
return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp new file mode 100644 index 0000000000..abdd269069 --- /dev/null +++ b/thirdparty/embree/common/sys/alloc.cpp @@ -0,0 +1,327 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "intrinsics.h" +#include "sysinfo.h" +#include "mutex.h" + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + void* alignedMalloc(size_t size, size_t align) + { + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return ptr; + } + + void alignedFree(void* ptr) + { + if (ptr) + _mm_free(ptr); + } + + static bool huge_pages_enabled = false; + static MutexSys os_init_mutex; + + __forceinline bool isHugePageCandidate(const size_t bytes) + { + if (!huge_pages_enabled) + return false; + + /* use huge pages only when memory overhead is low */ + const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); + return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <malloc.h> + +namespace embree +{ + bool win_enable_selockmemoryprivilege (bool verbose) + { + HANDLE hToken; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { + if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + TOKEN_PRIVILEGES tp; + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { + if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + SetLastError(ERROR_SUCCESS); + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { + if (verbose) std::cout << "WARNING: 
AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; + return false; + } + + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; + return false; + } + + return true; + } + + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + + if (GetLargePageMinimum() != PAGE_SIZE_2M) { + huge_pages_enabled = false; + return false; + } + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { + int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + if (ptr != nullptr) { + hugepages = true; + return ptr; + } + } + + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- + hugepages = false; + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + if (hugepages) // decommitting huge pages seems not to work under Windows + return bytesOld; + + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) + { + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <sys/mman.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sstream> + +#if defined(__MACOSX__) +#include <mach/vm_statistics.h> +#endif + +namespace embree +{ + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + +#if defined(__LINUX__) + + int hugepagesize = 0; + + std::ifstream file; + file.open("/proc/meminfo",std::ios::in); + if (!file.is_open()) { + if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" 
<< std::endl; + huge_pages_enabled = false; + return false; + } + + std::string line; + while (getline(file,line)) + { + std::stringstream sline(line); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string tag; getline(sline,tag,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string val; getline(sline,val,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string unit; getline(sline,unit,' '); + if (tag == "Hugepagesize:" && unit == "kB") { + hugepagesize = std::stoi(val)*1024; + break; + } + } + + if (hugepagesize != PAGE_SIZE_2M) + { + if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } +#endif + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { +#if defined(__MACOSX__) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#endif + } + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ + os_advise(ptr,bytes); + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + /* for hugepages we need to also align the size */ + const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ + void os_advise(void* pptr, size_t bytes) + { +#if defined(MADV_HUGEPAGE) + madvise(pptr,bytes,MADV_HUGEPAGE); +#endif + } +} + +#endif diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h new file mode 100644 index 0000000000..4fa474ec1d --- /dev/null +++ b/thirdparty/embree/common/sys/alloc.h @@ -0,0 +1,164 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include <vector> +#include <set> + +namespace embree +{ +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ + void* operator new[](size_t size) { return alignedMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: + + /*! aligned allocation */ + void* alignedMalloc(size_t size, size_t align); + void alignedFree(void* ptr); + + /*! allocator that performs aligned allocations */ + template<typename T, size_t alignment> + struct aligned_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return alignedFree(p); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + + /*! allocates pages directly from OS */ + bool win_enable_selockmemoryprivilege(bool verbose); + bool os_init(bool hugepages, bool verbose); + void* os_malloc (size_t bytes, bool& hugepages); + size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); + void os_free (void* ptr, size_t bytes, bool hugepages); + void os_advise (void* ptr, size_t bytes); + + /*! allocator that performs OS allocations */ + template<typename T> + struct os_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline os_allocator () + : hugepages(false) {} + + __forceinline pointer allocate( size_type n ) { + return (pointer) os_malloc(n*sizeof(value_type),hugepages); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return os_free(p,n*sizeof(value_type),hugepages); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + bool hugepages; + }; + + /*! 
allocator for IDs */ + template<typename T, size_t max_id> + struct IDPool + { + typedef T value_type; + + IDPool () + : nextID(0) {} + + T allocate() + { + /* return ID from list */ + if (!IDs.empty()) + { + T id = *IDs.begin(); + IDs.erase(IDs.begin()); + return id; + } + + /* allocate new ID */ + else + { + if (size_t(nextID)+1 > max_id) + return -1; + + return nextID++; + } + } + + /* adds an ID provided by the user */ + bool add(T id) + { + if (id > max_id) + return false; + + /* check if ID should be in IDs set */ + if (id < nextID) { + auto p = IDs.find(id); + if (p == IDs.end()) return false; + IDs.erase(p); + return true; + } + + /* otherwise increase ID set */ + else + { + for (T i=nextID; i<id; i++) { + IDs.insert(i); + } + nextID = id+1; + return true; + } + } + + void deallocate( T id ) + { + assert(id < nextID); + MAYBE_UNUSED auto done = IDs.insert(id).second; + assert(done); + } + + private: + std::set<T> IDs; //!< stores deallocated IDs to be reused + T nextID; //!< next ID to use when IDs vector is empty + }; +} + diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h new file mode 100644 index 0000000000..dd9190c52a --- /dev/null +++ b/thirdparty/embree/common/sys/array.h @@ -0,0 +1,222 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "alloc.h" + +namespace embree +{ + /*! static array with static size */ + template<typename T, size_t N> + class array_t + { + public: + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+N; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return N == 0; } + __forceinline size_t size () const { return N; } + __forceinline size_t max_size () const { return N; } + + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < N); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& front() const { assert(N > 0); return items[0]; }; + __forceinline T& back () const { assert(N > 0); return items[N-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + T items[N]; + }; + + /*! 
static array with dynamic size */ + template<typename T, size_t N> + class darray_t + { + public: + + __forceinline darray_t () : M(0) {} + + __forceinline darray_t (const T& v) : M(0) { + for (size_t i=0; i<N; i++) items[i] = v; + } + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+M; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return M == 0; } + __forceinline size_t size () const { return M; } + __forceinline size_t capacity () const { return N; } + __forceinline size_t max_size () const { return N; } + + void resize(size_t new_size) { + assert(new_size < max_size()); + M = new_size; + } + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& v) + { + assert(M+1 < max_size()); + items[M++] = v; + } + + __forceinline void pop_back() + { + assert(!empty()); + M--; + } + + __forceinline void clear() { + M = 0; + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < M); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < M); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& front() const { assert(M > 0); return items[0]; }; + __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + size_t M; + T items[N]; + }; + + /*! dynamic sized array that is allocated on the stack */ +#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N) + template<typename Ty, size_t max_stack_bytes> + struct __aligned(64) StackArray + { + __forceinline StackArray (const size_t N) + : N(N) + { + if (N*sizeof(Ty) <= max_stack_bytes) + data = &arr[0]; + else + data = (Ty*) alignedMalloc(N*sizeof(Ty),64); + } + + __forceinline ~StackArray () { + if (data != &arr[0]) alignedFree(data); + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<N); return data[i]; } + __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; } + + __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } + +#if defined(__64BIT__) + __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } +#endif + + private: + Ty arr[max_stack_bytes/sizeof(Ty)]; + Ty* data; + size_t N; + + private: + StackArray (const StackArray& other) DELETED; // do not implement + StackArray& operator= (const StackArray& other) DELETED; // do not implement + + }; + + /*! 
dynamic sized array that is allocated on the stack */ + template<typename Ty, size_t max_stack_elements, size_t max_total_elements> + struct __aligned(64) DynamicStackArray + { + __forceinline DynamicStackArray () + : data(&arr[0]) {} + + __forceinline ~DynamicStackArray () + { + if (!isStackAllocated()) + delete[] data; + } + + __forceinline bool isStackAllocated() const { + return data == &arr[0]; + } + + __forceinline size_t size() const + { + if (isStackAllocated()) return max_stack_elements; + else return max_total_elements; + } + + __forceinline void resize(size_t M) + { + assert(M <= max_total_elements); + if (likely(M <= max_stack_elements)) return; + if (likely(!isStackAllocated())) return; + + data = new Ty[max_total_elements]; + + for (size_t i=0; i<max_stack_elements; i++) + data[i] = arr[i]; + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } + __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } + +#if defined(__64BIT__) + __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } +#endif + + __forceinline DynamicStackArray (const DynamicStackArray& other) + : data(&arr[0]) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + } + + DynamicStackArray& operator= (const DynamicStackArray& other) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + + return *this; + } + + private: + Ty arr[max_stack_elements]; + Ty* data; + }; +} diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h new file mode 100644 index 0000000000..67af254f36 --- /dev/null +++ b/thirdparty/embree/common/sys/atomic.h @@ -0,0 +1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <atomic> +#include "intrinsics.h" + +namespace embree +{ +/* compiler memory barriers */ +#if defined(__INTEL_COMPILER) +//#define __memory_barrier() __memory_barrier() +#elif defined(__GNUC__) || defined(__clang__) +# define __memory_barrier() asm volatile("" ::: "memory") +#elif defined(_MSC_VER) +# define __memory_barrier() _ReadWriteBarrier() +#endif + + template <typename T> + struct atomic : public std::atomic<T> + { + atomic () {} + + atomic (const T& a) + : std::atomic<T>(a) {} + + atomic (const atomic<T>& a) { + this->store(a.load()); + } + + atomic& operator=(const atomic<T>& other) { + this->store(other.load()); + return *this; + } + }; + + template<typename T> + __forceinline void atomic_min(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a <= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } + + template<typename T> + __forceinline void atomic_max(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a >= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } +} diff --git a/thirdparty/embree/common/sys/barrier.cpp b/thirdparty/embree/common/sys/barrier.cpp new file mode 100644 index 0000000000..0c0e39d92d --- /dev/null +++ b/thirdparty/embree/common/sys/barrier.cpp @@ -0,0 +1,289 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.h" +#include "condition.h" +#include "regression.h" 
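// NOTE (editorial annotation, not part of the upstream Embree patch): this file
// provides two barrier flavours. BarrierSys wraps an OS primitive (Win32 events
// on Windows, a MutexSys/ConditionSys pair elsewhere), while LinearBarrierActive
// busy-waits with pause_cpu() and is intended for short waits between workers.
// A minimal usage sketch, mirroring the regression test further down in this
// file (the thread-count name below is only illustrative):
//
//   embree::BarrierSys barrier(numThreads);  // or barrier.init(numThreads)
//   // each of the numThreads participants then calls:
//   barrier.wait();                          // blocks until all have arrived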
+#include "thread.h" + +#if defined (__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : i(0), enterCount(0), exitCount(0), barrierSize(0) + { + events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + init(N); + } + + __forceinline ~BarrierSysImplementation () + { + CloseHandle(events[0]); + CloseHandle(events[1]); + } + + __forceinline void init(size_t N) + { + barrierSize = N; + enterCount.store(N); + exitCount.store(N); + } + + __forceinline void wait() + { + /* every thread entering the barrier decrements this count */ + size_t i0 = i; + size_t cnt0 = enterCount--; + + /* all threads except the last one are wait in the barrier */ + if (cnt0 > 1) + { + if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) + THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); + } + + /* the last thread starts all threads waiting at the barrier */ + else + { + i = 1-i; + enterCount.store(barrierSize); + if (SetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("SetEvent failed"); + } + + /* every thread leaving the barrier decrements this count */ + size_t cnt1 = exitCount--; + + /* the last thread that left the barrier resets the event again */ + if (cnt1 == 1) + { + exitCount.store(barrierSize); + if (ResetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("ResetEvent failed"); + } + } + + public: + HANDLE events[2]; + atomic<size_t> i; + atomic<size_t> enterCount; + atomic<size_t> exitCount; + size_t barrierSize; + }; +} + +#else + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : count(0), barrierSize(0) + { + init(N); + } + + __forceinline void init(size_t N) + { + assert(count == 0); + count = 0; + barrierSize = N; + } + + __forceinline void wait() + { + mutex.lock(); + count++; + + if (count == barrierSize) { + count = 0; + cond.notify_all(); + mutex.unlock(); + return; + } + + cond.wait(mutex); + mutex.unlock(); + return; + } + + public: + MutexSys mutex; + ConditionSys cond; + volatile size_t count; + volatile size_t barrierSize; + }; +} + +#endif + +namespace embree +{ + BarrierSys::BarrierSys (size_t N) { + opaque = new BarrierSysImplementation(N); + } + + BarrierSys::~BarrierSys () { + delete (BarrierSysImplementation*) opaque; + } + + void BarrierSys::init(size_t count) { + ((BarrierSysImplementation*) opaque)->init(count); + } + + void BarrierSys::wait() { + ((BarrierSysImplementation*) opaque)->wait(); + } + + LinearBarrierActive::LinearBarrierActive (size_t N) + : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) + { + if (N == 0) N = getNumberOfLogicalThreads(); + init(N); + } + + LinearBarrierActive::~LinearBarrierActive() + { + delete[] count0; + delete[] count1; + } + + void LinearBarrierActive::init(size_t N) + { + if (threadCount != N) { + threadCount = N; + if (count0) delete[] count0; count0 = new unsigned char[N]; + if (count1) delete[] count1; count1 = new unsigned char[N]; + } + mode = 0; + flag0 = 0; + flag1 = 0; + for (size_t i=0; i<N; i++) count0[i] = 0; + for (size_t i=0; i<N; i++) count1[i] = 0; + } + + void LinearBarrierActive::wait (const size_t threadIndex) + { + if (mode == 0) + { + if (threadIndex == 0) + { + for (size_t i=0; i<threadCount; i++) + count1[i] = 0; + + for (size_t i=1; i<threadCount; i++) + { + while (likely(count0[i] == 0)) + pause_cpu(); + } + mode = 1; + flag1 = 0; + 
__memory_barrier(); + flag0 = 1; + } + else + { + count0[threadIndex] = 1; + { + while (likely(flag0 == 0)) + pause_cpu(); + } + + } + } + else + { + if (threadIndex == 0) + { + for (size_t i=0; i<threadCount; i++) + count0[i] = 0; + + for (size_t i=1; i<threadCount; i++) + { + while (likely(count1[i] == 0)) + pause_cpu(); + } + + mode = 0; + flag0 = 0; + __memory_barrier(); + flag1 = 1; + } + else + { + count1[threadIndex] = 1; + { + while (likely(flag1 == 0)) + pause_cpu(); + } + } + } + } + + struct barrier_sys_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic<size_t> threadID; + std::atomic<size_t> numFailed; + std::vector<size_t> threadResults; + + barrier_sys_regression_test() + : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(barrier_sys_regression_test* This) + { + size_t tid = This->threadID++; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + This->threadResults[tid] = tid; + This->barrier.wait(); + } + } + + bool run () + { + threadID.store(0); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + threadResults.resize(numThreads); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + for (size_t i=0; i<numThreads; i++) threadResults[i] = 0; + barrier.wait(); + barrier.wait(); + for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i; + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + return numFailed == 0; + } + }; + + barrier_sys_regression_test barrier_sys_regression_test; +} + + diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h new file mode 100644 index 0000000000..37fc036291 --- /dev/null +++ b/thirdparty/embree/common/sys/barrier.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intrinsics.h" +#include "sysinfo.h" +#include "atomic.h" + +namespace embree +{ + /*! system barrier using operating system */ + class BarrierSys + { + public: + + /*! construction / destruction */ + BarrierSys (size_t N = 0); + ~BarrierSys (); + + private: + /*! class in non-copyable */ + BarrierSys (const BarrierSys& other) DELETED; // do not implement + BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t count); + + /*! lets calling thread wait in barrier */ + void wait(); + + private: + void* opaque; + }; + + /*! fast active barrier using atomitc counter */ + struct BarrierActive + { + public: + BarrierActive () + : cntr(0) {} + + void reset() { + cntr.store(0); + } + + void wait (size_t numThreads) + { + cntr++; + while (cntr.load() != numThreads) + pause_cpu(); + } + + private: + std::atomic<size_t> cntr; + }; + + /*! 
fast active barrier that does not require initialization to some number of threads */ + struct BarrierActiveAutoReset + { + public: + BarrierActiveAutoReset () + : cntr0(0), cntr1(0) {} + + void wait (size_t threadCount) + { + cntr0.fetch_add(1); + while (cntr0 != threadCount) pause_cpu(); + cntr1.fetch_add(1); + while (cntr1 != threadCount) pause_cpu(); + cntr0.fetch_add(-1); + while (cntr0 != 0) pause_cpu(); + cntr1.fetch_add(-1); + while (cntr1 != 0) pause_cpu(); + } + + private: + std::atomic<size_t> cntr0; + std::atomic<size_t> cntr1; + }; + + class LinearBarrierActive + { + public: + + /*! construction and destruction */ + LinearBarrierActive (size_t threadCount = 0); + ~LinearBarrierActive(); + + private: + /*! class in non-copyable */ + LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement + LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t threadCount); + + /*! thread with threadIndex waits in the barrier */ + void wait (const size_t threadIndex); + + private: + volatile unsigned char* count0; + volatile unsigned char* count1; + volatile unsigned int mode; + volatile unsigned int flag0; + volatile unsigned int flag1; + volatile size_t threadCount; + }; +} + diff --git a/thirdparty/embree/common/sys/condition.cpp b/thirdparty/embree/common/sys/condition.cpp new file mode 100644 index 0000000000..606a1d0b04 --- /dev/null +++ b/thirdparty/embree/common/sys/condition.cpp @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "condition.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + InitializeConditionVariable(&cond); + } + + __forceinline ~ConditionImplementation () { + } + + __forceinline void wait(MutexSys& mutex_in) { + SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); + } + + __forceinline void notify_all() { + WakeAllConditionVariable(&cond); + } + + public: + CONDITION_VARIABLE cond; + }; +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + if (pthread_cond_init(&cond,nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_cond_init failed"); + } + + __forceinline ~ConditionImplementation() { + MAYBE_UNUSED bool ok = pthread_cond_destroy(&cond) == 0; + assert(ok); + } + + __forceinline void wait(MutexSys& mutex) { + if (pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex) != 0) + THROW_RUNTIME_ERROR("pthread_cond_wait failed"); + } + + __forceinline void notify_all() { + if (pthread_cond_broadcast(&cond) != 0) + THROW_RUNTIME_ERROR("pthread_cond_broadcast failed"); + } + + public: + pthread_cond_t cond; + }; +} +#endif + +namespace embree +{ + ConditionSys::ConditionSys () { + cond = new ConditionImplementation; + } + + ConditionSys::~ConditionSys() { + delete (ConditionImplementation*) cond; + } + + void ConditionSys::wait(MutexSys& mutex) { + ((ConditionImplementation*) cond)->wait(mutex); + } + + void ConditionSys::notify_all() { + ((ConditionImplementation*) cond)->notify_all(); + } +} diff --git a/thirdparty/embree/common/sys/condition.h b/thirdparty/embree/common/sys/condition.h new file mode 100644 index 0000000000..557c6e3482 --- 
/dev/null +++ b/thirdparty/embree/common/sys/condition.h @@ -0,0 +1,31 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mutex.h" + +namespace embree +{ + class ConditionSys + { + public: + ConditionSys(); + ~ConditionSys(); + void wait( class MutexSys& mutex ); + void notify_all(); + + template<typename Predicate> + __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) + { + while (!pred()) wait(mutex); + } + + private: + ConditionSys (const ConditionSys& other) DELETED; // do not implement + ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement + + protected: + void* cond; + }; +} diff --git a/thirdparty/embree/common/sys/filename.cpp b/thirdparty/embree/common/sys/filename.cpp new file mode 100644 index 0000000000..f55b224302 --- /dev/null +++ b/thirdparty/embree/common/sys/filename.cpp @@ -0,0 +1,138 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "filename.h" +#include "sysinfo.h" + +namespace embree +{ +#ifdef __WIN32__ + const char path_sep = '\\'; +#else + const char path_sep = '/'; +#endif + + /*! create an empty filename */ + FileName::FileName () {} + + /*! create a valid filename from a string */ + FileName::FileName (const char* in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! create a valid filename from a string */ + FileName::FileName (const std::string& in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! returns path to home folder */ + FileName FileName::homeFolder() + { +#ifdef __WIN32__ + const char* home = getenv("UserProfile"); +#else + const char* home = getenv("HOME"); +#endif + if (home) return home; + return ""; + } + + /*! returns path to executable */ + FileName FileName::executableFolder() { + return FileName(getExecutableFileName()).path(); + } + + /*! returns the path */ + FileName FileName::path() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return FileName(); + return filename.substr(0,pos); + } + + /*! returns the basename */ + std::string FileName::base() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return filename; + return filename.substr(pos+1); + } + + /*! returns the extension */ + std::string FileName::ext() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return ""; + return filename.substr(pos+1); + } + + /*! returns the extension */ + FileName FileName::dropExt() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return filename; + return filename.substr(0,pos); + } + + /*! returns the basename without extension */ + std::string FileName::name() const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) end = filename.size(); + return filename.substr(start, end - start); + } + + /*! 
replaces the extension */ + FileName FileName::setExt(const std::string& ext) const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) return FileName(filename+ext); + return FileName(filename.substr(0,end)+ext); + } + + /*! adds the extension */ + FileName FileName::addExt(const std::string& ext) const { + return FileName(filename+ext); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const FileName& other ) const { + if (filename == "") return FileName(other); + else return FileName(filename + path_sep + other.filename); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const std::string& other ) const { + return operator+(FileName(other)); + } + + /*! removes the base from a filename (if possible) */ + FileName FileName::operator -( const FileName& base ) const { + size_t pos = filename.find_first_of(base); + if (pos == std::string::npos) return *this; + return FileName(filename.substr(pos+1)); + } + + /*! == operator */ + bool operator== (const FileName& a, const FileName& b) { + return a.filename == b.filename; + } + + /*! != operator */ + bool operator!= (const FileName& a, const FileName& b) { + return a.filename != b.filename; + } + + /*! output operator */ + std::ostream& operator<<(std::ostream& cout, const FileName& filename) { + return cout << filename.filename; + } +} diff --git a/thirdparty/embree/common/sys/filename.h b/thirdparty/embree/common/sys/filename.h new file mode 100644 index 0000000000..d5929cd836 --- /dev/null +++ b/thirdparty/embree/common/sys/filename.h @@ -0,0 +1,81 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! Convenience class for handling file names and paths. */ + class FileName + { + public: + + /*! create an empty filename */ + FileName (); + + /*! create a valid filename from a string */ + FileName (const char* filename); + + /*! create a valid filename from a string */ + FileName (const std::string& filename); + + /*! returns path to home folder */ + static FileName homeFolder(); + + /*! returns path to executable */ + static FileName executableFolder(); + + /*! auto convert into a string */ + operator std::string() const { return filename; } + + /*! returns a string of the filename */ + const std::string str() const { return filename; } + + /*! returns a c-string of the filename */ + const char* c_str() const { return filename.c_str(); } + + /*! returns the path of a filename */ + FileName path() const; + + /*! returns the file of a filename */ + std::string base() const; + + /*! returns the base of a filename without extension */ + std::string name() const; + + /*! returns the file extension */ + std::string ext() const; + + /*! drops the file extension */ + FileName dropExt() const; + + /*! replaces the file extension */ + FileName setExt(const std::string& ext = "") const; + + /*! adds file extension */ + FileName addExt(const std::string& ext = "") const; + + /*! concatenates two filenames to this/other */ + FileName operator +( const FileName& other ) const; + + /*! concatenates two filenames to this/other */ + FileName operator +( const std::string& other ) const; + + /*! removes the base from a filename (if possible) */ + FileName operator -( const FileName& base ) const; + + /*! 
== operator */ + friend bool operator==(const FileName& a, const FileName& b); + + /*! != operator */ + friend bool operator!=(const FileName& a, const FileName& b); + + /*! output operator */ + friend std::ostream& operator<<(std::ostream& cout, const FileName& filename); + + private: + std::string filename; + }; +} diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h new file mode 100644 index 0000000000..ed8dd7d40a --- /dev/null +++ b/thirdparty/embree/common/sys/intrinsics.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#if defined(__WIN32__) +#include <intrin.h> +#endif + +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <immintrin.h> +#endif + +#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) + #if !defined(_tzcnt_u32) + #define _tzcnt_u32 __tzcnt_u32 + #endif + #if !defined(_tzcnt_u64) + #define _tzcnt_u64 __tzcnt_u64 + #endif +#endif + +#if defined(__LZCNT__) + #if !defined(_lzcnt_u32) + #define _lzcnt_u32 __lzcnt32 + #endif + #if !defined(_lzcnt_u64) + #define _lzcnt_u64 __lzcnt64 + #endif +#endif + +#if defined(__WIN32__) +// -- GODOT start -- +#if !defined(NOMINMAX) +// -- GODOT end -- +#define NOMINMAX +// -- GODOT start -- +#endif +#include "windows.h" +// -- GODOT end -- +#endif + +/* normally defined in pmmintrin.h, but we always need this */ +#if !defined(_MM_SET_DENORMALS_ZERO_MODE) +#define _MM_DENORMALS_ZERO_ON (0x0040) +#define _MM_DENORMALS_ZERO_OFF (0x0000) +#define _MM_DENORMALS_ZERO_MASK (0x0040) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) +#endif + +namespace embree +{ + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + + __forceinline size_t read_tsc() + { + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + return (size_t)li.QuadPart; + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + + __forceinline unsigned bsf(unsigned v) { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) + return _tzcnt_u64(v); +#else + unsigned long r = 0; _BitScanForward64(&r,v); return r; +#endif + } +#endif + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + + __forceinline unsigned bscf(unsigned& v) + { + unsigned i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline int bsr(int v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) + return 63 -_lzcnt_u64(v); +#else + unsigned long r = 0; _BitScanReverse64(&r, v); return r; +#endif + } +#endif + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) + 
return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline int btc(int v, int i) { + long r = v; _bittestandcomplement(&r,i); return r; + } + + __forceinline int bts(int v, int i) { + long r = v; _bittestandset(&r,i); return r; + } + + __forceinline int btr(int v, int i) { + long r = v; _bittestandreset(&r,i); return r; + } + +#if defined(__X86_64__) + + __forceinline size_t btc(size_t v, size_t i) { + size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; + } + + __forceinline size_t bts(size_t v, size_t i) { + __int64 r = v; _bittestandset64(&r,i); return r; + } + + __forceinline size_t btr(size_t v, size_t i) { + __int64 r = v; _bittestandreset64(&r,i); return r; + } + +#endif + + __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { + return _InterlockedCompareExchange((volatile long*)p,v,c); + } + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#else + +#if defined(__i386__) && defined(__PIC__) + + __forceinline void __cpuid(int out[4], int op) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) + : "0" (op1), "2" (op2)); + } + +#elif defined(__X86_ASM__) + + __forceinline void __cpuid(int out[4], int op) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + } + +#endif + + __forceinline uint64_t read_tsc() { +#if defined(__X86_ASM__) + uint32_t high,low; + asm volatile ("rdtsc" : "=d"(high), "=a"(low)); + return (((uint64_t)high) << 32) + (uint64_t)low; +#else + /* Not supported yet, meaning measuring traversal cost per pixel does not work. 
*/ + return 0; +#endif + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#elif defined(__X86_ASM__) + int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_ctz(v); +#endif + } + +#if defined(__64BIT__) + __forceinline unsigned bsf(unsigned v) + { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#elif defined(__X86_ASM__) + unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_ctz(v); +#endif + } +#endif + + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) +#if defined(__X86_64__) + return _tzcnt_u64(v); +#else + return _tzcnt_u32(v); +#endif +#elif defined(__X86_ASM__) + size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_ctzl(v); +#endif + } + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__64BIT__) + __forceinline unsigned int bscf(unsigned int& v) + { + unsigned int i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } + + __forceinline int bsr(int v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__X86_ASM__) + int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_clz(v) ^ 31; +#endif + } + +#if defined(__64BIT__) + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__X86_ASM__) + unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_clz(v) ^ 31; +#endif + } +#endif + + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) +#if defined(__X86_64__) + return 63 - _lzcnt_u64(v); +#else + return 31 - _lzcnt_u32(v); +#endif +#elif defined(__X86_ASM__) + size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return (sizeof(v) * 8 - 1) - __builtin_clzl(v); +#endif + } + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline size_t blsr(size_t v) { +#if defined(__AVX2__) +#if defined(__INTEL_COMPILER) + return _blsr_u64(v); +#else +#if defined(__X86_64__) + return __blsr_u64(v); +#else + return __blsr_u32(v); +#endif +#endif +#else + return v & (v-1); +#endif + } + + __forceinline int btc(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#else + return (v ^ (1 << i)); +#endif + } + + __forceinline int bts(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v | (v << i)); +#endif + } + + __forceinline int btr(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v & ~(v << i)); +#endif + } + + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#else + return (v ^ (1 << i)); +#endif + } + + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v | (v << i)); +#endif + } + + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v & ~(v << i)); 
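// NOTE (editorial annotation, not part of the upstream Embree patch): bsf/bsr
// and btc/bts/btr mirror the x86 bit-scan and bit-test instructions, using
// inline asm or the BMI/LZCNT intrinsics when available and portable C
// expressions otherwise. Illustrative semantics on the x86 paths (the values
// below are assumptions chosen for the example):
//
//   int m = 0x14;             // 0b10100
//   int i = embree::bsf(m);   // 2, index of the lowest set bit
//   m = embree::btr(m, i);    // clears bit 2 -> 0x10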
+#endif + } + + __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { + return __sync_val_compare_and_swap(value, comparand, input); + } + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(_mm_undefined_ps) + __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } +#endif +#if !defined(_mm_undefined_si128) + __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } +#endif +#if !defined(_mm256_undefined_ps) && defined(__AVX__) + __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } +#endif +#if !defined(_mm256_undefined_si256) && defined(__AVX__) + __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } +#endif +#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) + __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } +#endif +#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) + __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } +#endif +#endif + +#if defined(__SSE4_2__) + + __forceinline int popcnt(int in) { + return _mm_popcnt_u32(in); + } + + __forceinline unsigned popcnt(unsigned in) { + return _mm_popcnt_u32(in); + } + +#if defined(__64BIT__) + __forceinline size_t popcnt(size_t in) { + return _mm_popcnt_u64(in); + } +#endif + +#endif + +#if defined(__X86_ASM__) + __forceinline uint64_t rdtsc() + { + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); + return clock; + } +#endif + + __forceinline void pause_cpu(const size_t N = 8) + { + for (size_t i=0; i<N; i++) + _mm_pause(); + } + + /* prefetches */ + __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); } + __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); } + __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); } + __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); } + __forceinline void prefetchEX (const void* ptr) { +#if defined(__INTEL_COMPILER) + _mm_prefetch((const char*)ptr,_MM_HINT_ET0); +#else + _mm_prefetch((const char*)ptr,_MM_HINT_T0); +#endif + } + + __forceinline void prefetchL1EX(const void* ptr) { + prefetchEX(ptr); + } + + __forceinline void prefetchL2EX(const void* ptr) { + prefetchEX(ptr); + } +#if defined(__AVX2__) + __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } + __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } +#if defined(__X86_64__) + __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); } + __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); } +#endif +#endif + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) + __forceinline float mm512_cvtss_f32(__m512 v) { + return _mm512_cvtss_f32(v); + } + __forceinline int mm512_mask2int(__mmask16 k1) { + return _mm512_mask2int(k1); + } + __forceinline __mmask16 mm512_int2mask(int mask) { + return _mm512_int2mask(mask); + } +#else + __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3 + return _mm_cvtss_f32(_mm512_castps512_ps128(v)); + } + __forceinline int 
mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3 + return (int)k1; + } + __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3 + return (__mmask16)mask; + } +#endif +#endif +} diff --git a/thirdparty/embree/common/sys/library.cpp b/thirdparty/embree/common/sys/library.cpp new file mode 100644 index 0000000000..fc983dffd5 --- /dev/null +++ b/thirdparty/embree/common/sys/library.cpp @@ -0,0 +1,83 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "library.h" +#include "sysinfo.h" +#include "filename.h" + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { + std::string fullName = file+".dll"; + FileName executable = getExecutableFileName(); + HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); + return lib_t(handle); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return (void*)GetProcAddress(HMODULE(lib),sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + FreeLibrary(HMODULE(lib)); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <dlfcn.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { +#if defined(__MACOSX__) + std::string fullName = "lib"+file+".dylib"; +#else + std::string fullName = "lib"+file+".so"; +#endif + void* lib = dlopen(fullName.c_str(), RTLD_NOW); + if (lib) return lib_t(lib); + FileName executable = getExecutableFileName(); + lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); + if (lib == nullptr) { + const char* error = dlerror(); + if (error) { + THROW_RUNTIME_ERROR(error); + } else { + THROW_RUNTIME_ERROR("could not load library "+executable.str()); + } + } + return lib_t(lib); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return dlsym(lib,sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + dlclose(lib); + } +} +#endif diff --git a/thirdparty/embree/common/sys/library.h b/thirdparty/embree/common/sys/library.h new file mode 100644 index 0000000000..67e14d2420 --- /dev/null +++ b/thirdparty/embree/common/sys/library.h @@ -0,0 +1,21 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! type for shared library */ + typedef struct opaque_lib_t* lib_t; + + /*! loads a shared library */ + lib_t openLibrary(const std::string& file); + + /*! returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym); + + /*! 
unloads a shared library */ + void closeLibrary(lib_t lib); +} diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp new file mode 100644 index 0000000000..789feaf2d8 --- /dev/null +++ b/thirdparty/embree/common/sys/mutex.cpp @@ -0,0 +1,57 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "mutex.h" +#include "regression.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } + MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } + void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } + bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } + void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + /*! system mutex using pthreads */ + MutexSys::MutexSys() + { + mutex = new pthread_mutex_t; + if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_init failed"); + } + + MutexSys::~MutexSys() + { + MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; + assert(ok); + delete (pthread_mutex_t*)mutex; + } + + void MutexSys::lock() + { + if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); + } + + bool MutexSys::try_lock() { + return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; + } + + void MutexSys::unlock() + { + if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); + } +}; +#endif diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h new file mode 100644 index 0000000000..4cb3626d92 --- /dev/null +++ b/thirdparty/embree/common/sys/mutex.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "intrinsics.h" +#include "atomic.h" + +namespace embree +{ + /*! system mutex */ + class MutexSys { + friend struct ConditionImplementation; + public: + MutexSys(); + ~MutexSys(); + + private: + MutexSys (const MutexSys& other) DELETED; // do not implement + MutexSys& operator= (const MutexSys& other) DELETED; // do not implement + + public: + void lock(); + bool try_lock(); + void unlock(); + + protected: + void* mutex; + }; + + /*! spinning mutex */ + class SpinLock + { + public: + + SpinLock () + : flag(false) {} + + __forceinline bool isLocked() { + return flag.load(); + } + + __forceinline void lock() + { + while (true) + { + while (flag.load()) + { + _mm_pause(); + _mm_pause(); + } + + bool expected = false; + if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) + break; + } + } + + __forceinline bool try_lock() + { + bool expected = false; + if (flag.load() != expected) { + return false; + } + return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); + } + + __forceinline void unlock() { + flag.store(false,std::memory_order_release); + } + + __forceinline void wait_until_unlocked() + { + while(flag.load()) + { + _mm_pause(); + _mm_pause(); + } + } + + public: + atomic<bool> flag; + }; + + /*! 
safe mutex lock and unlock helper */ + template<typename Mutex> class Lock { + public: + Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } + Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} + ~Lock() { if (locked) mutex.unlock(); } + __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } + __forceinline bool isLocked() const { return locked; } + protected: + Mutex& mutex; + bool locked; + }; +} diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h new file mode 100644 index 0000000000..697e07bb86 --- /dev/null +++ b/thirdparty/embree/common/sys/platform.h @@ -0,0 +1,392 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define _CRT_SECURE_NO_WARNINGS + +#include <cstddef> +#include <cassert> +#include <cstdlib> +#include <cstdio> +#include <memory> +#include <stdexcept> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <string> +#include <cstring> +#include <stdint.h> +#include <functional> + +//////////////////////////////////////////////////////////////////////////////// +/// detect platform +//////////////////////////////////////////////////////////////////////////////// + +/* detect 32 or 64 Intel platform */ +#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __X86_64__ +#define __X86_ASM__ +#elif defined(__i386__) || defined(_M_IX86) +#define __X86_ASM__ +#endif + +/* detect 64 bit platform */ +#if defined(__X86_64__) || defined(__aarch64__) +#define __64BIT__ +#endif + +/* detect Linux platform */ +#if defined(linux) || defined(__linux__) || defined(__LINUX__) +# if !defined(__LINUX__) +# define __LINUX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect FreeBSD platform */ +#if defined(__FreeBSD__) || defined(__FREEBSD__) +# if !defined(__FREEBSD__) +# define __FREEBSD__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ +#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) +# if !defined(__WIN32__) +# define __WIN32__ +# endif +#endif + +/* detect Cygwin platform */ +#if defined(__CYGWIN__) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect MAC OS X platform */ +#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) +# if !defined(__MACOSX__) +# define __MACOSX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* try to detect other Unix systems */ +#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Macros +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __WIN32__ +#define dll_export __declspec(dllexport) +#define dll_import __declspec(dllimport) +#else +#define dll_export __attribute__ ((visibility ("default"))) +#define dll_import +#endif + +// -- GODOT start -- +#if defined(__WIN32__) && !defined(__MINGW32__) +// -- GODOT end -- +#if !defined(__noinline) +#define __noinline __declspec(noinline) +#endif +//#define __forceinline __forceinline +//#define __restrict __restrict +#if defined(__INTEL_COMPILER) +#define __restrict__ __restrict +#else +#define __restrict__ //__restrict // causes issues with MSVC +#endif +#if !defined(__thread) +#define __thread 
__declspec(thread) +#endif +#if !defined(__aligned) +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#endif +//#define __FUNCTION__ __FUNCTION__ +#define debugbreak() __debugbreak() + +#else +#if !defined(__noinline) +#define __noinline __attribute__((noinline)) +#endif +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +//#define __restrict __restrict +//#define __thread __thread +#if !defined(__aligned) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif +#if !defined(__FUNCTION__) +#define __FUNCTION__ __PRETTY_FUNCTION__ +#endif +#define debugbreak() asm ("int $3") +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly + #define DELETED +#else + #define DELETED = delete +#endif + +// -- GODOT start -- +#if !defined(likely) +// -- GODOT end -- +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) __builtin_expect((bool)(expr),true ) +#define unlikely(expr) __builtin_expect((bool)(expr),false) +#endif +// -- GODOT start -- +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// Error handling and debugging +//////////////////////////////////////////////////////////////////////////////// + +/* debug printing macros */ +#define STRING(x) #x +#define TOSTRING(x) STRING(x) +#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl +#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl +#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ + abort(); + // -- GODOT end -- +#endif + +#define FATAL(x) THROW_RUNTIME_ERROR(x) +#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } + +#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") + +//////////////////////////////////////////////////////////////////////////////// +/// Basic types +//////////////////////////////////////////////////////////////////////////////// + +/* default floating-point type */ +namespace embree { + typedef float real; +} + +/* windows does not have ssize_t */ +#if defined(__WIN32__) +#if defined(__64BIT__) +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Basic utility functions 
+//////////////////////////////////////////////////////////////////////////////// + +__forceinline std::string toString(long long value) { + return std::to_string(value); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Disable some compiler warnings +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__INTEL_COMPILER) +//#pragma warning(disable:265 ) // floating-point operation result is out of range +//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used +//#pragma warning(disable:869 ) // parameter was never referenced +//#pragma warning(disable:981 ) // operands are evaluated in unspecified order +//#pragma warning(disable:1418) // external function definition with no prior declaration +//#pragma warning(disable:1419) // external declaration in primary source file +//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable +//#pragma warning(disable:94 ) // the size of an array must be greater than zero +//#pragma warning(disable:1599) // declaration hides parameter +//#pragma warning(disable:424 ) // extra ";" ignored +#pragma warning(disable:2196) // routine is both "inline" and "noinline" +//#pragma warning(disable:177 ) // label was declared but never referenced +//#pragma warning(disable:114 ) // function was referenced but not defined +//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function +#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient +#endif + +#if defined(_MSC_VER) +//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union +#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) +//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data +#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +//#pragma warning(disable:4355) // 'this' : used in base member initializer list +//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch +//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch +//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' +//#pragma warning(disable:4068) // unknown pragma +//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned +//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) +//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored +#pragma warning(disable:4503) // decorated name length exceeded, name was truncated +#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored +#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used + +# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) +# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings +# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 +# endif + +#endif + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +//#pragma clang diagnostic ignored "-Wunknown-pragmas" +//#pragma clang diagnostic ignored 
"-Wunused-variable" +//#pragma clang diagnostic ignored "-Wreorder" +//#pragma clang diagnostic ignored "-Wmicrosoft" +//#pragma clang diagnostic ignored "-Wunused-private-field" +//#pragma clang diagnostic ignored "-Wunused-local-typedef" +//#pragma clang diagnostic ignored "-Wunused-function" +//#pragma clang diagnostic ignored "-Wnarrowing" +//#pragma clang diagnostic ignored "-Wc++11-narrowing" +//#pragma clang diagnostic ignored "-Wdeprecated-register" +//#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wpragmas" +//#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +//#pragma GCC diagnostic ignored "-Warray-bounds" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wparentheses" +#endif + +#if defined(__clang__) && defined(__WIN32__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmicrosoft-cast" +#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" +#pragma clang diagnostic ignored "-Wmicrosoft-include" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunknown-pragmas" +#endif + +/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ +#if defined(__WIN32__) && defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated +#elif defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated +#elif defined(__clang__) +#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(__GNUC__) +#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(_MSC_VER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated +#endif + +/* embree output stream */ +#define embree_ostream std::ostream& +#define embree_cout std::cout +#define embree_cout_uniform std::cout +#define embree_endl std::endl + +//////////////////////////////////////////////////////////////////////////////// +/// Some macros for static profiling +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ 
__volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +namespace embree +{ + template<typename Closure> + struct OnScopeExitHelper + { + OnScopeExitHelper (const Closure f) : active(true), f(f) {} + ~OnScopeExitHelper() { if (active) f(); } + void deactivate() { active = false; } + bool active; + const Closure f; + }; + + template <typename Closure> + OnScopeExitHelper<Closure> OnScopeExit(const Closure f) { + return OnScopeExitHelper<Closure>(f); + } + +#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) +#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define ON_SCOPE_EXIT(code) \ + auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) + + template<typename Ty> + std::unique_ptr<Ty> make_unique(Ty* ptr) { + return std::unique_ptr<Ty>(ptr); + } + +} diff --git a/thirdparty/embree/common/sys/ref.h b/thirdparty/embree/common/sys/ref.h new file mode 100644 index 0000000000..c2b56c1908 --- /dev/null +++ b/thirdparty/embree/common/sys/ref.h @@ -0,0 +1,122 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "atomic.h" + +namespace embree +{ + struct NullTy { + }; + + extern MAYBE_UNUSED NullTy null; + + class RefCount + { + public: + RefCount(int val = 0) : refCounter(val) {} + virtual ~RefCount() {}; + + virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } + virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } + private: + std::atomic<size_t> refCounter; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Reference to single object + //////////////////////////////////////////////////////////////////////////////// + + template<typename Type> + class Ref + { + public: + Type* ptr; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Ref() : ptr(nullptr) {} + __forceinline Ref(NullTy) : ptr(nullptr) {} + __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } + __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } + + __forceinline Ref(Type* const input) : ptr(input) + { + if (ptr) + ptr->refInc(); + } + + __forceinline ~Ref() + { + if (ptr) + ptr->refDec(); + } + + __forceinline Ref& operator =(const Ref& input) + { + if (input.ptr) + input.ptr->refInc(); + if (ptr) + ptr->refDec(); + ptr = input.ptr; + return *this; + } + + __forceinline Ref& operator =(Ref&& input) + { + if (ptr) + ptr->refDec(); + ptr = input.ptr; + input.ptr = nullptr; + return *this; + } + + __forceinline Ref& operator =(Type* const input) + { + if (input) + input->refInc(); + if (ptr) + ptr->refDec(); + ptr = input; + return *this; + } + + __forceinline Ref& operator =(NullTy) + { + if (ptr) + ptr->refDec(); + ptr = nullptr; + return *this; + } + + __forceinline operator bool() const { return ptr != nullptr; } + + __forceinline const Type& operator *() const { return *ptr; } + __forceinline Type& operator *() { return *ptr; } + __forceinline 
const Type* operator ->() const { return ptr; } + __forceinline Type* operator ->() { return ptr; } + + template<typename TypeOut> + __forceinline Ref<TypeOut> cast() { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + + template<typename TypeOut> + __forceinline Ref<TypeOut> dynamicCast() { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + }; + + template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr < b.ptr; } + + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy ) { return a.ptr == nullptr; } + template<typename Type> __forceinline bool operator ==(NullTy , const Ref<Type>& b) { return nullptr == b.ptr; } + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr == b.ptr; } + + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy ) { return a.ptr != nullptr; } + template<typename Type> __forceinline bool operator !=(NullTy , const Ref<Type>& b) { return nullptr != b.ptr; } + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr != b.ptr; } +} diff --git a/thirdparty/embree/common/sys/regression.cpp b/thirdparty/embree/common/sys/regression.cpp new file mode 100644 index 0000000000..45315b1105 --- /dev/null +++ b/thirdparty/embree/common/sys/regression.cpp @@ -0,0 +1,30 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "regression.h" + +namespace embree +{ + /* registerRegressionTest is invoked from static initializers, thus + * we cannot have the regression_tests variable as global static + * variable due to issues with static variable initialization + * order. */ + std::vector<RegressionTest*>& get_regression_tests() + { + static std::vector<RegressionTest*> regression_tests; + return regression_tests; + } + + void registerRegressionTest(RegressionTest* test) + { + get_regression_tests().push_back(test); + } + + RegressionTest* getRegressionTest(size_t index) + { + if (index >= get_regression_tests().size()) + return nullptr; + + return get_regression_tests()[index]; + } +} diff --git a/thirdparty/embree/common/sys/regression.h b/thirdparty/embree/common/sys/regression.h new file mode 100644 index 0000000000..bb0bb94006 --- /dev/null +++ b/thirdparty/embree/common/sys/regression.h @@ -0,0 +1,25 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#include <vector> + +namespace embree +{ + /*! virtual interface for all regression tests */ + struct RegressionTest + { + RegressionTest (std::string name) : name(name) {} + virtual bool run() = 0; + std::string name; + }; + + /*! registers a regression test */ + void registerRegressionTest(RegressionTest* test); + + /*! 
returns the regression test with the given index */ + RegressionTest* getRegressionTest(size_t index); +} diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/string.cpp new file mode 100644 index 0000000000..f42fdc8536 --- /dev/null +++ b/thirdparty/embree/common/sys/string.cpp @@ -0,0 +1,42 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "string.h" + +#include <algorithm> +#include <ctype.h> + +namespace embree +{ + char to_lower(char c) { return char(tolower(int(c))); } + char to_upper(char c) { return char(toupper(int(c))); } + std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } + std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } + + Vec2f string_to_Vec2f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); + return Vec2f(x,y); + } + + Vec3f string_to_Vec3f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); + return Vec3f(x,y,z); + } + + Vec4f string_to_Vec4f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); str = str.substr(next+1); + const float w = std::stof(str,&next); + return Vec4f(x,y,z,w); + } +} diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/string.h new file mode 100644 index 0000000000..820076b21c --- /dev/null +++ b/thirdparty/embree/common/sys/string.h @@ -0,0 +1,37 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/vec4.h" + +namespace embree +{ + class IOStreamStateRestorer + { + public: + IOStreamStateRestorer(std::ostream& iostream) + : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { + } + + ~IOStreamStateRestorer() { + iostream.flags(flags); + iostream.precision(precision); + } + + private: + std::ostream& iostream; + std::ios::fmtflags flags; + std::streamsize precision; + }; + + std::string toLowerCase(const std::string& s); + std::string toUpperCase(const std::string& s); + + Vec2f string_to_Vec2f ( std::string str ); + Vec3f string_to_Vec3f ( std::string str ); + Vec4f string_to_Vec4f ( std::string str ); +} diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp new file mode 100644 index 0000000000..f1a59e511e --- /dev/null +++ b/thirdparty/embree/common/sys/sysinfo.cpp @@ -0,0 +1,656 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sysinfo.h" +#include "intrinsics.h" +#include "string.h" +#include "ref.h" +#if defined(__FREEBSD__) +#include <sys/cpuset.h> +#include <pthread_np.h> +typedef cpuset_t cpu_set_t; +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + NullTy null; + + std::string getPlatformName() + { +#if defined(__LINUX__) &&
!defined(__64BIT__) + return "Linux (32bit)"; +#elif defined(__LINUX__) && defined(__64BIT__) + return "Linux (64bit)"; +#elif defined(__FREEBSD__) && !defined(__64BIT__) + return "FreeBSD (32bit)"; +#elif defined(__FREEBSD__) && defined(__64BIT__) + return "FreeBSD (64bit)"; +#elif defined(__CYGWIN__) && !defined(__64BIT__) + return "Cygwin (32bit)"; +#elif defined(__CYGWIN__) && defined(__64BIT__) + return "Cygwin (64bit)"; +#elif defined(__WIN32__) && !defined(__64BIT__) + return "Windows (32bit)"; +#elif defined(__WIN32__) && defined(__64BIT__) + return "Windows (64bit)"; +#elif defined(__MACOSX__) && !defined(__64BIT__) + return "Mac OS X (32bit)"; +#elif defined(__MACOSX__) && defined(__64BIT__) + return "Mac OS X (64bit)"; +#elif defined(__UNIX__) && !defined(__64BIT__) + return "Unix (32bit)"; +#elif defined(__UNIX__) && defined(__64BIT__) + return "Unix (64bit)"; +#else + return "Unknown"; +#endif + } + + std::string getCompilerName() + { +#if defined(__INTEL_COMPILER) + int icc_mayor = __INTEL_COMPILER / 100 % 100; + int icc_minor = __INTEL_COMPILER % 100; + std::string version = "Intel Compiler "; + version += toString(icc_mayor); + version += "." + toString(icc_minor); +#if defined(__INTEL_COMPILER_UPDATE) + version += "." + toString(__INTEL_COMPILER_UPDATE); +#endif + return version; +#elif defined(__clang__) + return "CLANG " __clang_version__; +#elif defined (__GNUC__) + return "GCC " __VERSION__; +#elif defined(_MSC_VER) + std::string version = toString(_MSC_FULL_VER); + version.insert(4,"."); + version.insert(9,"."); + version.insert(2,"."); + return "Visual C++ Compiler " + version; +#else + return "Unknown Compiler"; +#endif + } + + std::string getCPUVendor() + { +#if defined(__X86_ASM__) + int cpuinfo[4]; + __cpuid (cpuinfo, 0); + int name[4]; + name[0] = cpuinfo[1]; + name[1] = cpuinfo[3]; + name[2] = cpuinfo[2]; + name[3] = 0; + return (char*)name; +#elif defined(__ARM_NEON) + return "ARM"; +#else + return "Unknown"; +#endif + } + + CPU getCPUModel() + { +#if defined(__X86_ASM__) + if (getCPUVendor() != "GenuineIntel") + return CPU::UNKNOWN; + + int out[4]; + __cpuid(out, 0); + if (out[0] < 1) return CPU::UNKNOWN; + __cpuid(out, 1); + + /* please see CPUID documentation for these formulas */ + uint32_t family_ID = (out[0] >> 8) & 0x0F; + uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; + + uint32_t model_ID = (out[0] >> 4) & 0x0F; + uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; + + uint32_t DisplayFamily = family_ID; + if (family_ID == 0x0F) + DisplayFamily += extended_family_ID; + + uint32_t DisplayModel = model_ID; + if (family_ID == 0x06 || family_ID == 0x0F) + DisplayModel += extended_model_ID << 4; + + uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); + + // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) + if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; + if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; + if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x066A) return 
CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; + if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; + + if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; + if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; + +#elif defined(__ARM_NEON) + return CPU::ARM; +#endif + + return CPU::UNKNOWN; + } + + std::string stringOfCPUModel(CPU model) + { + switch (model) { + case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; + case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; + case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; + case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; + case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; + case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; + case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; + case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; + case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; + case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; + case CPU::XEON_BROADWELL : return "Xeon Broadwell"; + case CPU::CORE_BROADWELL : return "Core Broadwell"; + case CPU::XEON_HASWELL : return "Xeon Haswell"; + case CPU::CORE_HASWELL : return "Core Haswell"; + case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; + case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; + case CPU::SANDY_BRIDGE : return "Sandy Bridge"; + case CPU::NEHALEM : return "Nehalem"; + case CPU::CORE2 : return "Core2"; + case CPU::CORE1 : return "Core"; + case CPU::ARM : return "ARM"; + case CPU::UNKNOWN : return "Unknown CPU"; + } + return "Unknown CPU (error)"; + } + +#if defined(__X86_ASM__) + /* constants to access destination registers of CPUID instruction */ + static const int EAX = 0; + static const int EBX = 1; + static const int ECX = 2; + static const 
int EDX = 3; + + /* cpuid[eax=1].ecx */ + static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; + static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; + static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; + static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; + static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; + //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22; + static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; + //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; + static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; + static const int CPU_FEATURE_BIT_AVX = 1 << 28; + static const int CPU_FEATURE_BIT_F16C = 1 << 29; + static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; + + /* cpuid[eax=1].edx */ + static const int CPU_FEATURE_BIT_SSE = 1 << 25; + static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; + + /* cpuid[eax=0x80000001].ecx */ + static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; + + /* cpuid[eax=7,ecx=0].ebx */ + static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; + static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; + static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; + static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) + static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) + static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) + static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) + static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) + static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) + static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) + static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) + + /* cpuid[eax=7,ecx=0].ecx */ + static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif + +#if defined(__X86_ASM__) + __noinline int64_t get_xcr0() + { +// -- GODOT start -- +#if defined (__WIN32__) && !defined (__MINGW32__) +// -- GODOT end -- + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +#else + int xcr0 = 0; + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); + return xcr0; +#endif + } +#endif + + int getCPUFeatures() + { +#if defined(__X86_ASM__) + /* cache CPU features access */ + static int cpu_features = 0; + if (cpu_features) + return cpu_features; + + /* get number of CPUID leaves */ + int cpuid_leaf0[4]; + __cpuid(cpuid_leaf0, 0x00000000); + unsigned nIds = cpuid_leaf0[EAX]; + + /* get number of extended CPUID leaves */ + int cpuid_leafe[4]; + __cpuid(cpuid_leafe, 0x80000000); + unsigned nExIds = cpuid_leafe[EAX]; + + /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ + int cpuid_leaf_1[4] = { 0,0,0,0 }; + int cpuid_leaf_7[4] = { 0,0,0,0 }; + int cpuid_leaf_e1[4] = { 0,0,0,0 }; + if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); +#if _WIN32 +#if _MSC_VER && (_MSC_FULL_VER < 160040219) +#else + if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); +#endif +#else + if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); +#endif + if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); + + /* detect if OS saves XMM, YMM, and ZMM states */ + bool xmm_enabled = true; + bool ymm_enabled = false; + bool zmm_enabled = false; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { + int64_t xcr0 = get_xcr0(); + 
xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */ + ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ + zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ + } + if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; + if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; + if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; + + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; + + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; + if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; + + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; + if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; + + return cpu_features; +#elif defined(__ARM_NEON) + /* emulated features with sse2neon */ + return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED; +#else + /* Unknown CPU. 
*/ + return 0; +#endif + } + + std::string stringOfCPUFeatures(int features) + { + std::string str; + if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; + if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; + if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; + if (features & CPU_FEATURE_SSE ) str += "SSE "; + if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; + if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; + if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; + if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; + if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; + if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; + if (features & CPU_FEATURE_AVX ) str += "AVX "; + if (features & CPU_FEATURE_F16C ) str += "F16C "; + if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; + if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; + if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; + if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; + if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; + if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; + if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; + if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; + if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; + if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; + if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; + if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW "; + if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; + if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; + if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + return str; + } + + std::string stringOfISA (int isa) + { + if (isa == SSE) return "SSE"; + if (isa == SSE2) return "SSE2"; + if (isa == SSE3) return "SSE3"; + if (isa == SSSE3) return "SSSE3"; + if (isa == SSE41) return "SSE4.1"; + if (isa == SSE42) return "SSE4.2"; + if (isa == AVX) return "AVX"; + if (isa == AVX2) return "AVX2"; + if (isa == AVX512) return "AVX512"; + return "UNKNOWN"; + } + + bool hasISA(int features, int isa) { + return (features & isa) == isa; + } + + std::string supportedTargetList (int features) + { + std::string v; + if (hasISA(features,SSE)) v += "SSE "; + if (hasISA(features,SSE2)) v += "SSE2 "; + if (hasISA(features,SSE3)) v += "SSE3 "; + if (hasISA(features,SSSE3)) v += "SSSE3 "; + if (hasISA(features,SSE41)) v += "SSE4.1 "; + if (hasISA(features,SSE42)) v += "SSE4.2 "; + if (hasISA(features,AVX)) v += "AVX "; + if (hasISA(features,AVXI)) v += "AVXI "; + if (hasISA(features,AVX2)) v += "AVX2 "; + if (hasISA(features,AVX512)) v += "AVX512 "; + return v; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <psapi.h> + +namespace embree +{ + std::string getExecutableFileName() { + char filename[1024]; + if (!GetModuleFileName(nullptr, filename, sizeof(filename))) + return std::string(); + return std::string(filename); + } + + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + 
GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); + + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0; + for (int i = 0; i < groups; i++) + totalProcessors += pGetActiveProcessorCount(i); + nThreads = totalProcessors; + } + else + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + nThreads = sysinfo.dwNumberOfProcessors; + } + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); + if (handle == INVALID_HANDLE_VALUE) return 80; + CONSOLE_SCREEN_BUFFER_INFO info; + memset(&info,0,sizeof(info)); + GetConsoleScreenBufferInfo(handle, &info); + return info.dwSize.X; + } + + double getSeconds() + { + LARGE_INTEGER freq, val; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&val); + return (double)val.QuadPart / (double)freq.QuadPart; + } + + void sleepSeconds(double t) { + Sleep(DWORD(1000.0*t)); + } + + size_t getVirtualMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.QuotaPeakPagedPoolUsage; + } + + size_t getResidentMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.WorkingSetSize; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include <stdio.h> +#include <unistd.h> + +namespace embree +{ + std::string getExecutableFileName() + { + std::string pid = "/proc/" + toString(getpid()) + "/exe"; + char buf[4096]; + memset(buf,0,sizeof(buf)); + if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return virt*sysconf(_SC_PAGE_SIZE); + } + + size_t getResidentMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return resident*sysconf(_SC_PAGE_SIZE); + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__FreeBSD__) + +#include <sys/sysctl.h> + +namespace embree +{ + std::string getExecutableFileName() + { + const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + char buf[4096]; + memset(buf,0,sizeof(buf)); + size_t len = sizeof(buf)-1; + if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Mac OS X Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach-o/dyld.h> + +namespace embree +{ + std::string getExecutableFileName() + { + char buf[4096]; + uint32_t size = sizeof(buf); + if (_NSGetExecutablePath(buf, &size) != 0) + return std::string(); + return std::string(buf); + } + + size_t 
getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <pthread.h> + +namespace embree +{ + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +// -- GODOT start -- +// #if defined(__MACOSX__) +#if defined(__MACOSX__) || defined(__ANDROID__) +// -- GODOT end -- + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); +#else + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + nThreads = CPU_COUNT(&set); +#endif + + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + struct winsize info; + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; + return info.ws_col; + } + + double getSeconds() { + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; + } + + void sleepSeconds(double t) { + usleep(1000000.0*t); + } +} +#endif + diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h new file mode 100644 index 0000000000..72351d12e4 --- /dev/null +++ b/thirdparty/embree/common/sys/sysinfo.h @@ -0,0 +1,178 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define CACHELINE_SIZE 64 + +#if !defined(PAGE_SIZE) + #define PAGE_SIZE 4096 +#endif + +#define PAGE_SIZE_2M (2*1024*1024) +#define PAGE_SIZE_4K (4*1024) + +#include "platform.h" + +/* define isa namespace and ISA bitvector */ +#if defined (__AVX512VL__) +# define isa avx512 +# define ISA AVX512 +# define ISA_STR "AVX512" +#elif defined (__AVX2__) +# define isa avx2 +# define ISA AVX2 +# define ISA_STR "AVX2" +#elif defined(__AVXI__) +# define isa avxi +# define ISA AVXI +# define ISA_STR "AVXI" +#elif defined(__AVX__) +# define isa avx +# define ISA AVX +# define ISA_STR "AVX" +#elif defined (__SSE4_2__) +# define isa sse42 +# define ISA SSE42 +# define ISA_STR "SSE4.2" +//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 +//# define isa sse41 +//# define ISA SSE41 +//# define ISA_STR "SSE4.1" +//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC +//# define isa ssse3 +//# define ISA SSSE3 +//# define ISA_STR "SSSE3" +//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang +//# define isa sse3 +//# define ISA SSE3 +//# define ISA_STR "SSE3" +#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) +# define isa sse2 +# define ISA SSE2 +# define ISA_STR "SSE2" +#elif defined(__SSE__) +# define isa sse +# define ISA SSE +# define ISA_STR "SSE" +#else +#error Unknown ISA +#endif + +namespace embree +{ + enum class CPU + { + XEON_ICE_LAKE, + CORE_ICE_LAKE, + CORE_TIGER_LAKE, + CORE_COMET_LAKE, + CORE_CANNON_LAKE, + CORE_KABY_LAKE, + XEON_SKY_LAKE, + CORE_SKY_LAKE, + XEON_PHI_KNIGHTS_MILL, + XEON_PHI_KNIGHTS_LANDING, + XEON_BROADWELL, + CORE_BROADWELL, + XEON_HASWELL, + CORE_HASWELL, + XEON_IVY_BRIDGE, + CORE_IVY_BRIDGE, + SANDY_BRIDGE, + NEHALEM, + CORE2, + CORE1, + ARM, + UNKNOWN, + }; + + /*! 
get the full path to the running executable */ + std::string getExecutableFileName(); + + /*! return platform name */ + std::string getPlatformName(); + + /*! get the full name of the compiler */ + std::string getCompilerName(); + + /*! return the name of the CPU */ + std::string getCPUVendor(); + + /*! get microprocessor model */ + CPU getCPUModel(); + + /*! converts CPU model into string */ + std::string stringOfCPUModel(CPU model); + + /*! CPU features */ + static const int CPU_FEATURE_SSE = 1 << 0; + static const int CPU_FEATURE_SSE2 = 1 << 1; + static const int CPU_FEATURE_SSE3 = 1 << 2; + static const int CPU_FEATURE_SSSE3 = 1 << 3; + static const int CPU_FEATURE_SSE41 = 1 << 4; + static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_POPCNT = 1 << 6; + static const int CPU_FEATURE_AVX = 1 << 7; + static const int CPU_FEATURE_F16C = 1 << 8; + static const int CPU_FEATURE_RDRAND = 1 << 9; + static const int CPU_FEATURE_AVX2 = 1 << 10; + static const int CPU_FEATURE_FMA3 = 1 << 11; + static const int CPU_FEATURE_LZCNT = 1 << 12; + static const int CPU_FEATURE_BMI1 = 1 << 13; + static const int CPU_FEATURE_BMI2 = 1 << 14; + static const int CPU_FEATURE_AVX512F = 1 << 16; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512PF = 1 << 18; + static const int CPU_FEATURE_AVX512ER = 1 << 19; + static const int CPU_FEATURE_AVX512CD = 1 << 20; + static const int CPU_FEATURE_AVX512BW = 1 << 21; + static const int CPU_FEATURE_AVX512VL = 1 << 22; + static const int CPU_FEATURE_AVX512IFMA = 1 << 23; + static const int CPU_FEATURE_AVX512VBMI = 1 << 24; + static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; + static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; + static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; + + /*! get CPU features */ + int getCPUFeatures(); + + /*! convert CPU features into a string */ + std::string stringOfCPUFeatures(int features); + + /*! creates a string of all supported targets that are supported */ + std::string supportedTargetList (int isa); + + /*! ISAs */ + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE2 = SSE | CPU_FEATURE_SSE2; + static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; + static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; + static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; + static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; + static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; + static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; + static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; + static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + + /*! converts ISA bitvector into a string */ + std::string stringOfISA(int features); + + /*! return the number of logical threads of the system */ + unsigned int getNumberOfLogicalThreads(); + + /*! returns the size of the terminal window in characters */ + int getTerminalWidth(); + + /*! returns performance counter in seconds */ + double getSeconds(); + + /*! sleeps the specified number of seconds */ + void sleepSeconds(double t); + + /*! returns virtual address space occupied by process */ + size_t getVirtualMemoryBytes(); + + /*! 
returns resident memory required by process */ + size_t getResidentMemoryBytes(); +} diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp new file mode 100644 index 0000000000..f4014be89b --- /dev/null +++ b/thirdparty/embree/common/sys/thread.cpp @@ -0,0 +1,474 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "thread.h" +#include "sysinfo.h" +#include "string.h" + +#include <iostream> +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <xmmintrin.h> +#endif + +#if defined(PTHREADS_WIN32) +#pragma comment (lib, "pthreadVC.lib") +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /*! set the affinity of a given thread */ + void setAffinity(HANDLE thread, ssize_t affinity) + { + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); + typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); + SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); + SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0, group = 0, number = 0; + for (int i = 0; i<groups; i++) { + int processors = pGetActiveProcessorCount(i); + if (totalProcessors + processors > affinity) { + group = i; + number = (int)affinity - totalProcessors; + break; + } + totalProcessors += processors; + } + + GROUP_AFFINITY groupAffinity; + groupAffinity.Group = (WORD)group; + groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) + WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning + + PROCESSOR_NUMBER processorNumber; + processorNumber.Group = group; + processorNumber.Number = number; + processorNumber.Reserved = 0; + if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) + WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning + } + else + { + if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) + WARNING("SetThreadAffinityMask failed"); // on purpose only a warning + if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) + WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning + } + } + + /*! 
set affinity of the calling thread */ + void setAffinity(ssize_t affinity) { + setAffinity(GetCurrentThread(), affinity); + } + + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg) + : f(f), arg(arg) {} + public: + thread_func f; + void* arg; + }; + + DWORD WINAPI threadStartup(LPVOID ptr) + { + ThreadStartupData* parg = (ThreadStartupData*) ptr; + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + parg->f(parg->arg); + delete parg; + return 0; + } + +#if !defined(PTHREADS_WIN32) + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); + if (thread == nullptr) FATAL("CreateThread failed"); + if (threadID >= 0) setAffinity(thread, threadID); + return thread_t(thread); + } + + /*! the thread calling this function gets yielded */ + void yield() { + SwitchToThread(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + WaitForSingleObject(HANDLE(tid), INFINITE); + CloseHandle(HANDLE(tid)); + } + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { + TerminateThread(HANDLE(tid),0); + CloseHandle(HANDLE(tid)); + } + + /*! creates thread local storage */ + tls_t createTls() { + return tls_t(size_t(TlsAlloc())); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) { + TlsSetValue(DWORD(size_t(tls)), ptr); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) { + return TlsGetValue(DWORD(size_t(tls))); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) { + TlsFree(DWORD(size_t(tls))); + } +#endif +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +// -- GODOT start -- +#if defined(__LINUX__) && !defined(__ANDROID__) +// -- GODOT end -- + +#include <fstream> +#include <sstream> +#include <algorithm> + +namespace embree +{ + static MutexSys mutex; + static std::vector<size_t> threadIDs; + + /* changes thread ID mapping such that we first fill up all thread on one core */ + size_t mapThreadID(size_t threadID) + { + Lock<MutexSys> lock(mutex); + + if (threadIDs.size() == 0) + { + /* parse thread/CPU topology */ + for (size_t cpuID=0;;cpuID++) + { + std::fstream fs; + std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); + fs.open (cpu.c_str(), std::fstream::in); + if (fs.fail()) break; + + int i; + while (fs >> i) + { + if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) + threadIDs.push_back(i); + if (fs.peek() == ',') + fs.ignore(); + } + fs.close(); + } + +#if 0 + for (size_t i=0;i<threadIDs.size();i++) + std::cout << i << " -> " << threadIDs[i] << std::endl; +#endif + + /* verify the mapping and do not use it if the mapping has errors */ + for (size_t i=0;i<threadIDs.size();i++) { + for (size_t j=0;j<threadIDs.size();j++) { + if (i != j && threadIDs[i] == threadIDs[j]) { + threadIDs.clear(); + } + } + } + } + + /* re-map threadIDs if mapping is available */ + size_t ID = threadID; + if (threadID < threadIDs.size()) + ID = threadIDs[threadID]; + + /* find correct thread to affinitize to */ + cpu_set_t set; + if 
(pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + { + for (int i=0, j=0; i<CPU_SETSIZE; i++) + { + if (!CPU_ISSET(i,&set)) continue; + + if (j == ID) { + ID = i; + break; + } + j++; + } + } + + return ID; + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpu_set_t cset; + CPU_ZERO(&cset); + size_t threadID = mapThreadID(affinity); + CPU_SET(threadID, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +// -- GODOT start -- +//////////////////////////////////////////////////////////////////////////////// +/// Android Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__ANDROID__) + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpu_set_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + sched_setaffinity(0, sizeof(cset), &cset); + } +} +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__FreeBSD__) + +#include <pthread_np.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// MacOSX Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach/thread_act.h> +#include <mach/thread_policy.h> +#include <mach/mach_init.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { +#if !defined(__ARM_NEON) // affinity seems not supported on M1 chip + + thread_affinity_policy ap; + ap.affinity_tag = affinity; + if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) + WARNING("setting thread affinity failed"); // on purpose only a warning + +#endif + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) + +#include <pthread.h> +#include <sched.h> + +#if defined(__USE_NUMA__) +#include <numa.h> +#endif + +namespace embree +{ + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg, int affinity) + : f(f), arg(arg), affinity(affinity) {} + public: + thread_func f; + void* arg; + ssize_t affinity; + }; + + static void* threadStartup(ThreadStartupData* parg) + { + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + + /*! Mac OS X does not support setting affinity at thread creation time */ +#if defined(__MACOSX__) + if (parg->affinity >= 0) + setAffinity(parg->affinity); +#endif + + parg->f(parg->arg); + delete parg; + return nullptr; + } + + /*! 
creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + /* set stack size */ + pthread_attr_t attr; + pthread_attr_init(&attr); + if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); + + /* create thread */ + pthread_t* tid = new pthread_t; + if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { + pthread_attr_destroy(&attr); + delete tid; + FATAL("pthread_create failed"); + } + pthread_attr_destroy(&attr); + + /* set affinity */ +// -- GODOT start -- +#if defined(__LINUX__) && !defined(__ANDROID__) +// -- GODOT end -- + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + threadID = mapThreadID(threadID); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#elif defined(__FreeBSD__) + if (threadID >= 0) { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +// -- GODOT start -- +#elif defined(__ANDROID__) + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); + } +#endif +// -- GODOT end -- + + return thread_t(tid); + } + + /*! the thread calling this function gets yielded */ + void yield() { + sched_yield(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + if (pthread_join(*(pthread_t*)tid, nullptr) != 0) + FATAL("pthread_join failed"); + delete (pthread_t*)tid; + } + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { +// -- GODOT start -- +#if defined(__ANDROID__) + FATAL("Can't destroy threads on Android."); +#else + pthread_cancel(*(pthread_t*)tid); + delete (pthread_t*)tid; +#endif +// -- GODOT end -- + } + + /*! creates thread local storage */ + tls_t createTls() + { + pthread_key_t* key = new pthread_key_t; + if (pthread_key_create(key,nullptr) != 0) { + delete key; + FATAL("pthread_key_create failed"); + } + + return tls_t(key); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) + { + assert(tls); + return pthread_getspecific(*(pthread_key_t*)tls); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) + { + assert(tls); + if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) + FATAL("pthread_setspecific failed"); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) + { + assert(tls); + if (pthread_key_delete(*(pthread_key_t*)tls) != 0) + FATAL("pthread_key_delete failed"); + delete (pthread_key_t*)tls; + } +} + +#endif diff --git a/thirdparty/embree/common/sys/thread.h b/thirdparty/embree/common/sys/thread.h new file mode 100644 index 0000000000..92a10d5c5d --- /dev/null +++ b/thirdparty/embree/common/sys/thread.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "mutex.h" +#include "alloc.h" +#include "vector.h" +#include <vector> + +namespace embree +{ + /*! type for thread */ + typedef struct opaque_thread_t* thread_t; + + /*! signature of thread start function */ + typedef void (*thread_func)(void*); + + /*! creates a hardware thread running on specific logical thread */ + thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); + + /*! 
set affinity of the calling thread */ + void setAffinity(ssize_t affinity); + + /*! the thread calling this function gets yielded */ + void yield(); + + /*! waits until the given thread has terminated */ + void join(thread_t tid); + + /*! destroy handle of a thread */ + void destroyThread(thread_t tid); + + /*! type for handle to thread local storage */ + typedef struct opaque_tls_t* tls_t; + + /*! creates thread local storage */ + tls_t createTls(); + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr); + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls); + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls); +} diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h new file mode 100644 index 0000000000..f832626789 --- /dev/null +++ b/thirdparty/embree/common/sys/vector.h @@ -0,0 +1,242 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "alloc.h" +#include <algorithm> + +namespace embree +{ + template<typename T, typename allocator> + class vector_t + { + public: + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + + __forceinline vector_t () + : size_active(0), size_alloced(0), items(nullptr) {} + + __forceinline explicit vector_t (size_t sz) + : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + template<typename M> + __forceinline explicit vector_t (M alloc, size_t sz) + : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline ~vector_t() { + clear(); + } + + __forceinline vector_t (const vector_t& other) + { + size_active = other.size_active; + size_alloced = other.size_alloced; + items = alloc.allocate(size_alloced); + for (size_t i=0; i<size_active; i++) + ::new (&items[i]) value_type(other.items[i]); + } + + __forceinline vector_t (vector_t&& other) + : alloc(std::move(other.alloc)) + { + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + } + + __forceinline vector_t& operator=(const vector_t& other) + { + resize(other.size_active); + for (size_t i=0; i<size_active; i++) + items[i] = value_type(other.items[i]); + return *this; + } + + __forceinline vector_t& operator=(vector_t&& other) + { + clear(); + alloc = std::move(other.alloc); + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + return *this; + } + + /********************** Iterators ****************************/ + + __forceinline iterator begin() { return items; }; + __forceinline const_iterator begin() const { return items; }; + + __forceinline iterator end () { return items+size_active; }; + __forceinline const_iterator end () const { return items+size_active; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return size_active == 0; } + __forceinline size_t size () const { return size_active; } + __forceinline size_t capacity () const { return size_alloced; } + + + __forceinline void resize(size_t new_size) { + internal_resize(new_size,internal_grow_size(new_size)); + } + + __forceinline void reserve(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return; + + /* resize exact 
otherwise */ + internal_resize(size_active,new_alloced); + } + + __forceinline void shrink_to_fit() { + internal_resize(size_active,size_active); + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& front() const { assert(size_active > 0); return items[0]; }; + __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& nt) + { + const T v = nt; // need local copy as input reference could point to this vector + internal_resize(size_active,internal_grow_size(size_active+1)); + ::new (&items[size_active++]) T(v); + } + + __forceinline void pop_back() + { + assert(!empty()); + size_active--; + alloc.destroy(&items[size_active]); + } + + __forceinline void clear() + { + /* destroy elements */ + for (size_t i=0; i<size_active; i++) + alloc.destroy(&items[i]); + + /* free memory */ + alloc.deallocate(items,size_alloced); + items = nullptr; + size_active = size_alloced = 0; + } + + /******************** Comparisons **************************/ + + friend bool operator== (const vector_t& a, const vector_t& b) + { + if (a.size() != b.size()) return false; + for (size_t i=0; i<a.size(); i++) + if (a[i] != b[i]) + return false; + return true; + } + + friend bool operator!= (const vector_t& a, const vector_t& b) { + return !(a==b); + } + + private: + + __forceinline void internal_resize_init(size_t new_active) + { + assert(size_active == 0); + assert(size_alloced == 0); + assert(items == nullptr); + if (new_active == 0) return; + items = alloc.allocate(new_active); + for (size_t i=0; i<new_active; i++) ::new (&items[i]) T(); + size_active = new_active; + size_alloced = new_active; + } + + __forceinline void internal_resize(size_t new_active, size_t new_alloced) + { + assert(new_active <= new_alloced); + + /* destroy elements */ + if (new_active < size_active) + { + for (size_t i=new_active; i<size_active; i++) + alloc.destroy(&items[i]); + size_active = new_active; + } + + /* only reallocate if necessary */ + if (new_alloced == size_alloced) { + for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T; + size_active = new_active; + return; + } + + /* reallocate and copy items */ + T* old_items = items; + items = alloc.allocate(new_alloced); + for (size_t i=0; i<size_active; i++) { + ::new (&items[i]) T(std::move(old_items[i])); + alloc.destroy(&old_items[i]); + } + + for (size_t i=size_active; i<new_active; i++) { + ::new (&items[i]) T; + } + + alloc.deallocate(old_items,size_alloced); + size_active = new_active; + size_alloced = new_alloced; + } + + __forceinline size_t internal_grow_size(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return size_alloced; + + /* resize to next power of 2 otherwise */ + size_t new_size_alloced = size_alloced; + while (new_size_alloced < new_alloced) { + new_size_alloced = std::max(size_t(1),2*new_size_alloced); + } + return new_size_alloced; + } + + private: + 
allocator alloc; + size_t size_active; // number of valid items + size_t size_alloced; // number of items allocated + T* items; // data array + }; + + /*! vector class that performs standard allocations */ + template<typename T> + using vector = vector_t<T,std::allocator<T>>; + + /*! vector class that performs aligned allocations */ + template<typename T> + using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >; + + /*! vector class that performs OS allocations */ + template<typename T> + using ovector = vector_t<T,os_allocator<T> >; +} diff --git a/thirdparty/embree/common/tasking/taskscheduler.h b/thirdparty/embree/common/tasking/taskscheduler.h new file mode 100644 index 0000000000..8f3dd87689 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskscheduler.h @@ -0,0 +1,15 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#if defined(TASKING_INTERNAL) +# include "taskschedulerinternal.h" +#elif defined(TASKING_TBB) +# include "taskschedulertbb.h" +#elif defined(TASKING_PPL) +# include "taskschedulerppl.h" +#else +# error "no tasking system enabled" +#endif + diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp new file mode 100644 index 0000000000..ad438588a3 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp @@ -0,0 +1,420 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "taskschedulerinternal.h" +#include "../math/math.h" +#include "../sys/sysinfo.h" +#include <algorithm> + +namespace embree +{ + RTC_NAMESPACE_BEGIN + + static MutexSys g_mutex; + size_t TaskScheduler::g_numThreads = 0; + __thread TaskScheduler* TaskScheduler::g_instance = nullptr; + std::vector<Ref<TaskScheduler>> g_instance_vector; + __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; + TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; + + template<typename Predicate, typename Body> + __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) + { + while (true) + { + /*! some rounds that yield */ + for (size_t i=0; i<32; i++) + { + /*! some spinning rounds */ + const size_t threadCount = thread.threadCount(); + for (size_t j=0; j<1024; j+=threadCount) + { + if (!pred()) return; + if (thread.scheduler->steal_from_other_threads(thread)) { + i=j=0; + body(); + } + } + yield(); + } + } + } + + /*! run this task */ + void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible + { + /* try to run if not already stolen */ + if (try_switch_state(INITIALIZED,DONE)) + { + Task* prevTask = thread.task; + thread.task = this; + // -- GODOT start -- + // try { + // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); + // } catch (...) { + // if (thread.scheduler->cancellingException == nullptr) + // thread.scheduler->cancellingException = std::current_exception(); + // } + // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } + + /* steal until all dependencies have completed */ + steal_loop(thread, + [&] () { return dependencies>0; }, + [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); + + /* now signal our parent task that we are finished */ + if (parent) + parent->add_dependencies(-1); + } + + /*! 
run this task */ + dll_export void TaskScheduler::Task::run (Thread& thread) { + run_internal(thread); + } + + bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) + { + /* stop if we run out of local tasks or reach the waiting task */ + if (right == 0 || &tasks[right-1] == parent) + return false; + + /* execute task */ + size_t oldRight = right; + tasks[right-1].run_internal(thread); + if (right != oldRight) { + THROW_RUNTIME_ERROR("you have to wait for spawned subtasks"); + } + + /* pop task and closure from stack */ + right--; + if (tasks[right].stackPtr != size_t(-1)) + stackPtr = tasks[right].stackPtr; + + /* also move left pointer */ + if (left >= right) left.store(right.load()); + + return right != 0; + } + + dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { + return execute_local_internal(thread,parent); + } + + bool TaskScheduler::TaskQueue::steal(Thread& thread) + { + size_t l = left; + size_t r = right; + if (l < r) + { + l = left++; + if (l >= r) + return false; + } + else + return false; + + if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) + return false; + + thread.tasks.right++; + return true; + } + + /* we steal from the left */ + size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft() + { + if (left >= right) return 0; + return tasks[left].N; + } + + void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair) + { + TaskScheduler::ThreadPool* pool = pair->first; + size_t threadIndex = pair->second; + delete pair; + pool->thread_loop(threadIndex); + } + + TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) + : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {} + + dll_export void TaskScheduler::ThreadPool::startThreads() + { + if (running) return; + setNumThreads(numThreads,true); + } + + void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) + { + Lock<MutexSys> lock(g_mutex); + assert(newNumThreads); + newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); + + numThreads = newNumThreads; + if (!startThreads && !running) return; + running = true; + size_t numThreadsActive = numThreadsRunning; + + mutex.lock(); + numThreadsRunning = newNumThreads; + mutex.unlock(); + condition.notify_all(); + + /* start new threads */ + for (size_t t=numThreadsActive; t<numThreads; t++) + { + if (t == 0) continue; + auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t); + threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? 
t : -1)); + } + + /* stop some threads if we reduce the number of threads */ + for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) { + if (t == 0) continue; + embree::join(threads.back()); + threads.pop_back(); + } + } + + TaskScheduler::ThreadPool::~ThreadPool() + { + /* leave all taskschedulers */ + mutex.lock(); + numThreadsRunning = 0; + mutex.unlock(); + condition.notify_all(); + + /* wait for threads to terminate */ + for (size_t i=0; i<threads.size(); i++) + embree::join(threads[i]); + } + + dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler) + { + mutex.lock(); + schedulers.push_back(scheduler); + mutex.unlock(); + condition.notify_all(); + } + + dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler) + { + Lock<MutexSys> lock(mutex); + for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) { + if (scheduler == *it) { + schedulers.erase(it); + return; + } + } + } + + void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) + { + while (globalThreadIndex < numThreadsRunning) + { + Ref<TaskScheduler> scheduler = NULL; + ssize_t threadIndex = -1; + { + Lock<MutexSys> lock(mutex); + condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); + if (globalThreadIndex >= numThreadsRunning) break; + scheduler = schedulers.front(); + threadIndex = scheduler->allocThreadIndex(); + } + scheduler->thread_loop(threadIndex); + } + } + + TaskScheduler::TaskScheduler() + : threadCounter(0), anyTasksRunning(0), hasRootTask(false) + { + threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x. 
+ for (size_t i=0; i<threadLocal.size(); i++) + threadLocal[i].store(nullptr); + } + + TaskScheduler::~TaskScheduler() + { + assert(threadCounter == 0); + } + + dll_export size_t TaskScheduler::threadID() + { + Thread* thread = TaskScheduler::thread(); + if (thread) return thread->threadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadIndex() + { + Thread* thread = TaskScheduler::thread(); + if (thread) return thread->threadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadCount() { + return threadPool->size(); + } + + dll_export TaskScheduler* TaskScheduler::instance() + { + if (g_instance == NULL) { + Lock<MutexSys> lock(g_mutex); + g_instance = new TaskScheduler; + g_instance_vector.push_back(g_instance); + } + return g_instance; + } + + void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads) + { + if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); + threadPool->setNumThreads(numThreads,start_threads); + } + + void TaskScheduler::destroy() { + delete threadPool; threadPool = nullptr; + } + + dll_export ssize_t TaskScheduler::allocThreadIndex() + { + size_t threadIndex = threadCounter++; + assert(threadIndex < threadLocal.size()); + return threadIndex; + } + + void TaskScheduler::join() + { + mutex.lock(); + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); + // -- GODOT start -- + // std::exception_ptr except = thread_loop(threadIndex); + // if (except != nullptr) std::rethrow_exception(except); + thread_loop(threadIndex); + // -- GODOT end -- + } + + void TaskScheduler::reset() { + hasRootTask = false; + } + + void TaskScheduler::wait_for_threads(size_t threadCount) + { + while (threadCounter < threadCount-1) + pause_cpu(); + } + + dll_export TaskScheduler::Thread* TaskScheduler::thread() { + return thread_local_thread; + } + + dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) + { + Thread* old = thread_local_thread; + thread_local_thread = thread; + return old; + } + + dll_export bool TaskScheduler::wait() + { + Thread* thread = TaskScheduler::thread(); + if (thread == nullptr) return true; + while (thread->tasks.execute_local_internal(*thread,thread->task)) {}; + return thread->scheduler->cancellingException == nullptr; + } + +// -- GODOT start -- +// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) + void TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + threadLocal[threadIndex].store(&thread); + Thread* oldThread = swapThread(&thread); + + /* main thread loop */ + while (anyTasksRunning) + { + steal_loop(thread, + [&] () { return anyTasksRunning > 0; }, + [&] () { + anyTasksRunning++; + while (thread.tasks.execute_local_internal(thread,nullptr)); + anyTasksRunning--; + }); + } + threadLocal[threadIndex].store(nullptr); + swapThread(oldThread); + + /* remember exception to throw */ + // -- GODOT start -- + // std::exception_ptr except = nullptr; + // if (cancellingException != nullptr) except = cancellingException; + // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; +#if defined(__WIN32__) + size_t loopIndex = 1; +#endif +#define LOOP_YIELD_THRESHOLD (4096) + while (threadCounter > 0) { +#if defined(__WIN32__) + if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) + 
yield(); + else + _mm_pause(); + loopIndex++; +#else + yield(); +#endif + } + // -- GODOT start -- + // return except; + return; + // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) + { + const size_t threadIndex = thread.threadIndex; + const size_t threadCount = this->threadCounter; + + for (size_t i=1; i<threadCount; i++) + { + pause_cpu(32); + size_t otherThreadIndex = threadIndex+i; + if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount; + + Thread* othread = threadLocal[otherThreadIndex].load(); + if (!othread) + continue; + + if (othread->tasks.steal(thread)) + return true; + } + + return false; + } + + dll_export void TaskScheduler::startThreads() { + threadPool->startThreads(); + } + + dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) { + threadPool->add(scheduler); + } + + dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) { + threadPool->remove(scheduler); + } + + RTC_NAMESPACE_END +} diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h new file mode 100644 index 0000000000..8fa6bb12fa --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h @@ -0,0 +1,385 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" +#include "../sys/atomic.h" +#include "../math/range.h" +#include "../../include/embree3/rtcore.h" + +#include <list> + +namespace embree +{ + + /* The tasking system exports some symbols to be used by the tutorials. Thus we + hide is also in the API namespace when requested. */ + RTC_NAMESPACE_BEGIN + + struct TaskScheduler : public RefCount + { + ALIGNED_STRUCT_(64); + friend class Device; + + static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack + static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures + + struct Thread; + + /*! virtual interface for all tasks */ + struct TaskFunction { + virtual void execute() = 0; + }; + + /*! builds a task interface from a closure */ + template<typename Closure> + struct ClosureTaskFunction : public TaskFunction + { + Closure closure; + __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} + void execute() { closure(); }; + }; + + struct __aligned(64) Task + { + /*! states a task can be in */ + enum { DONE, INITIALIZED }; + + /*! switch from one state to another */ + __forceinline void switch_state(int from, int to) + { + __memory_barrier(); + MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); + assert(success); + } + + /*! try to switch from one state to another */ + __forceinline bool try_switch_state(int from, int to) { + __memory_barrier(); + return state.compare_exchange_strong(from,to); + } + + /*! increment/decrement dependency counter */ + void add_dependencies(int n) { + dependencies+=n; + } + + /*! initialize all tasks to DONE state by default */ + __forceinline Task() + : state(DONE) {} + + /*! 
construction of new task */ + __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) + : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) + { + if (parent) parent->add_dependencies(+1); + switch_state(DONE,INITIALIZED); + } + + /*! construction of stolen task, stealing thread will decrement initial dependency */ + __forceinline Task (TaskFunction* closure, Task* parent) + : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) + { + switch_state(DONE,INITIALIZED); + } + + /*! try to steal this task */ + bool try_steal(Task& child) + { + if (!stealable) return false; + if (!try_switch_state(INITIALIZED,DONE)) return false; + new (&child) Task(closure, this); + return true; + } + + /*! run this task */ + dll_export void run(Thread& thread); + + void run_internal(Thread& thread); + + public: + std::atomic<int> state; //!< state this task is in + std::atomic<int> dependencies; //!< dependencies to wait for + std::atomic<bool> stealable; //!< true if task can be stolen + TaskFunction* closure; //!< the closure to execute + Task* parent; //!< parent task to signal when we are finished + size_t stackPtr; //!< stack location where closure is stored + size_t N; //!< approximative size of task + }; + + struct TaskQueue + { + TaskQueue () + : left(0), right(0), stackPtr(0) {} + + __forceinline void* alloc(size_t bytes, size_t align = 64) + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("closure stack overflow"); + abort(); + // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } + + template<typename Closure> + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("task stack overflow"); + abort(); + // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; + TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure); + new (&tasks[right]) Task(func,thread.task,oldStackPtr,size); + right++; + + /* also move left pointer */ + if (left >= right-1) left = right-1; + } + + dll_export bool execute_local(Thread& thread, Task* parent); + bool execute_local_internal(Thread& thread, Task* parent); + bool steal(Thread& thread); + size_t getTaskSizeAtLeft(); + + bool empty() { return right == 0; } + + public: + + /* task stack */ + Task tasks[TASK_STACK_SIZE]; + __aligned(64) std::atomic<size_t> left; //!< threads steal from left + __aligned(64) std::atomic<size_t> right; //!< new tasks are added to the right + + /* closure stack */ + __aligned(64) char stack[CLOSURE_STACK_SIZE]; + size_t stackPtr; + }; + + /*! thread local structure for each thread */ + struct Thread + { + ALIGNED_STRUCT_(64); + + Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler) + : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} + + __forceinline size_t threadCount() { + return scheduler->threadCounter; + } + + size_t threadIndex; //!< ID of this thread + TaskQueue tasks; //!< local task queue + Task* task; //!< current active task + Ref<TaskScheduler> scheduler; //!< pointer to task scheduler + }; + + /*! pool of worker threads */ + struct ThreadPool + { + ThreadPool (bool set_affinity); + ~ThreadPool (); + + /*! 
starts the threads */ + dll_export void startThreads(); + + /*! sets number of threads to use */ + void setNumThreads(size_t numThreads, bool startThreads = false); + + /*! adds a task scheduler object for scheduling */ + dll_export void add(const Ref<TaskScheduler>& scheduler); + + /*! remove the task scheduler object again */ + dll_export void remove(const Ref<TaskScheduler>& scheduler); + + /*! returns number of threads of the thread pool */ + size_t size() const { return numThreads; } + + /*! main loop for all threads */ + void thread_loop(size_t threadIndex); + + private: + std::atomic<size_t> numThreads; + std::atomic<size_t> numThreadsRunning; + bool set_affinity; + std::atomic<bool> running; + std::vector<thread_t> threads; + + private: + MutexSys mutex; + ConditionSys condition; + std::list<Ref<TaskScheduler> > schedulers; + }; + + TaskScheduler (); + ~TaskScheduler (); + + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /*! lets new worker threads join the tasking system */ + void join(); + void reset(); + + /*! let a worker thread allocate a thread index */ + dll_export ssize_t allocThreadIndex(); + + /*! wait for some number of threads available (threadCount includes main thread) */ + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ + // -- GODOT start -- + // std::exception_ptr thread_loop(size_t threadIndex); + void thread_loop(size_t threadIndex); + // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); + + template<typename Predicate, typename Body> + static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) + { + if (useThreadPool) startThreads(); + + size_t threadIndex = allocThreadIndex(); + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + assert(threadLocal[threadIndex].load() == nullptr); + threadLocal[threadIndex] = &thread; + Thread* oldThread = swapThread(&thread); + thread.tasks.push_right(thread,size,closure); + { + Lock<MutexSys> lock(mutex); + anyTasksRunning++; + hasRootTask = true; + condition.notify_all(); + } + + if (useThreadPool) addScheduler(this); + + while (thread.tasks.execute_local(thread,nullptr)); + anyTasksRunning--; + if (useThreadPool) removeScheduler(this); + + threadLocal[threadIndex] = nullptr; + swapThread(oldThread); + + /* remember exception to throw */ + std::exception_ptr except = nullptr; + if (cancellingException != nullptr) except = cancellingException; + + /* wait for all threads to terminate */ + threadCounter--; + while (threadCounter > 0) yield(); + cancellingException = nullptr; + + /* re-throw proper exception */ + if (except != nullptr) + std::rethrow_exception(except); + } + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + static __forceinline void spawn(size_t size, const Closure& closure) + { + Thread* thread = TaskScheduler::thread(); + if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure); + else instance()->spawn_root(closure,size); + } + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + static 
__forceinline void spawn(const Closure& closure) { + spawn(1,closure); + } + + /* spawn a new task set */ + template<typename Index, typename Closure> + static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) + { + spawn(end-begin, [=]() + { + if (end-begin <= blockSize) { + return closure(range<Index>(begin,end)); + } + const Index center = (begin+end)/2; + spawn(begin,center,blockSize,closure); + spawn(center,end ,blockSize,closure); + wait(); + }); + } + + /* work on spawned subtasks and wait until all have finished */ + dll_export static bool wait(); + + /* returns the ID of the current thread */ + dll_export static size_t threadID(); + + /* returns the index (0..threadCount-1) of the current thread */ + dll_export static size_t threadIndex(); + + /* returns the total number of threads */ + dll_export static size_t threadCount(); + + private: + + /* returns the thread local task list of this worker thread */ + dll_export static Thread* thread(); + + /* sets the thread local task list of this worker thread */ + dll_export static Thread* swapThread(Thread* thread); + + /*! returns the taskscheduler object to be used by the master thread */ + dll_export static TaskScheduler* instance(); + + /*! starts the threads */ + dll_export static void startThreads(); + + /*! adds a task scheduler object for scheduling */ + dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler); + + /*! remove the task scheduler object again */ + dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler); + + private: + std::vector<atomic<Thread*>> threadLocal; + std::atomic<size_t> threadCounter; + std::atomic<size_t> anyTasksRunning; + std::atomic<bool> hasRootTask; + std::exception_ptr cancellingException; + MutexSys mutex; + ConditionSys condition; + + private: + static size_t g_numThreads; + static __thread TaskScheduler* g_instance; + static __thread Thread* thread_local_thread; + static ThreadPool* threadPool; + }; + + RTC_NAMESPACE_END + +#if defined(RTC_NAMESPACE) + using RTC_NAMESPACE::TaskScheduler; +#endif +} diff --git a/thirdparty/embree/common/tasking/taskschedulerppl.h b/thirdparty/embree/common/tasking/taskschedulerppl.h new file mode 100644 index 0000000000..cbc2ecdbb8 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerppl.h @@ -0,0 +1,46 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if !defined(__WIN32__) +#error PPL tasking system only available under windows +#endif + +#include <ppl.h> + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() { + return GetCurrentThreadId(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + /* FIXME: threadIndex is NOT supported by PPL! 
*/ + static __forceinline size_t threadIndex() { + return 0; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { + return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; + } + }; +}; diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h new file mode 100644 index 0000000000..35bd49849f --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulertbb.h @@ -0,0 +1,73 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if defined(__WIN32__) +// -- GODOT start -- +#if !defined(NOMINMAX) +// -- GODOT end -- +# define NOMINMAX +// -- GODOT start -- +#endif +// -- GODOT end -- +#endif + +// We need to define these to avoid implicit linkage against +// tbb_debug.lib under Windows. When removing these lines debug build +// under Windows fails. +#define __TBB_NO_IMPLICIT_LINKAGE 1 +#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 +#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 +#include "tbb/tbb.h" +#include "tbb/parallel_sort.h" + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::current_thread_index(); +#elif TBB_INTERFACE_VERSION >= 9000 + return tbb::task_arena::current_thread_index(); +#else + return 0; +#endif + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::max_concurrency(); +#else + return tbb::task_scheduler_init::default_num_threads(); +#endif + } + + }; + +}; diff --git a/thirdparty/embree/include/embree3/rtcore.h b/thirdparty/embree/include/embree3/rtcore.h new file mode 100644 index 0000000000..450ab4c535 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore.h @@ -0,0 +1,14 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_config.h" +#include "rtcore_common.h" +#include "rtcore_device.h" +#include "rtcore_buffer.h" +#include "rtcore_ray.h" +#include "rtcore_geometry.h" +#include "rtcore_scene.h" +#include "rtcore_builder.h" +#include "rtcore_quaternion.h" diff --git a/thirdparty/embree/include/embree3/rtcore_buffer.h b/thirdparty/embree/include/embree3/rtcore_buffer.h new file mode 100644 index 0000000000..6b8eba9769 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_buffer.h @@ -0,0 +1,51 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_device.h" + +RTC_NAMESPACE_BEGIN + +/* Types of buffers */ +enum RTCBufferType +{ + RTC_BUFFER_TYPE_INDEX = 0, + RTC_BUFFER_TYPE_VERTEX = 1, + RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2, + RTC_BUFFER_TYPE_NORMAL = 3, + RTC_BUFFER_TYPE_TANGENT = 4, + RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5, + + RTC_BUFFER_TYPE_GRID = 
8, + + RTC_BUFFER_TYPE_FACE = 16, + RTC_BUFFER_TYPE_LEVEL = 17, + RTC_BUFFER_TYPE_EDGE_CREASE_INDEX = 18, + RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT = 19, + RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX = 20, + RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21, + RTC_BUFFER_TYPE_HOLE = 22, + + RTC_BUFFER_TYPE_FLAGS = 32 +}; + +/* Opaque buffer type */ +typedef struct RTCBufferTy* RTCBuffer; + +/* Creates a new buffer. */ +RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize); + +/* Creates a new shared buffer. */ +RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize); + +/* Returns a pointer to the buffer data. */ +RTC_API void* rtcGetBufferData(RTCBuffer buffer); + +/* Retains the buffer (increments the reference count). */ +RTC_API void rtcRetainBuffer(RTCBuffer buffer); + +/* Releases the buffer (decrements the reference count). */ +RTC_API void rtcReleaseBuffer(RTCBuffer buffer); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/include/embree3/rtcore_builder.h b/thirdparty/embree/include/embree3/rtcore_builder.h new file mode 100644 index 0000000000..4bff999fed --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_builder.h @@ -0,0 +1,125 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_scene.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque BVH type */ +typedef struct RTCBVHTy* RTCBVH; + +/* Input build primitives for the builder */ +struct RTC_ALIGN(32) RTCBuildPrimitive +{ + float lower_x, lower_y, lower_z; + unsigned int geomID; + float upper_x, upper_y, upper_z; + unsigned int primID; +}; + +/* Opaque thread local allocator type */ +typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator; + +/* Callback to create a node */ +typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr); + +/* Callback to set the pointer to all children */ +typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr); + +/* Callback to set the bounds of all children */ +typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr); + +/* Callback to create a leaf node */ +typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr); + +/* Callback to split a build primitive */ +typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr); + +/* Build flags */ +enum RTCBuildFlags +{ + RTC_BUILD_FLAG_NONE = 0, + RTC_BUILD_FLAG_DYNAMIC = (1 << 0), +}; + +enum RTCBuildConstants +{ + RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32 +}; + +/* Input for builders */ +struct RTCBuildArguments +{ + size_t byteSize; + + enum RTCBuildQuality buildQuality; + enum RTCBuildFlags buildFlags; + unsigned int maxBranchingFactor; + unsigned int maxDepth; + unsigned int sahBlockSize; + unsigned int minLeafSize; + unsigned int maxLeafSize; + float traversalCost; + float intersectionCost; + + RTCBVH bvh; + struct RTCBuildPrimitive* primitives; + size_t primitiveCount; + size_t primitiveArrayCapacity; + + RTCCreateNodeFunction createNode; + RTCSetNodeChildrenFunction setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds; + RTCCreateLeafFunction createLeaf; + RTCSplitPrimitiveFunction splitPrimitive; + RTCProgressMonitorFunction 
buildProgress; + void* userPtr; +}; + +/* Returns the default build settings. */ +RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments() +{ + struct RTCBuildArguments args; + args.byteSize = sizeof(args); + args.buildQuality = RTC_BUILD_QUALITY_MEDIUM; + args.buildFlags = RTC_BUILD_FLAG_NONE; + args.maxBranchingFactor = 2; + args.maxDepth = 32; + args.sahBlockSize = 1; + args.minLeafSize = 1; + args.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF; + args.traversalCost = 1.0f; + args.intersectionCost = 1.0f; + args.bvh = NULL; + args.primitives = NULL; + args.primitiveCount = 0; + args.primitiveArrayCapacity = 0; + args.createNode = NULL; + args.setNodeChildren = NULL; + args.setNodeBounds = NULL; + args.createLeaf = NULL; + args.splitPrimitive = NULL; + args.buildProgress = NULL; + args.userPtr = NULL; + return args; +} + +/* Creates a new BVH. */ +RTC_API RTCBVH rtcNewBVH(RTCDevice device); + +/* Builds a BVH. */ +RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args); + +/* Allocates memory using the thread local allocator. */ +RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align); + +/* Retains the BVH (increments reference count). */ +RTC_API void rtcRetainBVH(RTCBVH bvh); + +/* Releases the BVH (decrements reference count). */ +RTC_API void rtcReleaseBVH(RTCBVH bvh); + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h new file mode 100644 index 0000000000..4857e1e05e --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_common.h @@ -0,0 +1,328 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <stddef.h> +#include <sys/types.h> +#include <stdbool.h> + +#include "rtcore_config.h" + +RTC_NAMESPACE_BEGIN + +#if defined(_WIN32) +#if defined(_M_X64) +typedef long long ssize_t; +#else +typedef int ssize_t; +#endif +#endif + +// -- GODOT start -- +#if defined(_WIN32) && defined(_MSC_VER) +// -- GODOT end -- +# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) +#else +# define RTC_ALIGN(...) 
__attribute__((aligned(__VA_ARGS__))) +#endif + +#if !defined (RTC_DEPRECATED) +#ifdef __GNUC__ + #define RTC_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) + #define RTC_DEPRECATED __declspec(deprecated) +#else + #define RTC_DEPRECATED +#endif +#endif + +#if defined(_WIN32) +# define RTC_FORCEINLINE __forceinline +#else +# define RTC_FORCEINLINE inline __attribute__((always_inline)) +#endif + +/* Invalid geometry ID */ +#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1) + +/* Maximum number of time steps */ +#define RTC_MAX_TIME_STEP_COUNT 129 + +/* Formats of buffers and other data structures */ +enum RTCFormat +{ + RTC_FORMAT_UNDEFINED = 0, + + /* 8-bit unsigned integer */ + RTC_FORMAT_UCHAR = 0x1001, + RTC_FORMAT_UCHAR2, + RTC_FORMAT_UCHAR3, + RTC_FORMAT_UCHAR4, + + /* 8-bit signed integer */ + RTC_FORMAT_CHAR = 0x2001, + RTC_FORMAT_CHAR2, + RTC_FORMAT_CHAR3, + RTC_FORMAT_CHAR4, + + /* 16-bit unsigned integer */ + RTC_FORMAT_USHORT = 0x3001, + RTC_FORMAT_USHORT2, + RTC_FORMAT_USHORT3, + RTC_FORMAT_USHORT4, + + /* 16-bit signed integer */ + RTC_FORMAT_SHORT = 0x4001, + RTC_FORMAT_SHORT2, + RTC_FORMAT_SHORT3, + RTC_FORMAT_SHORT4, + + /* 32-bit unsigned integer */ + RTC_FORMAT_UINT = 0x5001, + RTC_FORMAT_UINT2, + RTC_FORMAT_UINT3, + RTC_FORMAT_UINT4, + + /* 32-bit signed integer */ + RTC_FORMAT_INT = 0x6001, + RTC_FORMAT_INT2, + RTC_FORMAT_INT3, + RTC_FORMAT_INT4, + + /* 64-bit unsigned integer */ + RTC_FORMAT_ULLONG = 0x7001, + RTC_FORMAT_ULLONG2, + RTC_FORMAT_ULLONG3, + RTC_FORMAT_ULLONG4, + + /* 64-bit signed integer */ + RTC_FORMAT_LLONG = 0x8001, + RTC_FORMAT_LLONG2, + RTC_FORMAT_LLONG3, + RTC_FORMAT_LLONG4, + + /* 32-bit float */ + RTC_FORMAT_FLOAT = 0x9001, + RTC_FORMAT_FLOAT2, + RTC_FORMAT_FLOAT3, + RTC_FORMAT_FLOAT4, + RTC_FORMAT_FLOAT5, + RTC_FORMAT_FLOAT6, + RTC_FORMAT_FLOAT7, + RTC_FORMAT_FLOAT8, + RTC_FORMAT_FLOAT9, + RTC_FORMAT_FLOAT10, + RTC_FORMAT_FLOAT11, + RTC_FORMAT_FLOAT12, + RTC_FORMAT_FLOAT13, + RTC_FORMAT_FLOAT14, + RTC_FORMAT_FLOAT15, + RTC_FORMAT_FLOAT16, + + /* 32-bit float matrix (row-major order) */ + RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122, + RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123, + RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124, + RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132, + RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133, + RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134, + RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142, + RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143, + RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144, + + /* 32-bit float matrix (column-major order) */ + RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222, + RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223, + RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224, + RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232, + RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233, + RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234, + RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242, + RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243, + RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244, + + /* special 12-byte format for grids */ + RTC_FORMAT_GRID = 0xA001 +}; + +/* Build quality levels */ +enum RTCBuildQuality +{ + RTC_BUILD_QUALITY_LOW = 0, + RTC_BUILD_QUALITY_MEDIUM = 1, + RTC_BUILD_QUALITY_HIGH = 2, + RTC_BUILD_QUALITY_REFIT = 3, +}; + +/* Axis-aligned bounding box representation */ +struct RTC_ALIGN(16) RTCBounds +{ + float lower_x, lower_y, lower_z, align0; + float upper_x, upper_y, upper_z, align1; +}; + +/* Linear axis-aligned bounding box representation */ +struct RTC_ALIGN(16) RTCLinearBounds +{ + struct RTCBounds bounds0; + struct RTCBounds bounds1; +}; + +/* Intersection context flags */ +enum 
RTCIntersectContextFlags +{ + RTC_INTERSECT_CONTEXT_FLAG_NONE = 0, + RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays + RTC_INTERSECT_CONTEXT_FLAG_COHERENT = (1 << 0) // optimize for coherent rays +}; + +/* Arguments for RTCFilterFunctionN */ +struct RTCFilterFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + struct RTCIntersectContext* context; + struct RTCRayN* ray; + struct RTCHitN* hit; + unsigned int N; +}; + +/* Filter callback function */ +typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args); + +/* Intersection context passed to intersect/occluded calls */ +struct RTCIntersectContext +{ + enum RTCIntersectContextFlags flags; // intersection flags + RTCFilterFunctionN filter; // filter function to execute + +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + unsigned int instStackSize; // Number of instances currently on the stack. +#endif + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids. + +#if RTC_MIN_WIDTH + float minWidthDistanceFactor; // curve radius is set to this factor times distance to ray origin +#endif +}; + +/* Initializes an intersection context. */ +RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context) +{ + unsigned l = 0; + context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; + context->filter = NULL; + +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + context->instStackSize = 0; +#endif + for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + context->instID[l] = RTC_INVALID_GEOMETRY_ID; + +#if RTC_MIN_WIDTH + context->minWidthDistanceFactor = 0.0f; +#endif +} + +/* Point query structure for closest point query */ +struct RTC_ALIGN(16) RTCPointQuery +{ + float x; // x coordinate of the query point + float y; // y coordinate of the query point + float z; // z coordinate of the query point + float time; // time of the point query + float radius; // radius of the point query +}; + +/* Structure of a packet of 4 query points */ +struct RTC_ALIGN(16) RTCPointQuery4 +{ + float x[4]; // x coordinate of the query point + float y[4]; // y coordinate of the query point + float z[4]; // z coordinate of the query point + float time[4]; // time of the point query + float radius[4]; // radius of the point query +}; + +/* Structure of a packet of 8 query points */ +struct RTC_ALIGN(32) RTCPointQuery8 +{ + float x[8]; // x coordinate of the query point + float y[8]; // y coordinate of the query point + float z[8]; // z coordinate of the query point + float time[8]; // time of the point query + float radius[8]; // radius ofr the point query +}; + +/* Structure of a packet of 16 query points */ +struct RTC_ALIGN(64) RTCPointQuery16 +{ + float x[16]; // x coordinate of the query point + float y[16]; // y coordinate of the query point + float z[16]; // z coordinate of the query point + float time[16]; // time of the point quey + float radius[16]; // radius of the point query +}; + +struct RTCPointQueryN; + +struct RTC_ALIGN(16) RTCPointQueryContext +{ + // accumulated 4x4 column major matrices from world space to instance space. + // undefined if size == 0. + float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + + // accumulated 4x4 column major matrices from instance space to world space. + // undefined if size == 0. + float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + + // instance ids. + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; + + // number of instances currently on the stack. + unsigned int instStackSize; +}; + +/* Initializes an intersection context. 
*/ +RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context) +{ + context->instStackSize = 0; + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +} + +struct RTC_ALIGN(16) RTCPointQueryFunctionArguments +{ + // The (world space) query object that was passed as an argument of rtcPointQuery. The + // radius of the query can be decreased inside the callback to shrink the + // search domain. Increasing the radius or modifying the time or position of + // the query results in undefined behaviour. + struct RTCPointQuery* query; + + // Used for user input/output data. Will not be read or modified internally. + void* userPtr; + + // primitive and geometry ID of primitive + unsigned int primID; + unsigned int geomID; + + // the context with transformation and instance ID stack + struct RTCPointQueryContext* context; + + // If the current instance transform M (= context->world2inst[context->instStackSize]) + // is a similarity matrix, i.e there is a constant factor similarityScale such that, + // for all x,y: dist(Mx, My) = similarityScale * dist(x, y), + // The similarity scale is 0, if the current instance transform is not a + // similarity transform and vice versa. The similarity scale allows to compute + // distance information in instance space and scale the distances into world + // space by dividing with the similarity scale, for example, to update the + // query radius. If the current instance transform is not a similarity + // transform (similarityScale = 0), the distance computation has to be + // performed in world space to ensure correctness. if there is no instance + // transform (context->instStackSize == 0), the similarity scale is 1. + float similarityScale; +}; + +typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h new file mode 100644 index 0000000000..3a9819c9f1 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -0,0 +1,57 @@ + +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define RTC_VERSION_MAJOR 3 +#define RTC_VERSION_MINOR 13 +#define RTC_VERSION_PATCH 0 +#define RTC_VERSION 31300 +#define RTC_VERSION_STRING "3.13.0" + +#define RTC_MAX_INSTANCE_LEVEL_COUNT 1 + +#define EMBREE_MIN_WIDTH 0 +#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH + +#define EMBREE_STATIC_LIB +/* #undef EMBREE_API_NAMESPACE */ + +#if defined(EMBREE_API_NAMESPACE) +# define RTC_NAMESPACE +# define RTC_NAMESPACE_BEGIN namespace { +# define RTC_NAMESPACE_END } +# define RTC_NAMESPACE_USE using namespace ; +# define RTC_API_EXTERN_C +# undef EMBREE_API_NAMESPACE +#else +# define RTC_NAMESPACE_BEGIN +# define RTC_NAMESPACE_END +# define RTC_NAMESPACE_USE +# if defined(__cplusplus) +# define RTC_API_EXTERN_C extern "C" +# else +# define RTC_API_EXTERN_C +# endif +#endif + +#if defined(ISPC) +# define RTC_API_IMPORT extern "C" unmasked +# define RTC_API_EXPORT extern "C" unmasked +#elif defined(EMBREE_STATIC_LIB) +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C +#elif defined(_WIN32) +# define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport) +# define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport) +#else +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default"))) +#endif + +#if defined(RTC_EXPORT_API) +# define RTC_API RTC_API_EXPORT +#else +# 
define RTC_API RTC_API_IMPORT +#endif diff --git a/thirdparty/embree/include/embree3/rtcore_device.h b/thirdparty/embree/include/embree3/rtcore_device.h new file mode 100644 index 0000000000..2dd3047603 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_device.h @@ -0,0 +1,87 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque device type */ +typedef struct RTCDeviceTy* RTCDevice; + +/* Creates a new Embree device. */ +RTC_API RTCDevice rtcNewDevice(const char* config); + +/* Retains the Embree device (increments the reference count). */ +RTC_API void rtcRetainDevice(RTCDevice device); + +/* Releases an Embree device (decrements the reference count). */ +RTC_API void rtcReleaseDevice(RTCDevice device); + +/* Device properties */ +enum RTCDeviceProperty +{ + RTC_DEVICE_PROPERTY_VERSION = 0, + RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1, + RTC_DEVICE_PROPERTY_VERSION_MINOR = 2, + RTC_DEVICE_PROPERTY_VERSION_PATCH = 3, + + RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED = 32, + RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED = 33, + RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34, + RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED = 35, + + RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63, + RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED = 64, + RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED = 65, + RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED = 66, + RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67, + RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED = 68, + + RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED = 96, + RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED = 97, + RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98, + RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED = 99, + RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED = 100, + RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED = 101, + + RTC_DEVICE_PROPERTY_TASKING_SYSTEM = 128, + RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129, + RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130 +}; + +/* Gets a device property. */ +RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop); + +/* Sets a device property. */ +RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value); + +/* Error codes */ +enum RTCError +{ + RTC_ERROR_NONE = 0, + RTC_ERROR_UNKNOWN = 1, + RTC_ERROR_INVALID_ARGUMENT = 2, + RTC_ERROR_INVALID_OPERATION = 3, + RTC_ERROR_OUT_OF_MEMORY = 4, + RTC_ERROR_UNSUPPORTED_CPU = 5, + RTC_ERROR_CANCELLED = 6 +}; + +/* Returns the error code. */ +RTC_API enum RTCError rtcGetDeviceError(RTCDevice device); + +/* Error callback function */ +typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str); + +/* Sets the error callback function. */ +RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr); + +/* Memory monitor callback function */ +typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post); + +/* Sets the memory monitor callback function. 
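The device API above is the entry point for everything else in these headers. A sketch of the usual lifecycle, assuming a default configuration string and a caller that eventually calls rtcReleaseDevice; the handler and function names are illustrative:

```
#include <embree3/rtcore.h>
#include <stdio.h>

static void on_embree_error(void* userPtr, enum RTCError code, const char* str)
{
  (void)userPtr;
  fprintf(stderr, "Embree error %d: %s\n", (int)code, str);
}

RTCDevice make_device(void)
{
  RTCDevice device = rtcNewDevice(NULL);                    /* NULL = default config string */
  rtcSetDeviceErrorFunction(device, on_embree_error, NULL); /* reports all later errors */

  /* Capabilities are exposed as read-only device properties. */
  if (rtcGetDeviceProperty(device, RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED))
    printf("4-wide ray packets are supported natively\n");

  return device; /* pair with rtcReleaseDevice() when done */
}
```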
*/ +RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/include/embree3/rtcore_geometry.h b/thirdparty/embree/include/embree3/rtcore_geometry.h new file mode 100644 index 0000000000..d1de17491c --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_geometry.h @@ -0,0 +1,383 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_buffer.h" +#include "rtcore_quaternion.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque scene type */ +typedef struct RTCSceneTy* RTCScene; + +/* Opaque geometry type */ +typedef struct RTCGeometryTy* RTCGeometry; + +/* Types of geometries */ +enum RTCGeometryType +{ + RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh + RTC_GEOMETRY_TYPE_QUAD = 1, // quad (triangle pair) mesh + RTC_GEOMETRY_TYPE_GRID = 2, // grid mesh + + RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface + + RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE = 15, // Cone linear curves - discontinuous at edge boundaries + RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE = 16, // Round (rounded cone like) linear curves + RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE = 17, // flat (ribbon-like) linear curves + + RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE = 24, // round (tube-like) Bezier curves + RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE = 25, // flat (ribbon-like) Bezier curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE = 26, // flat normal-oriented Bezier curves + + RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves + RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE = 33, // flat (ribbon-like) B-spline curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE = 34, // flat normal-oriented B-spline curves + + RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves + RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE = 41, // flat (ribbon-like) Hermite curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE = 42, // flat normal-oriented Hermite curves + + RTC_GEOMETRY_TYPE_SPHERE_POINT = 50, + RTC_GEOMETRY_TYPE_DISC_POINT = 51, + RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52, + + RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves + RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE = 59, // flat (ribbon-like) Catmull-Rom curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE = 60, // flat normal-oriented Catmull-Rom curves + + RTC_GEOMETRY_TYPE_USER = 120, // user-defined geometry + RTC_GEOMETRY_TYPE_INSTANCE = 121 // scene instance +}; + +/* Interpolation modes for subdivision surfaces */ +enum RTCSubdivisionMode +{ + RTC_SUBDIVISION_MODE_NO_BOUNDARY = 0, + RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1, + RTC_SUBDIVISION_MODE_PIN_CORNERS = 2, + RTC_SUBDIVISION_MODE_PIN_BOUNDARY = 3, + RTC_SUBDIVISION_MODE_PIN_ALL = 4, +}; + +/* Curve segment flags */ +enum RTCCurveFlags +{ + RTC_CURVE_FLAG_NEIGHBOR_LEFT = (1 << 0), // left segments exists + RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1) // right segment exists +}; + +/* Arguments for RTCBoundsFunction */ +struct RTCBoundsFunctionArguments +{ + void* geometryUserPtr; + unsigned int primID; + unsigned int timeStep; + struct RTCBounds* bounds_o; +}; + +/* Bounding callback function */ +typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args); + +/* Arguments for RTCIntersectFunctionN */ +struct RTCIntersectFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + unsigned int primID; + struct RTCIntersectContext* 
context; + struct RTCRayHitN* rayhit; + unsigned int N; + unsigned int geomID; +}; + +/* Intersection callback function */ +typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args); + +/* Arguments for RTCOccludedFunctionN */ +struct RTCOccludedFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + unsigned int primID; + struct RTCIntersectContext* context; + struct RTCRayN* ray; + unsigned int N; + unsigned int geomID; +}; + +/* Occlusion callback function */ +typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args); + +/* Arguments for RTCDisplacementFunctionN */ +struct RTCDisplacementFunctionNArguments +{ + void* geometryUserPtr; + RTCGeometry geometry; + unsigned int primID; + unsigned int timeStep; + const float* u; + const float* v; + const float* Ng_x; + const float* Ng_y; + const float* Ng_z; + float* P_x; + float* P_y; + float* P_z; + unsigned int N; +}; + +/* Displacement mapping callback function */ +typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args); + +/* Creates a new geometry of specified type. */ +RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type); + +/* Retains the geometry (increments the reference count). */ +RTC_API void rtcRetainGeometry(RTCGeometry geometry); + +/* Releases the geometry (decrements the reference count) */ +RTC_API void rtcReleaseGeometry(RTCGeometry geometry); + +/* Commits the geometry. */ +RTC_API void rtcCommitGeometry(RTCGeometry geometry); + + +/* Enables the geometry. */ +RTC_API void rtcEnableGeometry(RTCGeometry geometry); + +/* Disables the geometry. */ +RTC_API void rtcDisableGeometry(RTCGeometry geometry); + + +/* Sets the number of motion blur time steps of the geometry. */ +RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount); + +/* Sets the motion blur time range of the geometry. */ +RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime); + +/* Sets the number of vertex attributes of the geometry. */ +RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount); + +/* Sets the ray mask of the geometry. */ +RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask); + +/* Sets the build quality of the geometry. */ +RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality); + +/* Sets the maximal curve or point radius scale allowed by min-width feature. */ +RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale); + + +/* Sets a geometry buffer. */ +RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount); + +/* Sets a shared geometry buffer. */ +RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount); + +/* Creates and sets a new geometry buffer. */ +RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount); + +/* Returns the pointer to the data of a buffer. */ +RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); + +/* Updates a geometry buffer. 
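The buffer setters above are how mesh data reaches Embree. A minimal triangle-mesh sketch; the vertex/index arrays are assumed to be caller-provided, and the buffer/format enums come from rtcore_buffer.h and rtcore_common.h in this same header set:

```
#include <embree3/rtcore.h>
#include <string.h>

RTCGeometry make_triangle_mesh(RTCDevice device,
                               const float* positions, unsigned int vertex_count,        /* 3 floats per vertex */
                               const unsigned int* indices, unsigned int triangle_count) /* 3 indices per triangle */
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_TRIANGLE);

  /* Let Embree allocate the buffers, then copy the caller's data into them. */
  float* vb = (float*)rtcSetNewGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0,
                                              RTC_FORMAT_FLOAT3, 3 * sizeof(float), vertex_count);
  unsigned int* ib = (unsigned int*)rtcSetNewGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0,
                                                            RTC_FORMAT_UINT3, 3 * sizeof(unsigned int), triangle_count);
  memcpy(vb, positions, 3 * sizeof(float) * vertex_count);
  memcpy(ib, indices, 3 * sizeof(unsigned int) * triangle_count);

  rtcCommitGeometry(geom); /* buffers must be committed before the scene build */
  return geom;
}
```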
*/ +RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); + + +/* Sets the intersection filter callback function of the geometry. */ +RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); + +/* Sets the occlusion filter callback function of the geometry. */ +RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); + +/* Sets the user-defined data pointer of the geometry. */ +RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr); + +/* Gets the user-defined data pointer of the geometry. */ +RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry); + +/* Set the point query callback function of a geometry. */ +RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery); + +/* Sets the number of primitives of a user geometry. */ +RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount); + +/* Sets the bounding callback function to calculate bounding boxes for user primitives. */ +RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr); + +/* Set the intersect callback function of a user geometry. */ +RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect); + +/* Set the occlusion callback function of a user geometry. */ +RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded); + +/* Invokes the intersection filter from the intersection callback function. */ +RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); + +/* Invokes the occlusion filter from the occlusion callback function. */ +RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); + + +/* Sets the instanced scene of an instance geometry. */ +RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene); + +/* Sets the transformation of an instance for the specified time step. */ +RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm); + +/* Sets the transformation quaternion of an instance for the specified time step. */ +RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd); + +/* Returns the interpolated transformation of an instance for the specified time. */ +RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm); + + +/* Sets the uniform tessellation rate of the geometry. */ +RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate); + +/* Sets the number of topologies of a subdivision surface. */ +RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount); + +/* Sets the subdivision interpolation mode. */ +RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode); + +/* Binds a vertex attribute to a topology of the geometry. */ +RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID); + +/* Sets the displacement callback function of a subdivision surface. 
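For RTC_GEOMETRY_TYPE_USER the bounds/intersect/occluded callbacks above stand in for Embree's built-in primitive kernels. A registration sketch under the assumption that `Sphere` and the three callbacks are application code (they are not part of Embree or Godot):

```
#include <embree3/rtcore.h>

typedef struct { float x, y, z, radius; } Sphere; /* illustrative user primitive */

/* Application-provided callbacks matching the callback typedefs in this header. */
extern void sphere_bounds(const struct RTCBoundsFunctionArguments* args);
extern void sphere_intersect(const struct RTCIntersectFunctionNArguments* args);
extern void sphere_occluded(const struct RTCOccludedFunctionNArguments* args);

RTCGeometry make_user_geometry(RTCDevice device, Sphere* spheres, unsigned int count)
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_USER);
  rtcSetGeometryUserPrimitiveCount(geom, count);
  rtcSetGeometryUserData(geom, spheres); /* handed back as geometryUserPtr in every callback */
  rtcSetGeometryBoundsFunction(geom, sphere_bounds, NULL);
  rtcSetGeometryIntersectFunction(geom, sphere_intersect);
  rtcSetGeometryOccludedFunction(geom, sphere_occluded);
  rtcCommitGeometry(geom);
  return geom;
}
```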
*/ +RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement); + +/* Returns the first half edge of a face. */ +RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID); + +/* Returns the face the half edge belongs to. */ +RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID); + +/* Returns next half edge. */ +RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID); + +/* Returns previous half edge. */ +RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID); + +/* Returns opposite half edge. */ +RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID); + + +/* Arguments for rtcInterpolate */ +struct RTCInterpolateArguments +{ + RTCGeometry geometry; + unsigned int primID; + float u; + float v; + enum RTCBufferType bufferType; + unsigned int bufferSlot; + float* P; + float* dPdu; + float* dPdv; + float* ddPdudu; + float* ddPdvdv; + float* ddPdudv; + unsigned int valueCount; +}; + +/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */ +RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args); + +/* Interpolates vertex data to some u/v location. */ +RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = NULL; + args.dPdv = NULL; + args.ddPdudu = NULL; + args.ddPdvdv = NULL; + args.ddPdudv = NULL; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Interpolates vertex data to some u/v location and calculates first order derivatives. */ +RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, + float* P, float* dPdu, float* dPdv, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = dPdu; + args.dPdv = dPdv; + args.ddPdudu = NULL; + args.ddPdvdv = NULL; + args.ddPdudv = NULL; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. 
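The interpolation helpers above are normally driven by the barycentric u/v of a hit. A hedged sketch that evaluates a 3-component vertex attribute in slot 0; it assumes the attribute count and buffer were set up on the geometry beforehand:

```
#include <embree3/rtcore.h>

void fetch_shading_normal(RTCScene scene, const struct RTCHit* hit, float normal_out[3])
{
  RTCGeometry geom = rtcGetGeometry(scene, hit->geomID); /* declared in rtcore_scene.h */

  /* Interpolate vertex-attribute slot 0 at the hit's barycentric coordinates. */
  rtcInterpolate0(geom, hit->primID, hit->u, hit->v,
                  RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, normal_out, 3);
}
```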
*/ +RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = dPdu; + args.dPdv = dPdv; + args.ddPdudu = ddPdudu; + args.ddPdvdv = ddPdvdv; + args.ddPdudv = ddPdudv; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Arguments for rtcInterpolateN */ +struct RTCInterpolateNArguments +{ + RTCGeometry geometry; + const void* valid; + const unsigned int* primIDs; + const float* u; + const float* v; + unsigned int N; + enum RTCBufferType bufferType; + unsigned int bufferSlot; + float* P; + float* dPdu; + float* dPdv; + float* ddPdudu; + float* ddPdvdv; + float* ddPdudv; + unsigned int valueCount; +}; + +/* Interpolates vertex data to an array of u/v locations. */ +RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args); + +/* RTCGrid primitive for grid mesh */ +struct RTCGrid +{ + unsigned int startVertexID; + unsigned int stride; + unsigned short width,height; // max is a 32k x 32k grid +}; + +RTC_NAMESPACE_END + + diff --git a/thirdparty/embree/include/embree3/rtcore_quaternion.h b/thirdparty/embree/include/embree3/rtcore_quaternion.h new file mode 100644 index 0000000000..6489fa3467 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_quaternion.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* + * Structure for transformation respresentation as a matrix decomposition using + * a quaternion + */ +struct RTC_ALIGN(16) RTCQuaternionDecomposition +{ + float scale_x; + float scale_y; + float scale_z; + float skew_xy; + float skew_xz; + float skew_yz; + float shift_x; + float shift_y; + float shift_z; + float quaternion_r; + float quaternion_i; + float quaternion_j; + float quaternion_k; + float translation_x; + float translation_y; + float translation_z; +}; + +RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp) +{ + qdecomp->scale_x = 1.f; + qdecomp->scale_y = 1.f; + qdecomp->scale_z = 1.f; + qdecomp->skew_xy = 0.f; + qdecomp->skew_xz = 0.f; + qdecomp->skew_yz = 0.f; + qdecomp->shift_x = 0.f; + qdecomp->shift_y = 0.f; + qdecomp->shift_z = 0.f; + qdecomp->quaternion_r = 1.f; + qdecomp->quaternion_i = 0.f; + qdecomp->quaternion_j = 0.f; + qdecomp->quaternion_k = 0.f; + qdecomp->translation_x = 0.f; + qdecomp->translation_y = 0.f; + qdecomp->translation_z = 0.f; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion( + struct RTCQuaternionDecomposition* qdecomp, + float r, float i, float j, float k) +{ + qdecomp->quaternion_r = r; + qdecomp->quaternion_i = i; + qdecomp->quaternion_j = j; + qdecomp->quaternion_k = k; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale( + struct RTCQuaternionDecomposition* qdecomp, + float scale_x, float scale_y, float scale_z) +{ + qdecomp->scale_x = scale_x; + qdecomp->scale_y = scale_y; + qdecomp->scale_z = scale_z; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew( + struct RTCQuaternionDecomposition* qdecomp, + float skew_xy, float skew_xz, float skew_yz) +{ + qdecomp->skew_xy = skew_xy; + qdecomp->skew_xz = 
skew_xz; + qdecomp->skew_yz = skew_yz; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift( + struct RTCQuaternionDecomposition* qdecomp, + float shift_x, float shift_y, float shift_z) +{ + qdecomp->shift_x = shift_x; + qdecomp->shift_y = shift_y; + qdecomp->shift_z = shift_z; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation( + struct RTCQuaternionDecomposition* qdecomp, + float translation_x, float translation_y, float translation_z) +{ + qdecomp->translation_x = translation_x; + qdecomp->translation_y = translation_y; + qdecomp->translation_z = translation_z; +} + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/include/embree3/rtcore_ray.h b/thirdparty/embree/include/embree3/rtcore_ray.h new file mode 100644 index 0000000000..a2ee6dabbb --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_ray.h @@ -0,0 +1,378 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* Ray structure for a single ray */ +struct RTC_ALIGN(16) RTCRay +{ + float org_x; // x coordinate of ray origin + float org_y; // y coordinate of ray origin + float org_z; // z coordinate of ray origin + float tnear; // start of ray segment + + float dir_x; // x coordinate of ray direction + float dir_y; // y coordinate of ray direction + float dir_z; // z coordinate of ray direction + float time; // time of this ray for motion blur + + float tfar; // end of ray segment (set to hit distance) + unsigned int mask; // ray mask + unsigned int id; // ray ID + unsigned int flags; // ray flags +}; + +/* Hit structure for a single ray */ +struct RTC_ALIGN(16) RTCHit +{ + float Ng_x; // x coordinate of geometry normal + float Ng_y; // y coordinate of geometry normal + float Ng_z; // z coordinate of geometry normal + + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID +}; + +/* Combined ray/hit structure for a single ray */ +struct RTCRayHit +{ + struct RTCRay ray; + struct RTCHit hit; +}; + +/* Ray structure for a packet of 4 rays */ +struct RTC_ALIGN(16) RTCRay4 +{ + float org_x[4]; + float org_y[4]; + float org_z[4]; + float tnear[4]; + + float dir_x[4]; + float dir_y[4]; + float dir_z[4]; + float time[4]; + + float tfar[4]; + unsigned int mask[4]; + unsigned int id[4]; + unsigned int flags[4]; +}; + +/* Hit structure for a packet of 4 rays */ +struct RTC_ALIGN(16) RTCHit4 +{ + float Ng_x[4]; + float Ng_y[4]; + float Ng_z[4]; + + float u[4]; + float v[4]; + + unsigned int primID[4]; + unsigned int geomID[4]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4]; +}; + +/* Combined ray/hit structure for a packet of 4 rays */ +struct RTCRayHit4 +{ + struct RTCRay4 ray; + struct RTCHit4 hit; +}; + +/* Ray structure for a packet of 8 rays */ +struct RTC_ALIGN(32) RTCRay8 +{ + float org_x[8]; + float org_y[8]; + float org_z[8]; + float tnear[8]; + + float dir_x[8]; + float dir_y[8]; + float dir_z[8]; + float time[8]; + + float tfar[8]; + unsigned int mask[8]; + unsigned int id[8]; + unsigned int flags[8]; +}; + +/* Hit structure for a packet of 8 rays */ +struct RTC_ALIGN(32) RTCHit8 +{ + float Ng_x[8]; + float Ng_y[8]; + float Ng_z[8]; + + float u[8]; + float v[8]; + + unsigned int primID[8]; + unsigned int geomID[8]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8]; +}; + +/* Combined ray/hit structure for 
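A minimal sketch of filling the single-ray structures above for one intersection query; rtcIntersect1 itself is declared in rtcore_scene.h further down in this patch, and the helper name here is illustrative:

```
#include <embree3/rtcore.h>
#include <math.h>

int first_hit(RTCScene scene, const float org[3], const float dir[3], struct RTCRayHit* out)
{
  struct RTCIntersectContext ctx;
  rtcInitIntersectContext(&ctx);

  out->ray.org_x = org[0]; out->ray.org_y = org[1]; out->ray.org_z = org[2];
  out->ray.dir_x = dir[0]; out->ray.dir_y = dir[1]; out->ray.dir_z = dir[2];
  out->ray.tnear = 0.0f;
  out->ray.tfar  = INFINITY;               /* shortened to the hit distance on success */
  out->ray.time  = 0.0f;
  out->ray.mask  = 0xFFFFFFFFu;
  out->ray.id    = 0;
  out->ray.flags = 0;
  out->hit.geomID    = RTC_INVALID_GEOMETRY_ID;
  out->hit.instID[0] = RTC_INVALID_GEOMETRY_ID;

  rtcIntersect1(scene, &ctx, out);
  return out->hit.geomID != RTC_INVALID_GEOMETRY_ID;
}
```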
a packet of 8 rays */ +struct RTCRayHit8 +{ + struct RTCRay8 ray; + struct RTCHit8 hit; +}; + +/* Ray structure for a packet of 16 rays */ +struct RTC_ALIGN(64) RTCRay16 +{ + float org_x[16]; + float org_y[16]; + float org_z[16]; + float tnear[16]; + + float dir_x[16]; + float dir_y[16]; + float dir_z[16]; + float time[16]; + + float tfar[16]; + unsigned int mask[16]; + unsigned int id[16]; + unsigned int flags[16]; +}; + +/* Hit structure for a packet of 16 rays */ +struct RTC_ALIGN(64) RTCHit16 +{ + float Ng_x[16]; + float Ng_y[16]; + float Ng_z[16]; + + float u[16]; + float v[16]; + + unsigned int primID[16]; + unsigned int geomID[16]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; +}; + +/* Combined ray/hit structure for a packet of 16 rays */ +struct RTCRayHit16 +{ + struct RTCRay16 ray; + struct RTCHit16 hit; +}; + +/* Ray structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCRayNp +{ + float* org_x; + float* org_y; + float* org_z; + float* tnear; + + float* dir_x; + float* dir_y; + float* dir_z; + float* time; + + float* tfar; + unsigned int* mask; + unsigned int* id; + unsigned int* flags; +}; + +/* Hit structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCHitNp +{ + float* Ng_x; + float* Ng_y; + float* Ng_z; + + float* u; + float* v; + + unsigned int* primID; + unsigned int* geomID; + unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; +}; + +/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCRayHitNp +{ + struct RTCRayNp ray; + struct RTCHitNp hit; +}; + +struct RTCRayN; +struct RTCHitN; +struct RTCRayHitN; + +#if defined(__cplusplus) + +/* Helper functions to access ray packets of runtime size N */ +RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[0*N+i]; } +RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[1*N+i]; } +RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[2*N+i]; } +RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[3*N+i]; } + +RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[4*N+i]; } +RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[5*N+i]; } +RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[6*N+i]; } +RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[7*N+i]; } + +RTC_FORCEINLINE float& RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[8*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[9*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_id (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[10*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[11*N+i]; } + +/* Helper functions to access hit packets of runtime size N */ +RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[0*N+i]; } +RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[1*N+i]; } +RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) 
{ return ((float*)hit)[2*N+i]; } + +RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; } +RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; } + +RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; } +RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; } +RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; } + +/* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */ +RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; } +RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return (RTCHitN*)&((float*)rayhit)[12*N]; } + +/* Helper structure for a ray packet of compile-time size N */ +template<int N> +struct RTCRayNt +{ + float org_x[N]; + float org_y[N]; + float org_z[N]; + float tnear[N]; + + float dir_x[N]; + float dir_y[N]; + float dir_z[N]; + float time[N]; + + float tfar[N]; + unsigned int mask[N]; + unsigned int id[N]; + unsigned int flags[N]; +}; + +/* Helper structure for a hit packet of compile-time size N */ +template<int N> +struct RTCHitNt +{ + float Ng_x[N]; + float Ng_y[N]; + float Ng_z[N]; + + float u[N]; + float v[N]; + + unsigned int primID[N]; + unsigned int geomID[N]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N]; +}; + +/* Helper structure for a combined ray/hit packet of compile-time size N */ +template<int N> +struct RTCRayHitNt +{ + RTCRayNt<N> ray; + RTCHitNt<N> hit; +}; + +RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i) +{ + RTCRay ray; + ray.org_x = RTCRayN_org_x(rayN,N,i); + ray.org_y = RTCRayN_org_y(rayN,N,i); + ray.org_z = RTCRayN_org_z(rayN,N,i); + ray.tnear = RTCRayN_tnear(rayN,N,i); + ray.dir_x = RTCRayN_dir_x(rayN,N,i); + ray.dir_y = RTCRayN_dir_y(rayN,N,i); + ray.dir_z = RTCRayN_dir_z(rayN,N,i); + ray.time = RTCRayN_time(rayN,N,i); + ray.tfar = RTCRayN_tfar(rayN,N,i); + ray.mask = RTCRayN_mask(rayN,N,i); + ray.id = RTCRayN_id(rayN,N,i); + ray.flags = RTCRayN_flags(rayN,N,i); + return ray; +} + +RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i) +{ + RTCHit hit; + hit.Ng_x = RTCHitN_Ng_x(hitN,N,i); + hit.Ng_y = RTCHitN_Ng_y(hitN,N,i); + hit.Ng_z = RTCHitN_Ng_z(hitN,N,i); + hit.u = RTCHitN_u(hitN,N,i); + hit.v = RTCHitN_v(hitN,N,i); + hit.primID = RTCHitN_primID(hitN,N,i); + hit.geomID = RTCHitN_geomID(hitN,N,i); + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + hit.instID[l] = RTCHitN_instID(hitN,N,i,l); + return hit; +} + +RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i) +{ + RTCHitN_Ng_x(hitN,N,i) = hit->Ng_x; + RTCHitN_Ng_y(hitN,N,i) = hit->Ng_y; + RTCHitN_Ng_z(hitN,N,i) = hit->Ng_z; + RTCHitN_u(hitN,N,i) = hit->u; + RTCHitN_v(hitN,N,i) = hit->v; + RTCHitN_primID(hitN,N,i) = hit->primID; + RTCHitN_geomID(hitN,N,i) = hit->geomID; + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + RTCHitN_instID(hitN,N,i,l) = hit->instID[l]; +} + +RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i) +{ + RTCRayHit rh; + + RTCRayN* ray = RTCRayHitN_RayN(rayhitN,N); + rh.ray.org_x = 
RTCRayN_org_x(ray,N,i); + rh.ray.org_y = RTCRayN_org_y(ray,N,i); + rh.ray.org_z = RTCRayN_org_z(ray,N,i); + rh.ray.tnear = RTCRayN_tnear(ray,N,i); + rh.ray.dir_x = RTCRayN_dir_x(ray,N,i); + rh.ray.dir_y = RTCRayN_dir_y(ray,N,i); + rh.ray.dir_z = RTCRayN_dir_z(ray,N,i); + rh.ray.time = RTCRayN_time(ray,N,i); + rh.ray.tfar = RTCRayN_tfar(ray,N,i); + rh.ray.mask = RTCRayN_mask(ray,N,i); + rh.ray.id = RTCRayN_id(ray,N,i); + rh.ray.flags = RTCRayN_flags(ray,N,i); + + RTCHitN* hit = RTCRayHitN_HitN(rayhitN,N); + rh.hit.Ng_x = RTCHitN_Ng_x(hit,N,i); + rh.hit.Ng_y = RTCHitN_Ng_y(hit,N,i); + rh.hit.Ng_z = RTCHitN_Ng_z(hit,N,i); + rh.hit.u = RTCHitN_u(hit,N,i); + rh.hit.v = RTCHitN_v(hit,N,i); + rh.hit.primID = RTCHitN_primID(hit,N,i); + rh.hit.geomID = RTCHitN_geomID(hit,N,i); + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l); + + return rh; +} + +#endif + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/include/embree3/rtcore_scene.h b/thirdparty/embree/include/embree3/rtcore_scene.h new file mode 100644 index 0000000000..5878a3d402 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_scene.h @@ -0,0 +1,160 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_device.h" + +RTC_NAMESPACE_BEGIN + +/* Forward declarations for ray structures */ +struct RTCRayHit; +struct RTCRayHit4; +struct RTCRayHit8; +struct RTCRayHit16; +struct RTCRayHitNp; + +/* Scene flags */ +enum RTCSceneFlags +{ + RTC_SCENE_FLAG_NONE = 0, + RTC_SCENE_FLAG_DYNAMIC = (1 << 0), + RTC_SCENE_FLAG_COMPACT = (1 << 1), + RTC_SCENE_FLAG_ROBUST = (1 << 2), + RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3) +}; + +/* Creates a new scene. */ +RTC_API RTCScene rtcNewScene(RTCDevice device); + +/* Returns the device the scene got created in. The reference count of + * the device is incremented by this function. */ +RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene); + +/* Retains the scene (increments the reference count). */ +RTC_API void rtcRetainScene(RTCScene scene); + +/* Releases the scene (decrements the reference count). */ +RTC_API void rtcReleaseScene(RTCScene scene); + + +/* Attaches the geometry to a scene. */ +RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry); + +/* Attaches the geometry to a scene using the specified geometry ID. */ +RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID); + +/* Detaches the geometry from the scene. */ +RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID); + +/* Gets a geometry handle from the scene. */ +RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID); + + +/* Commits the scene. */ +RTC_API void rtcCommitScene(RTCScene scene); + +/* Commits the scene from multiple threads. */ +RTC_API void rtcJoinCommitScene(RTCScene scene); + + +/* Progress monitor callback function */ +typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n); + +/* Sets the progress monitor callback function of the scene. */ +RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr); + +/* Sets the build quality of the scene. */ +RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality); + +/* Sets the scene flags. */ +RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags); + +/* Returns the scene flags. 
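Scene assembly with the functions above typically follows the pattern below; `geom` is assumed to be an already committed geometry such as the triangle-mesh sketch earlier in this patch:

```
#include <embree3/rtcore.h>

RTCScene build_scene(RTCDevice device, RTCGeometry geom)
{
  RTCScene scene = rtcNewScene(device);
  rtcSetSceneBuildQuality(scene, RTC_BUILD_QUALITY_HIGH); /* RTCBuildQuality is in rtcore_common.h */
  rtcSetSceneFlags(scene, RTC_SCENE_FLAG_ROBUST);

  unsigned int geomID = rtcAttachGeometry(scene, geom); /* same ID later appears in RTCHit::geomID */
  (void)geomID;
  rtcReleaseGeometry(geom);                             /* the scene now holds its own reference */

  rtcCommitScene(scene);                                /* builds the BVH; required before tracing */
  return scene;
}
```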
*/
+RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene);
+
+/* Returns the axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o);
+
+/* Returns the linear axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o);
+
+
+/* Perform a closest point query of the scene. */
+RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 8 points with the scene. */
+RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 16 points with the scene. */
+RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Intersects a single ray with the scene. */
+RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit);
+
+/* Intersects a packet of 4 rays with the scene. */
+RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit);
+
+/* Intersects a packet of 8 rays with the scene. */
+RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit);
+
+/* Intersects a packet of 16 rays with the scene. */
+RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit);
+
+/* Intersects a stream of M rays with the scene. */
+RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of pointers to M rays with the scene. */
+RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M);
+
+/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
+RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
+RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N);
+
+/* Tests a single ray for occlusion with the scene. */
+RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray);
+
+/* Tests a packet of 4 rays for occlusion with the scene. */
+RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray);
+
+/* Tests a packet of 8 rays for occlusion with the scene. */
+RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray);
+
+/* Tests a packet of 16 rays for occlusion with the scene.
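The occluded entry points only answer a binary visibility question, so they take an RTCRay rather than an RTCRayHit and, on a hit, mark the ray by setting tfar to negative infinity. A shadow-ray sketch relying on that convention; the function name and epsilon are illustrative:

```
#include <embree3/rtcore.h>

int point_is_shadowed(RTCScene scene, const float org[3], const float to_light[3], float light_dist)
{
  struct RTCIntersectContext ctx;
  rtcInitIntersectContext(&ctx);

  struct RTCRay shadow;
  shadow.org_x = org[0];      shadow.org_y = org[1];      shadow.org_z = org[2];
  shadow.dir_x = to_light[0]; shadow.dir_y = to_light[1]; shadow.dir_z = to_light[2];
  shadow.tnear = 1e-4f;       /* small offset to avoid self-intersection */
  shadow.tfar  = light_dist;  /* only search up to the light */
  shadow.time  = 0.0f;
  shadow.mask  = 0xFFFFFFFFu;
  shadow.id    = 0;
  shadow.flags = 0;

  rtcOccluded1(scene, &ctx, &shadow);
  return shadow.tfar < 0.0f;  /* tfar is set to -inf when an occluder was found */
}
```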
*/ +RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray); + +/* Tests a stream of M rays for occlusion with the scene. */ +RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride); + +/* Tests a stream of pointers to M rays for occlusion with the scene. */ +RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M); + +/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ +RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride); + +/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ +RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N); + +/*! collision callback */ +struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; }; +typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions); + +/*! Performs collision detection of two scenes */ +RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr); + +#if defined(__cplusplus) + +/* Helper for easily combining scene flags */ +inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) { + return (RTCSceneFlags)((size_t)a | (size_t)b); +} + +#endif + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/kernels/builders/bvh_builder_hair.h b/thirdparty/embree/kernels/builders/bvh_builder_hair.h new file mode 100644 index 0000000000..d83e8918a1 --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_hair.h @@ -0,0 +1,411 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_sah.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_strand_array.h" + +#define NUM_HAIR_OBJECT_BINS 32 + +namespace embree +{ + namespace isa + { + struct BVHBuilderHair + { + /*! settings for builder */ + struct Settings + { + /*! 
default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t finished_range_threshold; //!< finished range threshold + }; + + template<typename NodeRef, + typename CreateAllocFunc, + typename CreateAABBNodeFunc, + typename SetAABBNodeFunc, + typename CreateOBBNodeFunc, + typename SetOBBNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor, + typename ReportFinishedRangeFunc> + + class BuilderT + { + ALIGNED_CLASS_(16); + friend struct BVHBuilderHair; + + typedef FastAllocator::CachedAllocator Allocator; + typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH; + typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH; + typedef HeuristicStrandSplit HeuristicStrandSplitSAH; + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + static const size_t travCostAligned = 1; + static const size_t travCostUnaligned = 5; + static const size_t intCost = 6; + + BuilderT (Scene* scene, + PrimRef* prims, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + const Settings settings) + + : cfg(settings), + prims(prims), + createAlloc(createAlloc), + createAABBNode(createAABBNode), + setAABBNode(setAABBNode), + createOBBNode(createOBBNode), + setOBBNode(setOBBNode), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + reportFinishedRange(reportFinishedRange), + alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {} + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const PrimInfoRange& range) + { + if (range.size() == 0) return true; + unsigned int firstGeomID = prims[range.begin()].geomID(); + for (size_t i=range.begin()+1; i<range.end(); i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /*! 
creates a large leaf that could be larger than supported by the BVH */ + NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo)) + return createLeaf(prims,pinfo,alloc); + + /* fill all children by always splitting the largest one */ + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + unsigned numChildren = 1; + children[0] = pinfo; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i])) + continue; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + __aligned(64) PrimInfoRange left, right; + if (!sameGeometry(children[bestChild])) { + alignedHeuristic.splitByGeometry(children[bestChild],left,right); + } else { + alignedHeuristic.splitFallback(children[bestChild],left,right); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* create node */ + auto node = createAABBNode(alloc); + + for (size_t i=0; i<numChildren; i++) { + const NodeRef child = createLargeLeaf(depth+1,children[i],alloc); + setAABBNode(node,i,child,children[i].geomBounds); + } + + return node; + } + + /*! 
performs split */ + __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize; + const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds); + + /* try standard binning in aligned space */ + float alignedObjectSAH = inf; + HeuristicBinningSAH::Split alignedObjectSplit; + if (aligned) { + alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize); + alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + } + + /* try standard binning in unaligned space */ + UnalignedHeuristicBinningSAH::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (bestSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpace(pinfo); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace); + unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH(); + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* try splitting into two strands */ + HeuristicStrandSplitSAH::Split strandSplit; + float strandSAH = inf; + if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) { + strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize); + strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH(); + bestSAH = min(strandSAH,bestSAH); + } + + /* fallback if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) + { + alignedHeuristic.deterministic_order(pinfo); + alignedHeuristic.splitFallback(pinfo,linfo,rinfo); + } + + /* perform aligned split if this is best */ + else if (bestSAH == alignedObjectSAH) { + alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo); + } + + /* perform unaligned split if this is best */ + else if (bestSAH == unalignedObjectSAH) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo); + aligned = false; + } + + /* perform strand split if this is best */ + else if (bestSAH == strandSAH) { + strandHeuristic.split(strandSplit,pinfo,linfo,rinfo); + aligned = false; + } + + /* can never happen */ + else + assert(false); + } + + /*! 
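For orientation, all three candidate splits above are scored with the same surface-area-heuristic shape: a traversal term proportional to the parent's half-area plus an intersection term summed over the candidate children, with the smallest cost winning against the leaf cost. An informal scalar sketch of that cost (not literal Embree code; the real splitSAH() works on binned block counts):

```
/* Informal SAH sketch: trav_cost/int_cost play the role of the travCostAligned,
   travCostUnaligned and intCost constants used by this builder. */
static float sah_cost(float parent_half_area,
                      float left_half_area,  float left_prims,
                      float right_half_area, float right_prims,
                      float trav_cost, float int_cost)
{
  return trav_cost * parent_half_area
       + int_cost  * (left_half_area * left_prims + right_half_area * right_prims);
}
```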
recursive build */ + NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(pinfo.size()); + + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + + /* create leaf node */ + if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) { + alignedHeuristic.deterministic_order(pinfo); + return createLargeLeaf(depth,pinfo,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + size_t numChildren = 1; + children[0] = pinfo; + bool aligned = true; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.minLeafSize) + continue; + + /* remember child with largest area */ + if (area(children[i].geomBounds) > bestArea) { + bestArea = area(children[i].geomBounds); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + PrimInfoRange left, right; + split(children[bestChild],left,right,aligned); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + NodeRef node; + + /* create aligned node */ + if (aligned) + { + node = createAABBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else { + for (size_t i=0; i<numChildren; i++) { + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds); + } + } + } + + /* create unaligned node */ + else + { + node = createOBBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space); + const OBBox3fa obounds(space,sinfo.geomBounds); + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... 
continue sequentially */ + else + { + for (size_t i=0; i<numChildren; i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space); + const OBBox3fa obounds(space,sinfo.geomBounds); + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds); + } + } + } + + /* reports a finished range of primrefs */ + if (unlikely(alloc_barrier)) + reportFinishedRange(pinfo); + + return node; + } + + private: + Settings cfg; + PrimRef* prims; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeFunc& createAABBNode; + const SetAABBNodeFunc& setAABBNode; + const CreateOBBNodeFunc& createOBBNode; + const SetOBBNodeFunc& setOBBNode; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + const ReportFinishedRangeFunc& reportFinishedRange; + + private: + HeuristicBinningSAH alignedHeuristic; + UnalignedHeuristicBinningSAH unalignedHeuristic; + HeuristicStrandSplitSAH strandHeuristic; + }; + + template<typename NodeRef, + typename CreateAllocFunc, + typename CreateAABBNodeFunc, + typename SetAABBNodeFunc, + typename CreateOBBNodeFunc, + typename SetOBBNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor, + typename ReportFinishedRangeFunc> + + static NodeRef build (const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + Scene* scene, + PrimRef* prims, + const PrimInfo& pinfo, + const Settings settings) + { + typedef BuilderT<NodeRef, + CreateAllocFunc, + CreateAABBNodeFunc,SetAABBNodeFunc, + CreateOBBNodeFunc,SetOBBNodeFunc, + CreateLeafFunc,ProgressMonitor, + ReportFinishedRangeFunc> Builder; + + Builder builder(scene,prims,createAlloc, + createAABBNode,setAABBNode, + createOBBNode,setOBBNode, + createLeaf,progressMonitor,reportFinishedRange,settings); + + NodeRef root = builder.recurse(1,pinfo,nullptr,true,false); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h new file mode 100644 index 0000000000..8f21e3254f --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h @@ -0,0 +1,501 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/builder.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderMorton + { + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + /*! settings for morton builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} + + /*! 
initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) + : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + /*! Build primitive consisting of morton code and primitive ID. */ + struct __aligned(8) BuildPrim + { + union { + struct { + unsigned int code; //!< morton code + unsigned int index; //!< i'th primitive + }; + uint64_t t; + }; + + /*! interface for radix sort */ + __forceinline operator unsigned() const { return code; } + + /*! interface for standard sort */ + __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } + }; + + /*! maps bounding box to morton code */ + struct MortonCodeMapping + { + static const size_t LATTICE_BITS_PER_DIM = 10; + static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; + + vfloat4 base; + vfloat4 scale; + + __forceinline MortonCodeMapping(const BBox3fa& bounds) + { + base = (vfloat4)bounds.lower; + const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; + scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); + } + + __forceinline const vint4 bin (const BBox3fa& box) const + { + const vfloat4 lower = (vfloat4)box.lower; + const vfloat4 upper = (vfloat4)box.upper; + const vfloat4 centroid = lower+upper; + return vint4((centroid-base)*scale); + } + + __forceinline unsigned int code (const BBox3fa& box) const + { + const vint4 binID = bin(box); + const unsigned int x = extract<0>(binID); + const unsigned int y = extract<1>(binID); + const unsigned int z = extract<2>(binID); + const unsigned int xyz = bitInterleave(x,y,z); + return xyz; + } + }; + +#if defined (__AVX2__) + + /*! for AVX2 there is a fast scalar bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest) {} + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + dest->index = index; + dest->code = mapping.code(b); + dest++; + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + }; + +#else + + /*! 
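The mapping above quantizes primitive centroids to a 1024x1024x1024 lattice and interleaves the three 10-bit coordinates into a single 30-bit Morton key, so that sorting by key clusters spatially nearby primitives. A scalar sketch of such an interleave (the builder itself uses the vectorized bitInterleave(); its exact channel ordering is an implementation detail):

```
/* Spread the low 10 bits of v so that two zero bits separate consecutive bits. */
static unsigned int part1by2(unsigned int v)
{
  v &= 0x000003ffu;
  v = (v ^ (v << 16)) & 0xff0000ffu;
  v = (v ^ (v <<  8)) & 0x0300f00fu;
  v = (v ^ (v <<  4)) & 0x030c30c3u;
  v = (v ^ (v <<  2)) & 0x09249249u;
  return v;
}

/* 30-bit Morton code from three 10-bit lattice coordinates. */
static unsigned int morton3d(unsigned int x, unsigned int y, unsigned int z)
{
  return (part1by2(z) << 2) | (part1by2(y) << 1) | part1by2(x);
}
```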
before AVX2 is it better to use the SSE version of bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} + + __forceinline ~MortonCodeGenerator() + { + if (slots != 0) + { + const vint4 code = bitInterleave(ax,ay,az); + for (size_t i=0; i<slots; i++) { + dest[currentID-slots+i].index = ai[i]; + dest[currentID-slots+i].code = code[i]; + } + } + } + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + const vint4 binID = mapping.bin(b); + ax[slots] = extract<0>(binID); + ay[slots] = extract<1>(binID); + az[slots] = extract<2>(binID); + ai[slots] = index; + slots++; + currentID++; + + if (slots == 4) + { + const vint4 code = bitInterleave(ax,ay,az); + vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); + vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); + slots = 0; + } + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + size_t slots; + vint4 ax, ay, az, ai; + }; + +#endif + + template< + typename ReductionTy, + typename Allocator, + typename CreateAllocator, + typename CreateNodeFunc, + typename SetNodeBoundsFunc, + typename CreateLeafFunc, + typename CalculateBounds, + typename ProgressMonitor> + + class BuilderT : private Settings + { + ALIGNED_CLASS_(16); + + public: + + BuilderT (CreateAllocator& createAllocator, + CreateNodeFunc& createNode, + SetNodeBoundsFunc& setBounds, + CreateLeafFunc& createLeaf, + CalculateBounds& calculateBounds, + ProgressMonitor& progressMonitor, + const Settings& settings) + + : Settings(settings), + createAllocator(createAllocator), + createNode(createNode), + setBounds(setBounds), + createLeaf(createLeaf), + calculateBounds(calculateBounds), + progressMonitor(progressMonitor), + morton(nullptr) {} + + ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.size() <= maxLeafSize) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + range<unsigned> children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + + do { + + /* find best child with largest number of primitives */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= maxLeafSize) + continue; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == size_t(-1)) break; + + /*! split best child into left and right child */ + auto split = children[bestChild].split(); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = split.first; + children[numChildren+0] = split.second; + numChildren++; + + } while (numChildren < branchingFactor); + + /* create node */ + auto node = createNode(alloc,numChildren); + + /* recurse into each child */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<numChildren; i++) + bounds[i] = createLargeLeaf(depth+1,children[i],alloc); + + return setBounds(node,bounds,numChildren); + } + + /*! 
recreates morton codes when reaching a region where all codes are identical */ + __noinline void recreateMortonCodes(const range<unsigned>& current) const + { + /* fast path for small ranges */ + if (likely(current.size() < 1024)) + { + /*! recalculate centroid bounds */ + BBox3fa centBounds(empty); + for (size_t i=current.begin(); i<current.end(); i++) + centBounds.extend(center2(calculateBounds(morton[i]))); + + /* recalculate morton codes */ + MortonCodeMapping mapping(centBounds); + for (size_t i=current.begin(); i<current.end(); i++) + morton[i].code = mapping.code(calculateBounds(morton[i])); + + /* sort morton codes */ + std::sort(morton+current.begin(),morton+current.end()); + } + else + { + /*! recalculate centroid bounds */ + auto calculateCentBounds = [&] ( const range<unsigned>& r ) { + BBox3fa centBounds = empty; + for (size_t i=r.begin(); i<r.end(); i++) + centBounds.extend(center2(calculateBounds(morton[i]))); + return centBounds; + }; + const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024), + BBox3fa(empty), calculateCentBounds, BBox3fa::merge); + + /* recalculate morton codes */ + MortonCodeMapping mapping(centBounds); + parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) { + for (size_t i=r.begin(); i<r.end(); i++) { + morton[i].code = mapping.code(calculateBounds(morton[i])); + } + }); + + /*! sort morton codes */ +#if defined(TASKING_TBB) + tbb::parallel_sort(morton+current.begin(),morton+current.end()); +#else + radixsort32(morton+current.begin(),current.size()); +#endif + } + } + + __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const + { + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + unsigned int bitpos = lzcnt(code_start^code_end); + + /* if all items mapped to same morton code, then re-create new morton codes for the items */ + if (unlikely(bitpos == 32)) + { + recreateMortonCodes(current); + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + bitpos = lzcnt(code_start^code_end); + + /* if the morton code is still the same, goto fall back split */ + if (unlikely(bitpos == 32)) { + current.split(left,right); + return; + } + } + + /* split the items at the topmost different morton code bit */ + const unsigned int bitpos_diff = 31-bitpos; + const unsigned int bitmask = 1 << bitpos_diff; + + /* find location where bit differs using binary search */ + unsigned begin = current.begin(); + unsigned end = current.end(); + while (begin + 1 != end) { + const unsigned mid = (begin+end)/2; + const unsigned bit = morton[mid].code & bitmask; + if (bit == 0) begin = mid; else end = mid; + } + unsigned center = end; +#if defined(DEBUG) + for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0); + for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask); +#endif + + left = make_range(current.begin(),center); + right = make_range(center,current.end()); + } + + ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAllocator(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= singleThreadThreshold) + progressMonitor(current.size()); + + /* create leaf node */ + if 
(unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) + return createLargeLeaf(depth,current,alloc); + + /* fill all children by always splitting the one with the largest surface area */ + range<unsigned> children[MAX_BRANCHING_FACTOR]; + split(current,children[0],children[1]); + size_t numChildren = 2; + + while (numChildren < branchingFactor) + { + /* find best child with largest number of primitives */ + int bestChild = -1; + unsigned bestItems = 0; + for (unsigned int i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= minLeafSize) + continue; + + /* remember child with largest area */ + if (children[i].size() > bestItems) { + bestItems = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + range<unsigned> left, right; + split(children[bestChild],left,right); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + } + + /* create leaf node if no split is possible */ + if (unlikely(numChildren == 1)) + return createLeaf(current,alloc); + + /* allocate node */ + auto node = createNode(alloc,numChildren); + + /* process top parts of tree parallel */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + if (current.size() > singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + bounds[i] = recurse(depth+1,children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + + /* finish tree sequentially */ + else + { + for (size_t i=0; i<numChildren; i++) + bounds[i] = recurse(depth+1,children[i],alloc,false); + } + + return setBounds(node,bounds,numChildren); + } + + /* build function */ + ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives) + { + /* sort morton codes */ + morton = src; + radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold); + + /* build BVH */ + const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + public: + CreateAllocator& createAllocator; + CreateNodeFunc& createNode; + SetNodeBoundsFunc& setBounds; + CreateLeafFunc& createLeaf; + CalculateBounds& calculateBounds; + ProgressMonitor& progressMonitor; + + public: + BuildPrim* morton; + }; + + + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetBoundsFunc, + typename CreateLeafFunc, + typename CalculateBoundsFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAllocator, + CreateNodeFunc createNode, + SetBoundsFunc setBounds, + CreateLeafFunc createLeaf, + CalculateBoundsFunc calculateBounds, + ProgressMonitor progressMonitor, + BuildPrim* src, + BuildPrim* tmp, + size_t numPrimitives, + const Settings& settings) + { + typedef BuilderT< + ReductionTy, + decltype(createAllocator()), + CreateAllocFunc, + CreateNodeFunc, + SetBoundsFunc, + CreateLeafFunc, + CalculateBoundsFunc, + ProgressMonitor> Builder; + + Builder builder(createAllocator, + createNode, + setBounds, + createLeaf, + calculateBounds, + progressMonitor, + settings); + + return builder.build(src,tmp,numPrimitives); + } + }; + } +} diff --git 
a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h new file mode 100644 index 0000000000..f9a08d65cd --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h @@ -0,0 +1,692 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define MBLUR_NUM_TEMPORAL_BINS 2 +#define MBLUR_NUM_OBJECT_BINS 32 + +#include "../bvh/bvh.h" +#include "../common/primref_mb.h" +#include "heuristic_binning_array_aligned.h" +#include "heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + template<typename T> + struct SharedVector + { + __forceinline SharedVector() {} + + __forceinline SharedVector(T* ptr, size_t refCount = 1) + : prims(ptr), refCount(refCount) {} + + __forceinline void incRef() { + refCount++; + } + + __forceinline void decRef() + { + if (--refCount == 0) + delete prims; + } + + T* prims; + size_t refCount; + }; + + template<typename BuildRecord, int MAX_BRANCHING_FACTOR> + struct LocalChildListT + { + typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; + + __forceinline LocalChildListT (const BuildRecord& record) + : numChildren(1), numSharedPrimVecs(1) + { + /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */ + children[0] = record; + primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); + } + + __forceinline ~LocalChildListT() + { + for (size_t i = 0; i < numChildren; i++) + primvecs[i]->decRef(); + } + + __forceinline BuildRecord& operator[] ( const size_t i ) { + return children[i]; + } + + __forceinline size_t size() const { + return numChildren; + } + + __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector) + { + SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; + if (lrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[bestChild] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims); + } + + if (rrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[numChildren] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims); + } + bsharedPrimVec->decRef(); + new_vector.release(); + + children[bestChild] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + public: + array_t<BuildRecord,MAX_BRANCHING_FACTOR> children; + array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs; + size_t numChildren; + + array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs; + size_t numSharedPrimVecs; + }; + + template<typename Mesh> + struct RecalculatePrimRef + { + Scene* scene; + + __forceinline RecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get<Mesh>(geomID); + const LBBox3fa lbounds = mesh->linearBounds(primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline PrimRefMB operator() (const PrimRefMB& 
prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get<Mesh>(geomID); + const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range); + } + }; + + struct VirtualRecalculatePrimRef + { + Scene* scene; + + __forceinline VirtualRecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range); + } + }; + + struct BVHBuilderMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! 
default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8), + travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false), + singleThreadThreshold(1024) {} + + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + bool singleLeafTimeSegment; //!< split time to single time range + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { + return a.prims.size() < b.prims.size(); + } + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< Depth of the root of this subtree. + SetMB prims; //!< The list of primitives. + }; + + struct BuildRecordSplit : public BuildRecord + { + __forceinline BuildRecordSplit () {} + + __forceinline BuildRecordSplit (size_t depth) + : BuildRecord(depth) {} + + __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split) + : BuildRecord(record), split(split) {} + + BinSplit<MBLUR_NUM_OBJECT_BINS> split; + }; + + template< + typename NodeRef, + typename RecalculatePrimRef, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; + typedef mvector<PrimRefMB>* PrimRefVector; + typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; + typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; + typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit; + + public: + + BuilderT (MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitor progressMonitor, + const Settings& settings) + : cfg(settings), + heuristicObjectSplit(), + heuristicTemporalSplit(device, recalculatePrimRef), + recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf), + 
progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + /*! finds the best split */ + const Split find(const SetMB& set) + { + /* first try standard object split */ + const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize); + const float object_split_sah = object_split.splitSAH(); + + /* test temporal splits only when object split was bad */ + const float leaf_sah = set.leafSAH(cfg.logBlockSize); + if (object_split_sah < 0.50f*leaf_sah) + return object_split; + + /* do temporal splits only if the time range is big enough */ + if (set.time_range.size() > 1.01f/float(set.max_num_time_segments)) + { + const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize); + const float temporal_split_sah = temporal_split.splitSAH(); + + /* take temporal split if it improved SAH */ + if (temporal_split_sah < object_split_sah) + return temporal_split; + } + + return object_split; + } + + /*! array partitioning */ + __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + /* perform object split */ + if (likely(split.data == Split::SPLIT_OBJECT)) { + heuristicObjectSplit.split(split,set,lset,rset); + } + /* perform temporal split */ + else if (likely(split.data == Split::SPLIT_TEMPORAL)) { + return heuristicTemporalSplit.split(split,set,lset,rset); + } + /* perform fallback split */ + else if (unlikely(split.data == Split::SPLIT_FALLBACK)) { + set.deterministic_order(); + splitFallback(set,lset,rset); + } + /* split by geometry */ + else if (unlikely(split.data == Split::SPLIT_GEOMID)) { + set.deterministic_order(); + splitByGeometry(set,lset,rset); + } + else + assert(false); + + return std::unique_ptr<mvector<PrimRefMB>>(); + } + + /*! finds the best fallback split */ + __noinline Split findFallback(const SetMB& set) + { + /* split if primitives are not from same geometry */ + if (!sameGeometry(set)) + return Split(0.0f,Split::SPLIT_GEOMID); + + /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */ + if (cfg.singleLeafTimeSegment) + { + /* test if one primitive has more than one time segment in time range, if so split time */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const PrimRefMB& prim = (*set.prims)[i]; + const range<int> itime_range = prim.timeSegmentRange(set.time_range); + const int localTimeSegments = itime_range.size(); + assert(localTimeSegments > 0); + if (localTimeSegments > 1) { + const int icenter = (itime_range.begin() + itime_range.end())/2; + const float splitTime = prim.timeStep(icenter); + return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); + } + } + } + + /* otherwise return fallback split */ + return Split(0.0f,Split::SPLIT_FALLBACK); + } + + /*! performs fallback split */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector<PrimRefMB>& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i<center; i++) + linfo.add_primref(prims[i]); + + PrimInfoMB rinfo = empty; + for (size_t i=center; i<end; i++) + rinfo.add_primref(prims[i]); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + /*! 
checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + if (set.size() == 0) return true; + mvector<PrimRefMB>& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + unsigned int firstGeomID = prims[begin].geomID(); + for (size_t i=begin+1; i<end; i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /* split by geometry ID */ + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + + mvector<PrimRefMB>& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + + PrimInfoMB left(empty); + PrimInfoMB right(empty); + unsigned int geomID = prims[begin].geomID(); + size_t center = serial_partitioning(prims.data(),begin,end,left,right, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); }); + + new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + + const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (in.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* replace already found split by fallback split */ + const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims)); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { + mvector<PrimRefMB>& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + bool hasTimeSplits = false; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildListSplit children(current); + + do { + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split) + continue; + + force_split = false; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecordSplit& brecord = children[bestChild]; + BuildRecordSplit lrecord(current.depth+1); + BuildRecordSplit rrecord(current.depth+1); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + + /* find new splits */ + lrecord.split = findFallback(lrecord.prims); + rrecord.split = findFallback(rrecord.prims); + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const 
BBox1f c = children[i].prims.time_range; + const BBox1f p = in.prims.time_range; + hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; + } + + /* create node */ + auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits); + + /* recurse into each child and perform reduction */ + LBBox3fa gbounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = createLargeLeaf(children[i],alloc); + gbounds.extend(values[i].lbounds); + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (hasTimeSplits) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= cfg.singleThreadThreshold) + progressMonitor(current.size()); + + /*! find best split */ + const Split csplit = find(current.prims); + + /*! compute leaf and split cost */ + const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); + const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH(); + assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + SetMB lprims,rprims; + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims); + bool hasTimeSplits = new_vector != nullptr; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + { + BuildRecord lrecord(lprims,current.depth+1); + BuildRecord rrecord(rprims,current.depth+1); + children.split(0,lrecord,rrecord,std::move(new_vector)); + } + + /*! split until node is full or SAH tells us to stop */ + while (children.size() < cfg.branchingFactor) + { + /*! 
find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i<children.size(); i++) + { + if (children[i].size() <= cfg.minLeafSize) continue; + if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) { + bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + Split csplit = find(brecord.prims); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + } + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; + } + + /* sort buildrecords for simpler shadow ray traversal */ + //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !! + + /*! create an inner node */ + auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits); + LBBox3fa gbounds = empty; + + /* spawn tasks */ + if (unlikely(current.size() > cfg.singleThreadThreshold)) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + /*! merge bounding boxes */ + for (size_t i=0; i<children.size(); i++) + gbounds.extend(values[i].lbounds); + } + /* recurse into each child */ + else + { + //for (size_t i=0; i<children.size(); i++) + for (ssize_t i=children.size()-1; i>=0; i--) { + values[i] = recurse(children[i],alloc,false); + gbounds.extend(values[i].lbounds); + } + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (unlikely(hasTimeSplits)) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + /*! 
builder entry function */ + __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) + { + const SetMB set(pinfo,&prims); + auto ret = recurse(BuildRecord(set,1),nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return ret; + } + + private: + Settings cfg; + HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit; + HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit; + const RecalculatePrimRef recalculatePrimRef; + const CreateAllocFunc createAlloc; + const CreateNodeFunc createNode; + const SetNodeFunc setNode; + const CreateLeafFunc createLeaf; + const ProgressMonitor progressMonitor; + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitorFunc> + + static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims, + const PrimInfoMB& pinfo, + MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitorFunc progressMonitor, + const Settings& settings) + { + typedef BuilderT< + NodeRef, + RecalculatePrimRef, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + SetNodeFunc, + CreateLeafFunc, + ProgressMonitorFunc> Builder; + + Builder builder(device, + recalculatePrimRef, + createAlloc, + createNode, + setNode, + createLeaf, + progressMonitor, + settings); + + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h new file mode 100644 index 0000000000..397e8636b1 --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h @@ -0,0 +1,526 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_msmblur.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderHairMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! 
default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< depth of the root of this subtree + SetMB prims; //!< the list of primitives + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateAABBNodeMBFunc, + typename SetAABBNodeMBFunc, + typename CreateOBBNodeMBFunc, + typename SetOBBNodeMBFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + typedef FastAllocator::CachedAllocator Allocator; + typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; + + typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal; + typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning; + typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning; + + public: + + BuilderT (Scene* scene, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + + : cfg(settings), + scene(scene), + recalculatePrimRef(recalculatePrimRef), + createAlloc(createAlloc), + createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB), + createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + unalignedHeuristic(scene), + temporalSplitHeuristic(scene->device,recalculatePrimRef) {} + + private: + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + mvector<PrimRefMB>& prims = *set.prims; + unsigned int firstGeomID = prims[set.begin()].geomID(); + for (size_t i=set.begin()+1; i<set.end(); i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /*! 
performs some split if SAH approaches fail */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector<PrimRefMB>& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i<center; i++) + linfo.add_primref(prims[i]); + + PrimInfoMB rinfo = empty; + for (size_t i=center; i<end; i++) + rinfo.add_primref(prims[i]); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB linfo(empty); + PrimInfoMB rinfo(empty); + unsigned int geomID = (*set.prims)[begin].geomID(); + size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); }); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + /*! creates a large leaf that could be larger than supported by the BVH */ + NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { + mvector<PrimRefMB>& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split) + return createLeaf(current.prims,alloc); + + /* fill all children by always splitting the largest one */ + LocalChildList children(current); + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split) + continue; + + force_split = false; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! 
split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!sameGeometry(children[bestChild].prims)) { + splitByGeometry(children[bestChild].prims,left.prims,right.prims); + } else { + splitFallback(children[bestChild].prims,left.prims,right.prims); + } + children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>()); + + } while (children.size() < cfg.branchingFactor); + + + /* detect time_ranges that have shrunken */ + bool timesplit = false; + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + timesplit |= c.lower > p.lower || c.upper < p.upper; + } + + /* create node */ + NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit); + + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = createLargeLeaf(children[i],alloc); + bounds.extend(values[i].lbounds); + } + + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + + if (timesplit) + bounds = current.prims.linearBounds(recalculatePrimRef); + + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + + /*! performs split */ + std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit) + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const float leafSAH = current.prims.leafSAH(cfg.logBlockSize); + + /* perform standard binning in aligned space */ + HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize); + float alignedObjectSAH = alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + + /* perform standard binning in unaligned space */ + UnalignedHeuristicBinning::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (alignedObjectSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims); + const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace); + unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */ + float temporal_split_sah = inf; + typename HeuristicTemporal::Split temporal_split; + if (bestSAH > 0.5f*leafSAH) { + if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) { + temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize); + temporal_split_sah = temporal_split.splitSAH(); + bestSAH = min(temporal_split_sah,bestSAH); + } + } + + /* perform fallback split if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) { + current.prims.deterministic_order(); + splitFallback(current.prims,lrecord.prims,rrecord.prims); + } + /* perform aligned split if this is best */ + else if (likely(bestSAH == alignedObjectSAH)) { + alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims); + } + /* perform unaligned split if this is best */ + else if (likely(bestSAH == unalignedObjectSAH)) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims); + aligned = false; + } + /* 
perform temporal split if this is best */ + else if (likely(bestSAH == temporal_split_sah)) { + timesplit = true; + return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims); + } + else + assert(false); + + return std::unique_ptr<mvector<PrimRefMB>>(); + } + + /*! recursive build */ + NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(current.size()); + + /* create leaf node */ + if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + bool aligned = true; + bool timesplit = false; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.minLeafSize) + continue; + + /* remember child with largest area */ + const float A = children[i].prims.halfArea(); + if (A > bestArea) { + bestArea = children[i].prims.halfArea(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit); + children.split(bestChild,left,right,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + timesplit |= c.lower > p.lower || c.upper < p.upper; + } + + /* create time split node */ + if (timesplit) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequential */ + else { + for (size_t i=0; i<children.size(); i++) { + values[i] = recurse(children[i],alloc,false); + } + } + + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + + const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + + /* create aligned node */ + else if (aligned) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... 
*/ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + LBBox3fa cbounds[MAX_BRANCHING_FACTOR]; + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + cbounds[i] = values[i].lbounds; + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) + bounds.extend(cbounds[i]); + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + /* ... continue sequentially */ + else + { + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = recurse(children[i],alloc,false); + bounds.extend(values[i].lbounds); + } + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + } + + /* create unaligned node */ + else + { + const NodeRef node = createOBBNodeMB(alloc); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims); + const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space); + const auto child = recurse(children[i],nullptr,true); + setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else + { + for (size_t i=0; i<children.size(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims); + const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space); + const auto child = recurse(children[i],alloc,false); + setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range); + } + } + + const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + } + + public: + + /*! 
entry point into builder */ + NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) + { + BuildRecord record(SetMB(pinfo,&prims),1); + auto root = recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + private: + Settings cfg; + Scene* scene; + const RecalculatePrimRef& recalculatePrimRef; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeMBFunc& createAABBNodeMB; + const SetAABBNodeMBFunc& setAABBNodeMB; + const CreateOBBNodeMBFunc& createOBBNodeMB; + const SetOBBNodeMBFunc& setOBBNodeMB; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + + private: + HeuristicBinning alignedHeuristic; + UnalignedHeuristicBinning unalignedHeuristic; + HeuristicTemporal temporalSplitHeuristic; + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateAABBNodeMBFunc, + typename SetAABBNodeMBFunc, + typename CreateOBBNodeMBFunc, + typename SetOBBNodeMBFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + { + typedef BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc, + CreateAABBNodeMBFunc,SetAABBNodeMBFunc, + CreateOBBNodeMBFunc,SetOBBNodeMBFunc, + CreateLeafFunc,ProgressMonitor> Builder; + + Builder builder(scene,recalculatePrimRef,createAlloc, + createAABBNodeMB,setAABBNodeMB, + createOBBNodeMB,setOBBNodeMB, + createLeaf,progressMonitor,settings); + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h new file mode 100644 index 0000000000..fff4bf2a35 --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h @@ -0,0 +1,669 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning_array_aligned.h" +#include "heuristic_spatial_array.h" +#include "heuristic_openmerge_array.h" + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL +# define NUM_OBJECT_BINS 16 +# define NUM_SPATIAL_BINS 16 +#else +# define NUM_OBJECT_BINS 32 +# define NUM_SPATIAL_BINS 16 +#endif + +namespace embree +{ + namespace isa + { + MAYBE_UNUSED static const float travCost = 1.0f; + MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + struct GeneralBVHBuilder + { + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + + /*! settings for SAH builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {} + + /*! 
initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(settings.sahBlockSize); + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; + if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost )) intCost = settings.intersectionCost; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + size_t primrefarrayalloc; //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished + }; + + /*! recursive state of builder */ + template<typename Set, typename Split> + struct BuildRecordT + { + public: + __forceinline BuildRecordT () {} + + __forceinline BuildRecordT (size_t depth) + : depth(depth), alloc_barrier(false), prims(empty) {} + + __forceinline BuildRecordT (size_t depth, const Set& prims) + : depth(depth), alloc_barrier(false), prims(prims) {} + + __forceinline BBox3fa bounds() const { return prims.geomBounds; } + + __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); } + __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size(); } + + __forceinline size_t size() const { return prims.size(); } + + public: + size_t depth; //!< Depth of the root of this subtree. + bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes + Set prims; //!< The list of primitives. 
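+        /* Note on `alloc_barrier` (descriptive comment, not upstream): the constructors
+           initialize it to false; the builder toggles it in recurse()/createLargeLeaf()
+           (see the "set barrier for primrefarrayalloc" code further below) once a child
+           subtree holds at most Settings::primrefarrayalloc primitives, so that the
+           primref-array block of that finished subtree can be reused to allocate nodes
+           and leaves. */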
+ }; + + template<typename PrimRef, typename Set> + struct DefaultCanCreateLeafFunc + { + __forceinline bool operator()(const PrimRef*, const Set&) const { return true; } + }; + + template<typename PrimRef, typename Set> + struct DefaultCanCreateLeafSplitFunc + { + __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { } + }; + + template<typename BuildRecord, + typename Heuristic, + typename Set, + typename PrimRef, + typename ReductionTy, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + class BuilderT + { + friend struct GeneralBVHBuilder; + + BuilderT (PrimRef* prims, + Heuristic& heuristic, + const CreateAllocFunc& createAlloc, + const CreateNodeFunc& createNode, + const UpdateNodeFunc& updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) : + cfg(settings), + prims(prims), + heuristic(heuristic), + createAlloc(createAlloc), + createNode(createNode), + updateNode(updateNode), + createLeaf(createLeaf), + canCreateLeaf(canCreateLeaf), + canCreateLeafSplit(canCreateLeafSplit), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims)) + return createLeaf(prims,current.prims,alloc); + + /* fill all children by always splitting the largest one */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + do { + + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims)) + continue; + + /* remember child with largest size */ + if (children[i].prims.size() > bestSize) { + bestSize = children[i].prims.size(); + bestChild = i; + } + } + if (bestChild == (size_t)-1) break; + + /*! 
split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!canCreateLeaf(prims,children[bestChild].prims)) { + canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims); + } else { + heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i<numChildren; i++) + children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc; + + /* create node */ + auto node = createNode(children,numChildren,alloc); + + /* recurse into each child and perform reduction */ + for (size_t i=0; i<numChildren; i++) + values[i] = createLargeLeaf(children[i],alloc); + + /* perform reduction */ + return updateNode(current,children,node,values,numChildren); + } + + const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= cfg.singleThreadThreshold) + progressMonitor(current.size()); + + /*! find best split */ + auto split = heuristic.find(current.prims,cfg.logBlockSize); + + /*! compute leaf and split cost */ + const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); + const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH(); + assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + heuristic.deterministic_order(current.prims); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + Set lprims,rprims; + heuristic.split(split,current.prims,lprims,rprims); + + /*! initialize child list with initial split */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + children[0] = BuildRecord(current.depth+1,lprims); + children[1] = BuildRecord(current.depth+1,rprims); + size_t numChildren = 2; + + /*! split until node is full or SAH tells us to stop */ + while (numChildren < cfg.branchingFactor) + { + /*! 
find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].prims.size() <= cfg.minLeafSize) continue; + + /* find child with largest surface area */ + if (halfArea(children[i].prims.geomBounds) > bestArea) { + bestChild = i; + bestArea = halfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + auto split = heuristic.find(brecord.prims,cfg.logBlockSize); + heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims); + children[bestChild ] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i<numChildren; i++) + children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc; + + /* sort buildrecords for faster shadow ray traversal */ + std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>()); + + /*! create an inner node */ + auto node = createNode(children,numChildren,alloc); + + /* spawn tasks */ + if (current.size() > cfg.singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + return updateNode(current,children,node,values,numChildren); + } + /* recurse into each child */ + else + { + for (size_t i=0; i<numChildren; i++) + values[i] = recurse(children[i],alloc,false); + + return updateNode(current,children,node,values,numChildren); + } + } + + private: + Settings cfg; + PrimRef* prims; + Heuristic& heuristic; + const CreateAllocFunc& createAlloc; + const CreateNodeFunc& createNode; + const UpdateNodeFunc& updateNode; + const CreateLeafFunc& createLeaf; + const CanCreateLeafFunc& canCreateLeaf; + const CanCreateLeafSplitFunc& canCreateLeafSplit; + const ProgressMonitor& progressMonitor; + }; + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + DefaultCanCreateLeafFunc<PrimRef, Set>, + DefaultCanCreateLeafSplitFunc<PrimRef, Set>, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + DefaultCanCreateLeafFunc<PrimRef, Set>(), + DefaultCanCreateLeafSplitFunc<PrimRef, Set>(), + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + 
_mm_mfence(); // to allow non-temporal stores during build + return root; + } + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + CanCreateLeafFunc, + CanCreateLeafSplitFunc, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + + /* SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedSAH + { + typedef PrimInfoRange Set; + typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic; + typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + + /*! 
special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + } + }; + + /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */ + struct BVHBuilderBinnedFastSpatialSAH + { + typedef PrimInfoExtRange Set; + typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split; + typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + static const unsigned int GEOMID_MASK = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + template<typename ReductionTy, typename UserCreateLeaf> + struct CreateLeafExt + { + __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf) + : userCreateLeaf(userCreateLeaf) {} + + // __noinline is workaround for ICC2016 compiler bug + template<typename Allocator> + __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const + { + for (size_t i=range.begin(); i<range.end(); i++) + prims[i].lower.u &= GEOMID_MASK; + + return userCreateLeaf(prims,range,alloc); + } + + const UserCreateLeaf userCreateLeaf; + }; + + /*! 
special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename SplitPrimitiveFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + SplitPrimitiveFunc splitPrimitive, + ProgressMonitor progressMonitor, + PrimRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic; + Heuristic heuristic(splitPrimitive,prims,pinfo); + + /* calculate total surface area */ // FIXME: this sum is not deterministic + const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double { + + double A = 0.0f; + for (size_t i=r.begin(); i<r.end(); i++) + { + PrimRef& prim = prims[i]; + A += area(prim.bounds()); + } + return A; + },std::plus<double>()); + + + /* calculate maximum number of spatial splits per primitive */ + const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1; + const float f = 10.0f; + + const float invA = 1.0f / A; + parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) { + + for (size_t i=r.begin(); i<r.end(); i++) + { + PrimRef& prim = prims[i]; + assert((prim.geomID() & SPLITS_MASK) == 0); + // FIXME: is there a better general heuristic ? + const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA); + unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf))); + prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + } + }); + + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf), + progressMonitor, + settings); + } + }; + + /* Open/Merge SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedOpenMergeSAH + { + static const size_t NUM_OBJECT_BINS_HQ = 32; + typedef PrimInfoExtRange Set; + typedef BinSplit<NUM_OBJECT_BINS_HQ> Split; + typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! 
special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename BuildRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename NodeOpenerFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + NodeOpenerFunc nodeOpenerFunc, + ProgressMonitor progressMonitor, + BuildRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic; + Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor); + + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h new file mode 100644 index 0000000000..ee29d09ac9 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_binning.h @@ -0,0 +1,496 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! mapping into bins */ + template<size_t BINS> + struct BinMapping + { + public: + __forceinline BinMapping() {} + + /*! calculates the mapping */ + __forceinline BinMapping(size_t N, const BBox3fa& centBounds) + { + num = min(BINS,size_t(4.0f + 0.05f*N)); + assert(num >= 1); + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + __forceinline BinMapping(const BBox3fa& centBounds) + { + num = BINS; + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + template<typename PrimInfo> + __forceinline BinMapping(const PrimInfo& pinfo) + { + const vfloat4 eps = 1E-34f; + num = min(BINS,size_t(4.0f + 0.05f*pinfo.size())); + const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) pinfo.centBounds.lower; + } + + /*! returns number of bins */ + __forceinline size_t size() const { return num; } + + /*! slower but safe binning */ + __forceinline Vec3ia bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); +#if 1 + assert(i[0] >= 0 && (size_t)i[0] < num); + assert(i[1] >= 0 && (size_t)i[1] < num); + assert(i[2] >= 0 && (size_t)i[2] < num); + return Vec3ia(i); +#else + return Vec3ia(clamp(i,vint4(0),vint4(num-1))); +#endif + } + + /*! faster but unsafe binning */ + __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const { + return Vec3ia(floori((vfloat4(p)-ofs)*scale)); + } + + /*! faster but unsafe binning */ + template<typename PrimRef> + __forceinline Vec3ia bin_unsafe(const PrimRef& p) const { + return bin_unsafe(p.binCenter()); + } + + /*! 
faster but unsafe binning */ + template<typename PrimRef, typename BinBoundsAndCenter> + __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const { + return bin_unsafe(binBoundsAndCenter.binCenter(p)); + } + + template<typename PrimRef> + __forceinline bool bin_unsafe(const PrimRef& ref, + const vint4& vSplitPos, + const vbool4& splitDimMask) const // FIXME: rename to isLeft + { + return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask); + } + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),1.0f / scale[dim],ofs[dim]); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) { + return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}"; + } + + public: + size_t num; + vfloat4 ofs,scale; //!< linear function that maps to bin ID + }; + + /*! stores all information to perform some split */ + template<size_t BINS> + struct BinSplit + { + enum + { + SPLIT_OBJECT = 0, + SPLIT_FALLBACK = 1, + SPLIT_ENFORCE = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already + SPLIT_TEMPORAL = 2, + SPLIT_GEOMID = 3, + }; + + /*! construct an invalid split by default */ + __forceinline BinSplit() + : sah(inf), dim(-1), pos(0), data(0) {} + + __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0) + : sah(sah), dim(dim), fpos(fpos), data(data) {} + + /*! constructs specified split */ + __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) { + return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + union { int pos; float fpos; }; //!< bin index for splitting + unsigned int data; //!< extra optional split data + BinMapping<BINS> mapping; //!< mapping into bins + }; + + /*! stores extended information about the split */ + template<typename BBox> + struct SplitInfoT + { + + __forceinline SplitInfoT () {} + + __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds) + : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {} + + public: + size_t leftCount,rightCount; + BBox leftBounds,rightBounds; + }; + + typedef SplitInfoT<BBox3fa> SplitInfo; + typedef SplitInfoT<LBBox3fa> SplitInfo2; + + /*! stores all binning information */ + template<size_t BINS, typename PrimRef, typename BBox> + struct __aligned(64) BinInfoT + { + typedef BinSplit<BINS> Split; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + __forceinline BinInfoT() { + } + + __forceinline BinInfoT(EmptyTy) { + clear(); + } + + /*! 
bin access function */ + __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; } + __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; } + + __forceinline unsigned int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; } + __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; } + + __forceinline vuint4 &counts(const size_t binID) { return _counts[binID]; } + __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; } + + /*! clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i<BINS; i++) { + bounds(i,0) = bounds(i,1) = bounds(i,2) = empty; + counts(i) = vuint4(zero); + } + } + + /*! bins an array of primitives */ + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping) + { + if (unlikely(N == 0)) return; + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i+0].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + BBox prim1; Vec3fa center1; + prims[i+1].binBoundsAndCenter(prim1,center1); + const vint4 bin1 = (vint4)mapping.bin(center1); + + /*! increase bounds for bins for even primitive */ + const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); + const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); + const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); + const unsigned int s0 = (unsigned int)prims[i+0].size(); + counts(b00,0)+=s0; + counts(b01,1)+=s0; + counts(b02,2)+=s0; + + /*! increase bounds of bins for odd primitive */ + const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1); + const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1); + const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1); + const unsigned int s1 = (unsigned int)prims[i+1].size(); + counts(b10,0)+=s1; + counts(b11,1)+=s1; + counts(b12,2)+=s1; + } + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = (unsigned int)prims[i].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + /*! bins an array of primitives */ + template<typename BinBoundsAndCenter> + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (N == 0) return; + + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); + const vint4 bin1 = (vint4)mapping.bin(center1); + + /*! 
increase bounds for bins for even primitive */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + + /*! increase bounds of bins for odd primitive */ + const unsigned int s1 = prims[i+1].size(); + const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1); + const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1); + const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1); + } + + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) { + bin(prims+begin,end-begin,mapping); + } + + template<typename BinBoundsAndCenter> + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) { + bin<BinBoundsAndCenter>(prims+begin,end-begin,mapping,binBoundsAndCenter); + } + + /*! merges in other binning information */ + __forceinline void merge (const BinInfoT& other, size_t numBins) + { + + for (size_t i=0; i<numBins; i++) + { + counts(i) += other.counts(i); + bounds(i,0).extend(other.bounds(i,0)); + bounds(i,1).extend(other.bounds(i,1)); + bounds(i,2).extend(other.bounds(i,2)); + } + } + + /*! reduces binning information */ + static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS) + { + BinInfoT c; + for (size_t i=0; i<numBins; i++) + { + c.counts(i) = a.counts(i)+b.counts(i); + c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0)); + c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1)); + c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2)); + } + return c; + } + + /*! 
finds the best split by scanning binning information */ + __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty; + for (size_t i=mapping.size()-1; i>0; i--) + { + count += counts(i); + rCounts[i] = count; + bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx); + by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by); + bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz); + rAreas[i][3] = 0.0f; + } + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i<mapping.size(); i++, ii+=1) + { + count += counts(i-1); + bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx); + by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by); + bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz); + const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); + const vfloat4 rArea = rAreas[i]; + const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions. + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + + vbestPos = select(sah < vbestSAH,ii ,vbestPos); + vbestSAH = select(sah < vbestSAH,sah,vbestSAH); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + } + } + return Split(bestSAH,bestDim,bestPos,mapping); + } + + /*! calculates extended split information */ + __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const + { + if (split.dim == -1) { + new (&info) SplitInfoT<BBox>(0,empty,0,empty); + return; + } + + size_t leftCount = 0; + BBox leftBounds = empty; + for (size_t i=0; i<(size_t)split.pos; i++) { + leftCount += counts(i,split.dim); + leftBounds.extend(bounds(i,split.dim)); + } + size_t rightCount = 0; + BBox rightBounds = empty; + for (size_t i=split.pos; i<mapping.size(); i++) { + rightCount += counts(i,split.dim); + rightBounds.extend(bounds(i,split.dim)); + } + new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds); + } + + /*! gets the number of primitives left of the split */ + __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t leftCount = 0; + for (size_t i = 0; i < (size_t)split.pos; i++) { + leftCount += counts(i, split.dim); + } + return leftCount; + } + + /*! 
gets the number of primitives right of the split */ + __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t rightCount = 0; + for (size_t i = (size_t)split.pos; i<mapping.size(); i++) { + rightCount += counts(i, split.dim); + } + return rightCount; + } + + private: + BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 _counts[BINS]; //!< counts number of primitives that map into the bins + }; + } + + template<typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h new file mode 100644 index 0000000000..ab3b97efb9 
--- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h @@ -0,0 +1,200 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t> + { + __forceinline PrimInfoRange () { + } + + __forceinline PrimInfoRange(const PrimInfo& pinfo) + : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {} + + __forceinline PrimInfoRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {} + + __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + } + }; + + /*! Performs standard object binning */ + template<typename PrimRef, size_t BINS> + struct HeuristicArrayBinningSAH + { + typedef BinSplit<BINS> Split; + typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner; + typedef range<size_t> Set; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + __forceinline HeuristicArrayBinningSAH () + : prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayBinningSAH (PrimRef* prims) + : prims(prims) {} + + /*! finds the best split */ + __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + return find_template<false>(pinfo,logBlockSize); + else + return find_template<true>(pinfo,logBlockSize); + } + + template<bool parallel> + __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<BINS> mapping(pinfo); + bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping); + return binner.best(mapping,logBlockSize); + } + + /*! 
array partitioning */ + __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + split_template<false>(split,pinfo,linfo,rinfo); + else + split_template<true>(split,pinfo,linfo,rinfo); + } + + template<bool parallel> + __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename Binner::vint vSplitPos(splitPos); + const typename Binner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + size_t center = 0; + if (!parallel) + center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning( + prims,begin,end,EmptyTy(),local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const PrimInfoRange& pinfo) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]); + } + + void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + const size_t begin = pinfo.begin(); + const size_t end = pinfo.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend_center2(prims[i]); + new (&linfo) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend_center2(prims[i]); + new (&rinfo) PrimInfoRange(center,end,right); + } + + void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + assert(range.size() > 1); + CentGeomBBox3fa left(empty); + CentGeomBBox3fa right(empty); + unsigned int geomID = prims[range.begin()].geomID(); + size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right, + [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; }, + [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); }); + + new (&linfo) PrimInfoRange(range.begin(),center,left); + new (&rinfo) PrimInfoRange(center,range.end(),right); + } + + private: + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template<typename PrimRefMB, size_t BINS> + struct HeuristicArrayBinningMB + { + typedef BinSplit<BINS> Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner; + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + /*! 
finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + ObjectBinner binner(empty); + const BinMapping<BINS> mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h new file mode 100644 index 0000000000..34a7f121bb --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h @@ -0,0 +1,302 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename PrimRef, size_t BINS> + struct UnalignedHeuristicArrayBinningSAH + { + typedef BinSplit<BINS> Split; + typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner; + typedef range<size_t> Set; + + __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + const LinearSpace3fa computeAlignedSpace(const range<size_t>& set) + { + Vec3fa axis(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! 
find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const unsigned int geomID = prims[i].geomID(); + const unsigned int primID = prims[i].primID(); + const uint64_t geomprimID = prims[i].ID64(); + if (geomprimID >= bestGeomPrimID) continue; + const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID); + if (sqr_length(axis1) > 1E-18f) { + axis = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + return frame(axis).transposed(); + } + + const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space) + { + auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + Geometry* mesh = scene->get(prims[i].geomID()); + bounds.extend(mesh->vbounds(space,prims[i].primID())); + } + return bounds; + }; + + const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), + CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + return PrimInfo(set.begin(),set.end(),bounds); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space) + : scene(scene), space(space) {} + + /*! returns center for binning */ + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + return embree::center2(bounds); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + bounds_o = bounds; + center_o = embree::center2(bounds); + } + + private: + Scene* scene; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space) + { + if (likely(pinfo.size() < 10000)) + return find_template<false>(pinfo,logBlockSize,space); + else + return find_template<true>(pinfo,logBlockSize,space); + } + + /*! finds the best split */ + template<bool parallel> + const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + Binner binner(empty); + const BinMapping<BINS> mapping(set); + BinBoundsAndCenter binBoundsAndCenter(scene,space); + bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (likely(set.size() < 10000)) + split_template<false>(split,space,set,lset,rset); + else + split_template<true>(split,space,set,lset,rset); + } + + /*! 
array partitioning */ + template<bool parallel> + __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const int splitPos = split.pos; + const int splitDim = split.dim; + BinBoundsAndCenter binBoundsAndCenter(scene,space); + + size_t center = 0; + if (likely(set.size() < 10000)) + center = serial_partitioning(prims,begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + 128); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const range<size_t>& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend_center2(prims[i]); + new (&lset) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend_center2(prims[i]); + new (&rset) PrimInfoRange(center,end,right); + } + + private: + Scene* const scene; + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template<typename PrimRefMB, size_t BINS> + struct UnalignedHeuristicArrayBinningMB + { + typedef BinSplit<BINS> Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + UnalignedHeuristicArrayBinningMB(Scene* scene) + : scene(scene) {} + + const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! 
find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const PrimRefMB& prim = (*set.prims)[i]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const uint64_t geomprimID = prim.ID64(); + if (geomprimID >= bestGeomPrimID) continue; + + const Geometry* mesh = scene->get(geomID); + const range<int> tbounds = mesh->timeSegmentRange(set.time_range); + if (tbounds.size() == 0) continue; + + const size_t t = (tbounds.begin()+tbounds.end())/2; + const Vec3fa axis1 = mesh->computeDirection(primID,t); + if (sqr_length(axis1) > 1E-18f) { + axis0 = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + + return frame(axis0).transposed(); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space) + : scene(scene), time_range(time_range), space(space) {} + + /*! returns center for binning */ + template<typename PrimRef> + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds.interpolate(0.5f); + center_o = center2(bounds_o); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds; + center_o = center2(lbounds.interpolate(0.5f)); + } + + private: + Scene* scene; + BBox1f time_range; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + ObjectBinner binner(empty); + const BinMapping<BINS> mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! 
array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + + private: + Scene* scene; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h new file mode 100644 index 0000000000..4249d16ea1 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h @@ -0,0 +1,443 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +// TODO: +// - adjust parallel build thresholds +// - openNodesBasedOnExtend should consider max extended size + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +/* stop opening of all bref.geomIDs are the same */ +#define EQUAL_GEOMID_STOP_CRITERIA 1 + +/* 10% spatial extend threshold */ +#define MAX_EXTEND_THRESHOLD 0.1f + +/* maximum is 8 children */ +#define MAX_OPENED_CHILD_NODES 8 + +/* open until all build refs are below threshold size in one step */ +#define USE_LOOP_OPENING 0 + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS> + struct HeuristicArrayOpenMergeSAH + { + typedef BinSplit<OBJECT_BINS> Split; + typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner; + + static const size_t PARALLEL_THRESHOLD = 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 512; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 128; + + __forceinline HeuristicArrayOpenMergeSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size) + : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) + { + assert(max_open_size <= MAX_OPENED_CHILD_NODES); + } + + struct OpenHeuristic + { + __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo ) + { + const Vec3fa diag = pinfo.geomBounds.size(); + dim = maxDim(diag); + assert(diag[dim] > 0.0f); + inv_max_extend = 1.0f / diag[dim]; + } + + __forceinline bool operator () ( PrimRef& prim ) const { + return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD; + } + + private: + size_t dim; + float inv_max_extend; + }; + + /*! 
compute extended ranges */ + __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! move ranges */ + __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+right_size] = prims0[i]; + }); + } + else + { + /* no overlap, move entire right range to new location, can be made fully parallel */ + parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+left_ext_range_size] = prims0[i]; + }); + } + /* update right range */ + assert(rset.ext_end() + left_ext_range_size == set.ext_end()); + rset.move_right(left_ext_range_size); + } + } + + /* estimates the extra space required when opening, and checks if all primitives are from same geometry */ + __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const unsigned int geomID = prims0[set.begin()].geomID(); + + auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> { + bool commonGeomID = true; + size_t opens = 0; + for (size_t i=r.begin(); i<r.end(); i++) { + commonGeomID &= prims0[i].geomID() == geomID; + if (heuristic(prims0[i])) + opens += prims0[i].node.getN()-1; // coarse approximation + } + return std::pair<size_t,bool>(opens,commonGeomID); + }; + auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> { + return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second); + }; + return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction); + } + + // FIXME: should consider maximum available extended size + __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const size_t ext_range_start = set.end(); + + if (false && set.size() < PARALLEL_THRESHOLD) + { + size_t extra_elements = 0; + for (size_t i=set.begin(); i<set.end(); i++) + { + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + assert(extra_elements + n-1 <= set.ext_range_size()); + for (size_t j=0; j<n; j++) + set.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1; j<n; j++) + prims0[ext_range_start+extra_elements+j-1] = tmp[j]; + extra_elements += n-1; + } + } + set._end += extra_elements; + } + else + { + 
std::atomic<size_t> ext_elements; + ext_elements.store(0); + PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo { + PrimInfo info(empty); + for (size_t i=r.begin(); i<r.end(); i++) + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + const size_t ID = ext_elements.fetch_add(n-1); + assert(ID + n-1 <= set.ext_range_size()); + + for (size_t j=0; j<n; j++) + info.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1; j<n; j++) + prims0[ext_range_start+ID+j-1] = tmp[j]; + } + return info; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); + set.centBounds.extend(info.centBounds); + assert(ext_elements.load() <= set.ext_range_size()); + set._end += ext_elements.load(); + } + } + + __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements) + { + const OpenHeuristic heuristic(set); + size_t next_iteration_extra_elements = est_new_elements; + + while (next_iteration_extra_elements <= set.ext_range_size()) + { + next_iteration_extra_elements = 0; + size_t extra_elements = 0; + const size_t ext_range_start = set.end(); + + for (size_t i=set.begin(); i<set.end(); i++) + { + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + assert(extra_elements + n-1 <= set.ext_range_size()); + for (size_t j=0;j<n;j++) + set.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1;j<n;j++) + prims0[ext_range_start+extra_elements+j-1] = tmp[j]; + extra_elements += n-1; + + for (size_t j=0; j<n; j++) + if (heuristic(tmp[j])) + next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation + + } + } + assert( extra_elements <= set.ext_range_size()); + set._end += extra_elements; + + for (size_t i=set.begin();i<set.end();i++) + assert(prims0[i].numPrimitives() > 0); + + if (unlikely(next_iteration_extra_elements == 0)) break; + } + } + + __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize) + { + /* single element */ + if (set.size() <= 1) + return Split(); + + /* disable opening if there is no overlap */ + const size_t D = 4; + if (unlikely(set.has_ext_range() && set.size() <= D)) + { + bool disjoint = true; + for (size_t j=set.begin(); j<set.end()-1; j++) { + for (size_t i=set.begin()+1; i<set.end(); i++) { + if (conjoint(prims0[j].bounds(),prims0[i].bounds())) { + disjoint = false; break; + } + } + } + if (disjoint) set.set_ext_range(set.end()); /* disables opening */ + } + + std::pair<size_t,bool> p(0,false); + + /* disable opening when all primitives are from same geometry */ + if (unlikely(set.has_ext_range())) + { + p = getProperties(set); +#if EQUAL_GEOMID_STOP_CRITERIA == 1 + if (p.second) set.set_ext_range(set.end()); /* disable opening */ +#endif + } + + /* open nodes when we have sufficient space available */ + if (unlikely(set.has_ext_range())) + { +#if USE_LOOP_OPENING == 1 + openNodesBasedOnExtendLoop(set,p.first); +#else + if (p.first <= set.ext_range_size()) + openNodesBasedOnExtend(set); +#endif + + /* disable opening when unsufficient space for opening a node available */ + if (set.ext_range_size() < max_open_size-1) + set.set_ext_range(set.end()); /* disable opening */ + } + + /* find best split */ + return object_find(set,logBlockSize); + } + + + /*! 
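For reference, a serial sketch of the opening step performed by `openNodesBasedOnExtend()` above. The `Ref` type, the `shouldOpen`/`open` callbacks and the fixed fan-out of 8 stand in for Embree's `PrimRef`, `OpenHeuristic` and `nodeOpenerFunc`; the real code additionally runs the loop in parallel and rebuilds the centroid bounds while it opens.

```
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <functional>
#include <vector>

struct Ref { int payload = 0; };   // stand-in for Embree's PrimRef

// Opens every oversized reference: child 0 overwrites the original slot, the
// remaining children are appended to the spare ("extended") region behind `end`,
// with an atomic counter reserving the spare slots.
std::size_t openOversizedRefs(std::vector<Ref>& prims,
                              std::size_t begin, std::size_t end, std::size_t extEnd,
                              const std::function<bool(const Ref&)>& shouldOpen,
                              const std::function<std::size_t(const Ref&, Ref*)>& open)
{
    std::atomic<std::size_t> extra{0};               // spare slots consumed so far
    for (std::size_t i = begin; i < end; ++i) {
        if (!shouldOpen(prims[i])) continue;
        Ref children[8];                             // MAX_OPENED_CHILD_NODES
        const std::size_t n  = open(prims[i], children);
        const std::size_t id = extra.fetch_add(n - 1);
        if (end + id + (n - 1) > extEnd) break;      // spare region exhausted
        prims[i] = children[0];                      // first child replaces the parent
        for (std::size_t j = 1; j < n; ++j)
            prims[end + id + j - 1] = children[j];   // the rest go into the spare region
    }
    return std::min(end + extra.load(), extEnd);     // new end of the in-use range
}
```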
finds the best object split */ + __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize); + else return parallel_object_find (set,logBlockSize); + } + + /*! finds the best object split */ + __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set.centBounds); + binner.bin(prims0,set.begin(),set.end(),mapping); + return binner.best(mapping,logBlockSize); + } + + /*! finds the best split */ + __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set.centBounds); + const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + auto body = [&] (const range<size_t>& r) -> Binner { + Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; + }; + auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { + Binner r = b0; r.merge(b1,_mapping.size()); return r; + }; + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + splitFallback(set,lset,rset); + return; + } + + std::pair<size_t,size_t> ext_weights(0,0); + + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split,set,lset,rset); + else + ext_weights = parallel_object_split(split,set,lset,rset); + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! array partitioning */ + std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }); + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(local_left.size(),local_right.size()); + } + + /*! 
array partitioning */ + __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }, + [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + + return std::pair<size_t,size_t>(left.size(),right.size()); + } + + void deterministic_order(const extended_range<size_t>& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i<center; i++) + left.add_center2(prims0[i]); + + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i<end; i++) + right.add_center2(prims0[i]); + + const size_t rweight = right.end; + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) + { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const NodeOpenerFunc& nodeOpenerFunc; + size_t max_open_size; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h new file mode 100644 index 0000000000..a6939ba258 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h @@ -0,0 +1,414 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "priminfo.h" + +namespace embree +{ + static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5; + + namespace isa + { + + /*! mapping into bins */ + template<size_t BINS> + struct SpatialBinMapping + { + public: + __forceinline SpatialBinMapping() {} + + /*! calculates the mapping */ + __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo) + { + const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower; + const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper; + const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper)); + const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size()); + scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag); + ofs = (vfloat4) pinfo.geomBounds.lower; + inv_scale = 1.0f / scale; + } + + /*! 
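The `SpatialBinMapping` constructor above amounts to the linear bin mapping sketched below, scalar instead of SSE and without the `128*ulp` epsilon the real code adds to guard the upper bound against roundoff; the type and parameter names are illustrative only.

```
#include <algorithm>
#include <cmath>

// Maps positions inside the given bounds linearly onto BINS bins, clamped to
// [0, BINS-1]; a degenerate axis maps everything to bin 0 (scale = 0).
template <int BINS>
struct BinMappingSketch {
    float ofs[3];
    float scale[3];

    BinMappingSketch(const float lower[3], const float upper[3]) {
        for (int d = 0; d < 3; ++d) {
            const float diag = upper[d] - lower[d];
            ofs[d]   = lower[d];
            scale[d] = diag > 0.0f ? float(BINS) / diag : 0.0f;
        }
    }

    int bin(float p, int dim) const {
        const int i = int(std::floor((p - ofs[dim]) * scale[dim]));
        return std::clamp(i, 0, BINS - 1);
    }
};
```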
slower but safe binning */ + __forceinline vint4 bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); + return clamp(i,vint4(0),vint4(BINS-1)); + } + + __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const + { +#if defined(__AVX__) + const vfloat8 ofs8(ofs); + const vfloat8 scale8(scale); + const vint8 lu = floori((vfloat8::loadu(&b)-ofs8)*scale8); + const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1)); + return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu)); +#else + const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale); + const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale); + const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1)); + const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1)); + return std::pair<vint4,vint4>(c_lower,c_upper); +#endif + } + + + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),inv_scale[dim],ofs[dim]); + } + + /*! calculates left spatial position of bin */ + template<size_t N> + __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const { + return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim])); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + public: + vfloat4 ofs,scale,inv_scale; //!< linear function that maps to bin ID + }; + + /*! stores all information required to perform some split */ + template<size_t BINS> + struct SpatialBinSplit + { + /*! construct an invalid split by default */ + __forceinline SpatialBinSplit() + : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) { + return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + int pos; //!< split position + int left; //!< number of elements on the left side + int right; //!< number of elements on the right side + float factor; //!< factor splitting the extended range + SpatialBinMapping<BINS> mapping; //!< mapping into bins + }; + + /*! stores all binning information */ + template<size_t BINS, typename PrimRef> + struct __aligned(64) SpatialBinInfo + { + SpatialBinInfo() { + } + + __forceinline SpatialBinInfo(EmptyTy) { + clear(); + } + + /*! clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i<BINS; i++) { + bounds[i][0] = bounds[i][1] = bounds[i][2] = empty; + numBegin[i] = numEnd[i] = 0; + } + } + + /*! 
adds binning data */ + __forceinline void add(const size_t dim, + const size_t beginID, + const size_t endID, + const size_t binID, + const BBox3fa &b, + const size_t n = 1) + { + assert(beginID < BINS); + assert(endID < BINS); + assert(binID < BINS); + + numBegin[beginID][dim]+=(unsigned int)n; + numEnd [endID][dim]+=(unsigned int)n; + bounds [binID][dim].extend(b); + } + + /*! extends binning bounds */ + __forceinline void extend(const size_t dim, + const size_t binID, + const BBox3fa &b) + { + assert(binID < BINS); + bounds [binID][dim].extend(b); + } + + /*! bins an array of triangles */ + template<typename SplitPrimitive> + __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=0; i<N; i++) + { + const PrimRef prim = prims[i]; + unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (unlikely(splits == 1)) + { + const vint4 bin = mapping.bin(center(prim.bounds())); + for (size_t dim=0; dim<3; dim++) + { + assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); + numBegin[bin[dim]][dim]++; + numEnd [bin[dim]][dim]++; + bounds [bin[dim]][dim].extend(prim.bounds()); + } + } + else + { + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + size_t bin; + PrimRef rest = prim; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + numBegin[l][dim]++; + numEnd [l][dim]++; + bounds [l][dim].extend(prim.bounds()); + continue; + } + + for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) + { + const float pos = mapping.pos(bin+1,dim); + + PrimRef left,right; + splitPrimitive(rest,(int)dim,pos,left,right); + if (unlikely(left.bounds().empty())) l++; + bounds[bin][dim].extend(left.bounds()); + rest = right; + } + if (unlikely(rest.bounds().empty())) r--; + numBegin[l][dim]++; + numEnd [r][dim]++; + bounds [bin][dim].extend(rest.bounds()); + } + } + } + } + + /*! bins a range of primitives inside an array */ + template<typename SplitPrimitive> + void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { + bin(splitPrimitive,prims+begin,end-begin,mapping); + } + + /*! bins an array of primitives */ + template<typename PrimitiveSplitterFactory> + __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=begin; i<end; i++) + { + const PrimRef &prim = source[i]; + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + if (unlikely(mapping.invalid(dim))) + continue; + + size_t bin; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + add(dim,l,l,l,prim.bounds()); + continue; + } + const size_t bin_start = bin0[dim]; + const size_t bin_end = bin1[dim]; + BBox3fa rest = prim.bounds(); + const auto splitter = splitterFactory(prim); + for (bin=bin_start; bin<bin_end; bin++) + { + const float pos = mapping.pos(bin+1,dim); + BBox3fa left,right; + splitter(rest,dim,pos,left,right); + if (unlikely(left.empty())) l++; + extend(dim,bin,left); + rest = right; + } + if (unlikely(rest.empty())) r--; + add(dim,l,r,bin,rest); + } + } + } + + + + /*! 
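The clip-and-bin loop in `bin()`/`bin2()` above reduces, per axis, to the sketch below: a box spanning bins `[l, r]` is clipped at every interior bin plane, each fragment grows that bin's bounds, and only the leftmost/rightmost non-empty bins receive begin/end counts. Scalar intervals replace `BBox3fa` and a plain clip at the bin plane replaces the primitive splitter; all names are illustrative.

```
#include <algorithm>
#include <cstddef>
#include <limits>

struct Interval {
    float lo, hi;
    Interval() : lo(std::numeric_limits<float>::infinity()),
                 hi(-std::numeric_limits<float>::infinity()) {}
    Interval(float l, float h) : lo(l), hi(h) {}
    bool empty() const { return lo > hi; }
    void extend(const Interval& o) { lo = std::min(lo, o.lo); hi = std::max(hi, o.hi); }
};

// binOf(x): bin index of a coordinate; binPos(i): left plane of bin i.
template <int BINS, typename BinOf, typename BinPos>
void binBoxSpatially(Interval box, Interval (&bounds)[BINS],
                     unsigned (&numBegin)[BINS], unsigned (&numEnd)[BINS],
                     BinOf binOf, BinPos binPos)
{
    int l = binOf(box.lo), r = binOf(box.hi);
    if (l == r) { numBegin[l]++; numEnd[l]++; bounds[l].extend(box); return; }

    int bin = l;
    for (; bin < r; ++bin) {
        const float pos = binPos(bin + 1);
        const Interval left(box.lo, std::min(box.hi, pos));
        if (left.empty()) l++;                 // clipped fragment vanished: shift start bin
        else bounds[bin].extend(left);
        box.lo = std::max(box.lo, pos);        // keep the remainder right of the plane
    }
    if (box.empty()) r--;                      // last fragment vanished: shift end bin
    else bounds[bin].extend(box);
    numBegin[l]++;
    numEnd[r]++;
}
```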
bins an array of primitives */ + __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=begin; i<end; i++) + { + const PrimRef &prim = source[i]; + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + if (unlikely(mapping.invalid(dim))) + continue; + + const size_t l = bin0[dim]; + const size_t r = bin1[dim]; + + const unsigned int n = prim.primID(); + + // same bin optimization + if (likely(l == r)) + { + add(dim,l,l,l,prim.bounds(),n); + continue; + } + const size_t bin_start = bin0[dim]; + const size_t bin_end = bin1[dim]; + for (size_t bin=bin_start; bin<bin_end; bin++) + add(dim,l,r,bin,prim.bounds(),n); + } + } + } + + /*! merges in other binning information */ + void merge (const SpatialBinInfo& other) + { + for (size_t i=0; i<BINS; i++) + { + numBegin[i] += other.numBegin[i]; + numEnd [i] += other.numEnd [i]; + bounds[i][0].extend(other.bounds[i][0]); + bounds[i][1].extend(other.bounds[i][1]); + bounds[i][2].extend(other.bounds[i][2]); + } + } + + /*! merges in other binning information */ + static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b) + { + SpatialBinInfo c(empty); + for (size_t i=0; i<BINS; i++) + { + c.numBegin[i] += a.numBegin[i]+b.numBegin[i]; + c.numEnd [i] += a.numEnd [i]+b.numEnd [i]; + c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]); + c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]); + c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]); + } + return c; + } + + /*! finds the best split by scanning binning information */ + SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty; + for (size_t i=BINS-1; i>0; i--) + { + count += numEnd[i]; + rCounts[i] = count; + bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx); + by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by); + bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz); + rAreas[i][3] = 0.0f; + } + + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i<BINS; i++, ii+=1) + { + count += numBegin[i-1]; + bx.extend(bounds[i-1][0]); float Ax = halfArea(bx); + by.extend(bounds[i-1][1]); float Ay = halfArea(by); + bz.extend(bounds[i-1][2]); float Az = halfArea(bz); + const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); + const vfloat4 rArea = rAreas[i]; + const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + const vbool4 mask = sah < vbestSAH; + vbestPos = select(mask,ii ,vbestPos); + vbestSAH = select(mask,sah,vbestSAH); + vbestlCount = select(mask,count,vbestlCount); + vbestrCount = select(mask,rCounts[i],vbestrCount); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + unsigned int 
bestlCount = 0; + unsigned int bestrCount = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + bestlCount = vbestlCount[dim]; + bestrCount = vbestrCount[dim]; + } + } + assert(bestSAH >= 0.0f); + + /* return invalid split if no split found */ + if (bestDim == -1) + return SpatialBinSplit<BINS>(inf,-1,0,mapping); + + /* return best found split */ + return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping); + } + + private: + BBox3fa bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 numBegin[BINS]; //!< number of primitives starting in bin + vuint4 numEnd[BINS]; //!< number of primitives ending in bin + }; + } +} + diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h new file mode 100644 index 0000000000..60d235f48d --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h @@ -0,0 +1,546 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +namespace embree +{ + namespace isa + { +#if 0 +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f +#else +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f +#endif + + struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t> + { + __forceinline PrimInfoExtRange() { + } + + __forceinline PrimInfoExtRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {} + + __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + } + }; + + template<typename ObjectSplit, typename SpatialSplit> + struct Split2 + { + __forceinline Split2 () {} + + __forceinline Split2 (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + } + + __forceinline Split2& operator= (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + return *this; + } + + __forceinline ObjectSplit& objectSplit() { return *( ObjectSplit*)data; } + __forceinline const ObjectSplit& objectSplit() const { return *(const ObjectSplit*)data; } + + __forceinline SpatialSplit& spatialSplit() { return *( SpatialSplit*)data; } + __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; } + + __forceinline Split2 (const ObjectSplit& objectSplit, float sah) + : spatial(false), sah(sah) + { + new (data) ObjectSplit(objectSplit); + } + + __forceinline Split2 (const SpatialSplit& spatialSplit, float sah) + : spatial(true), sah(sah) 
+ { + new (data) SpatialSplit(spatialSplit); + } + + __forceinline float splitSAH() const { + return sah; + } + + __forceinline bool valid() const { + return sah < float(inf); + } + + public: + __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)]; + bool spatial; + float sah; + }; + + /*! Performs standard object binning */ + template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS> + struct HeuristicArraySpatialSAH + { + typedef BinSplit<OBJECT_BINS> ObjectSplit; + typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner; + + typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit; + typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner; + + //typedef extended_range<size_t> Set; + typedef Split2<ObjectSplit,SpatialSplit> Split; + + static const size_t PARALLEL_THRESHOLD = 3*1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 64; + + __forceinline HeuristicArraySpatialSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info) + : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {} + + + /*! compute extended ranges */ + __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! move ranges */ + __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+right_size] = prims0[i]; + }); + } + else + { + /* no overlap, move entire right range to new location, can be made fully parallel */ + parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+left_ext_range_size] = prims0[i]; + }); + } + /* update right range */ + assert(rset.ext_end() + left_ext_range_size == set.ext_end()); + rset.move_right(left_ext_range_size); + } + } + + /*! 
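The "extended range" used throughout this builder is an over-allocated primitive array: `[begin, end)` is in use and `[end, ext_end)` is spare space for duplicated references. After a split, `setExtentedRanges()` hands the spare slots down to the children in proportion to their primitive weights, and `moveExtentedRange()` shifts the right child so the left child's spare slots are free memory directly behind its range. A scalar sketch of the proportional distribution, with illustrative names:

```
#include <algorithm>
#include <cmath>
#include <cstddef>

struct ExtRange {
    std::size_t begin, end, extEnd;        // [begin,end) in use, [end,extEnd) spare
    std::size_t size() const { return end - begin; }
};

inline void distributeExtRange(const ExtRange& parent, ExtRange& lset, ExtRange& rset,
                               std::size_t lweight, std::size_t rweight)
{
    const std::size_t spare      = parent.extEnd - parent.end;
    const float       leftFactor = float(lweight) / float(lweight + rweight);
    const std::size_t leftSpare  =
        std::min(std::size_t(std::floor(leftFactor * float(spare))), spare);
    lset.extEnd = lset.end + leftSpare;            // left child's spare slots
    rset.extEnd = rset.end + (spare - leftSpare);  // right child gets the rest
}
```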
finds the best split */ + const Split find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SplitInfo oinfo; + const ObjectSplit object_split = object_find(set,logBlockSize,oinfo); + const float object_split_sah = object_split.splitSAH(); + + if (unlikely(set.has_ext_range())) + { + const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds); + + /* do only spatial splits if the child bounds overlap */ + if (safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) && + safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds)) + { + const SpatialSplit spatial_split = spatial_find(set, logBlockSize); + const float spatial_split_sah = spatial_split.splitSAH(); + + /* valid spatial split, better SAH and number of splits do not exceed extended range */ + if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah && + spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size()) + { + return Split(spatial_split,spatial_split_sah); + } + } + } + + return Split(object_split,object_split_sah); + } + + /*! finds the best object split */ + __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info); + else return parallel_object_find (set,logBlockSize,info); + } + + /*! finds the best object split */ + __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set); + binner.bin(prims0,set.begin(),set.end(),mapping); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best split */ + __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set); + const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; }, + [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; }); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best spatial split */ + __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize); + else return parallel_spatial_find (set, logBlockSize); + } + + /*! 
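The decision logic in `find()` above can be summarized as follows: a spatial split is only attempted when the object split's children overlap enough relative to the root and the current set, and it is only accepted when it wins by at least 1% SAH and its duplicated references fit into the spare space. A sketch of that decision with the same `SPATIAL_ASPLIT_*` thresholds; the surrounding types are illustrative.

```
#include <cstddef>

struct SplitCandidate { float sah; std::size_t leftCount, rightCount; };

inline bool preferSpatialSplit(const SplitCandidate& objectSplit,
                               const SplitCandidate& spatialSplit,
                               float overlapArea, float parentArea, float rootArea,
                               std::size_t setSize, std::size_t spareSlots)
{
    if (overlapArea < 0.000005f * rootArea) return false;  // SPATIAL_ASPLIT_AREA_THRESHOLD
    if (overlapArea < 0.1f * parentArea)    return false;  // SPATIAL_ASPLIT_OVERLAP_THRESHOLD
    if (!(spatialSplit.sah < 0.99f * objectSplit.sah))     // SPATIAL_ASPLIT_SAH_THRESHOLD
        return false;
    // duplicated references must fit into the spare ("extended") space
    return spatialSplit.leftCount + spatialSplit.rightCount - setSize <= spareSlots;
}
```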
finds the best spatial split */ + __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping<SPATIAL_BINS> mapping(set); + binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping<SPATIAL_BINS> mapping(set); + const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range<size_t>& r) -> SpatialBinner { + SpatialBinner binner(empty); + binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); + return binner; }, + [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + + /*! subdivides primitives based on a spatial split */ + __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping) + { + assert(set.has_ext_range()); + const size_t max_ext_range_size = set.ext_range_size(); + const size_t ext_range_start = set.end(); + + /* atomic counter for number of primref splits */ + std::atomic<size_t> ext_elements; + ext_elements.store(0); + + const float fpos = split.mapping.pos(split.pos,split.dim); + + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + + parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin();i<r.end();i++) + { + const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */ + + //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; + //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; + //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) + if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) + { + assert(splits > 1); + + PrimRef left,right; + const auto splitter = splitterFactory(prims0[i]); + splitter(prims0[i],split.dim,fpos,left,right); + + // no empty splits + if (unlikely(left.bounds().empty() || right.bounds().empty())) continue; + + left.lower.u = (left.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + + const size_t ID = ext_elements.fetch_add(1); + + /* break if the number of subdivided elements are greater than the maximum allowed size */ + if (unlikely(ID >= max_ext_range_size)) + break; + + /* only write within the correct bounds */ + assert(ID < max_ext_range_size); + prims0[i] = left; + prims0[ext_range_start+ID] = right; + } + } + }); + + const size_t numExtElements = min(max_ext_range_size,ext_elements.load()); + assert(set.end()+numExtElements<=set.ext_end()); + set._end += numExtElements; + } + + /*! 
array partitioning */ + void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + std::pair<size_t,size_t> ext_weights(0,0); + + if (unlikely(split.spatial)) + { + create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); + + /* spatial split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset); + else + ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset); + } + else + { + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset); + else + ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset); + } + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! array partitioning */ + std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + + /*! 
array partitioning */ + __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + /*! 
array partitioning */ + __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + auto isLeft = [&] (const PrimRef &ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + void deterministic_order(const PrimInfoExtRange& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + void splitFallback(const PrimInfoExtRange& set, + PrimInfoExtRange& lset, + PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i<center; i++) { + left.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i<end; i++) { + right.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t rweight = right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const PrimitiveSplitterFactory& splitterFactory; + const CentGeomBBox3fa& root_info; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_strand_array.h b/thirdparty/embree/kernels/builders/heuristic_strand_array.h new file mode 100644 index 0000000000..19c7fcdaa8 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_strand_array.h @@ -0,0 +1,188 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! 
Performs standard object binning */ + struct HeuristicStrandSplit + { + typedef range<size_t> Set; + + static const size_t PARALLEL_THRESHOLD = 10000; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64; + + /*! stores all information to perform some split */ + struct Split + { + /*! construct an invalid split by default */ + __forceinline Split() + : sah(inf), axis0(zero), axis1(zero) {} + + /*! constructs specified split */ + __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1) + : sah(sah), axis0(axis0), axis1(axis1) {} + + /*! calculates standard surface area heuristic for the split */ + __forceinline float splitSAH() const { return sah; } + + /*! test if this split is valid */ + __forceinline bool valid() const { return sah != float(inf); } + + public: + float sah; //!< SAH cost of the split + Vec3fa axis0, axis1; //!< axis the two strands are aligned into + }; + + __forceinline HeuristicStrandSplit () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + __forceinline const Vec3fa direction(const PrimRef& prim) { + return scene->get(prim.geomID())->computeDirection(prim.primID()); + } + + __forceinline const BBox3fa bounds(const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(prim.primID()); + } + + __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(space,prim.primID()); + } + + /*! finds the best split */ + const Split find(const range<size_t>& set, size_t logBlockSize) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /* curve with minimum ID determines first axis */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const uint64_t geomprimID = prims[i].ID64(); + if (geomprimID >= bestGeomPrimID) continue; + const Vec3fa axis = direction(prims[i]); + if (sqr_length(axis) > 1E-18f) { + axis0 = normalize(axis); + bestGeomPrimID = geomprimID; + } + } + + /* find 2nd axis that is most misaligned with first axis and has minimum ID */ + float bestCos = 1.0f; + Vec3fa axis1 = axis0; + bestGeomPrimID = -1; + for (size_t i=set.begin(); i<set.end(); i++) + { + const uint64_t geomprimID = prims[i].ID64(); + Vec3fa axisi = direction(prims[i]); + float leni = length(axisi); + if (leni == 0.0f) continue; + axisi /= leni; + float cos = abs(dot(axisi,axis0)); + if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) { + bestCos = cos; axis1 = axisi; + bestGeomPrimID = geomprimID; + } + } + + /* partition the two strands */ + size_t lnum = 0, rnum = 0; + BBox3fa lbounds = empty, rbounds = empty; + const LinearSpace3fa space0 = frame(axis0).transposed(); + const LinearSpace3fa space1 = frame(axis1).transposed(); + + for (size_t i=set.begin(); i<set.end(); i++) + { + PrimRef& prim = prims[i]; + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,axis0)); + const float cos1 = abs(dot(axisi,axis1)); + + if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); } + else { rnum++; rbounds.extend(bounds(space1,prim)); } + } + + /*! return an invalid split if we do not partition */ + if (lnum == 0 || rnum == 0) + return Split(inf,axis0,axis1); + + /*! 
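The strand heuristic above picks `axis0` from the curve with the smallest geom/prim ID, picks `axis1` as the direction most misaligned with it, and then partitions hair segments by which axis they align with better. A scalar sketch of that partition; the `Vec3` type and helper names are placeholders, and the segment directions are assumed normalized.

```
#include <array>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

using Vec3 = std::array<float, 3>;

inline float dot3(const Vec3& a, const Vec3& b) { return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; }

// A segment goes to the "left" strand if its direction is better aligned with
// axis0 than with axis1; abs() because a segment has no preferred orientation.
inline bool onLeftStrand(const Vec3& dir, const Vec3& axis0, const Vec3& axis1) {
    return std::fabs(dot3(dir, axis0)) > std::fabs(dot3(dir, axis1));
}

// In-place two-way partition over segment directions; Embree partitions the
// PrimRef array itself and accumulates per-side bounds on the fly.
inline std::size_t partitionStrands(std::vector<Vec3>& dirs,
                                    const Vec3& axis0, const Vec3& axis1) {
    std::size_t mid = 0;
    for (std::size_t i = 0; i < dirs.size(); ++i)
        if (onLeftStrand(dirs[i], axis0, axis1)) std::swap(dirs[mid++], dirs[i]);
    return mid;  // directions [0, mid) align better with axis0
}
```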
calculate sah for the split */ + const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize; + const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize; + const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds)); + return Split(sah,axis0,axis1); + } + + /*! array partitioning */ + void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + + auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,split.axis0)); + const float cos1 = abs(dot(axisi,split.axis1)); + return cos0 > cos1; + }; + + auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { + pinfo.extend(bounds(ref)); + }; + + size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const Set& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend(bounds(prims[i])); + new (&lset) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend(bounds(prims[i])); + new (&rset) PrimInfoRange(center,end,right); + } + + private: + Scene* const scene; + PrimRef* const prims; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h new file mode 100644 index 0000000000..b968e01c90 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h @@ -0,0 +1,237 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/primref_mb.h" +#include "../../common/algorithms/parallel_filter.h" + +#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f + +namespace embree +{ + namespace isa + { + /*! 
Performs standard object binning */ + template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS> + struct HeuristicMBlurTemporalSplit + { + typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; + typedef mvector<PrimRefMB>* PrimRefVector; + typedef typename PrimRefMB::BBox BBox; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef) + : device(device), recalculatePrimRef(recalculatePrimRef) {} + + struct TemporalBinInfo + { + __forceinline TemporalBinInfo () { + } + + __forceinline TemporalBinInfo (EmptyTy) + { + for (size_t i=0; i<BINS-1; i++) + { + count0[i] = count1[i] = 0; + bounds0[i] = bounds1[i] = empty; + } + } + + void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) + { + for (int b=0; b<BINS-1; b++) + { + const float t = float(b+1)/float(BINS); + const float ct = lerp(time_range.lower,time_range.upper,t); + const float center_time = set.align_time(ct); + if (center_time <= time_range.lower) continue; + if (center_time >= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* find linear bounds for both time segments */ + for (size_t i=begin; i<end; i++) + { + if (prims[i].time_range_overlap(dt0)) + { + const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0); +#if MBLUR_BIN_LBBOX + bounds0[b].extend(bn0); +#else + bounds0[b].extend(bn0.interpolate(0.5f)); +#endif + count0[b] += prims[i].timeSegmentRange(dt0).size(); + } + + if (prims[i].time_range_overlap(dt1)) + { + const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1); +#if MBLUR_BIN_LBBOX + bounds1[b].extend(bn1); +#else + bounds1[b].extend(bn1.interpolate(0.5f)); +#endif + count1[b] += prims[i].timeSegmentRange(dt1).size(); + } + } + } + } + + __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) + { + if (likely(end-begin < parallelThreshold)) { + bin(prims,begin,end,time_range,set,recalculatePrimRef); + } + else + { + auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { + TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; + }; + *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2); + } + } + + /*! 
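The temporal binner above evaluates candidate split times inside the current time range; for each candidate, every primitive overlapping a half contributes its linear bounds and its number of time segments to that half. The cost accumulated per candidate in `best()` further below boils down to the sketch here, and the winning cost is then scaled by `MBLUR_TIME_SPLIT_THRESHOLD` (1.25) so a temporal split has to clearly beat the alternatives. Parameter names are illustrative.

```
#include <cstddef>

// SAH cost of splitting the time range at a candidate time: each half contributes
// halfArea(linear bounds) * ceil(timeSegments / blockSize) * length(half);
// a half with no time segments costs nothing.
inline float temporalSplitCost(float halfArea0, std::size_t segments0, float dt0,
                               float halfArea1, std::size_t segments1, float dt1,
                               std::size_t logBlockSize)
{
    const std::size_t block   = std::size_t(1) << logBlockSize;
    const std::size_t blocks0 = (segments0 + block - 1) >> logBlockSize;
    const std::size_t blocks1 = (segments1 + block - 1) >> logBlockSize;
    const float sah0 = segments0 ? halfArea0 * float(blocks0) * dt0 : 0.0f;
    const float sah1 = segments1 ? halfArea1 * float(blocks1) * dt1 : 0.0f;
    return sah0 + sah1;
}
```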
merges in other binning information */ + __forceinline void merge (const TemporalBinInfo& other) + { + for (size_t i=0; i<BINS-1; i++) + { + count0[i] += other.count0[i]; + count1[i] += other.count1[i]; + bounds0[i].extend(other.bounds0[i]); + bounds1[i].extend(other.bounds1[i]); + } + } + + static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) { + TemporalBinInfo r = a; r.merge(b); return r; + } + + Split best(int logBlockSize, BBox1f time_range, const SetMB& set) + { + float bestSAH = inf; + float bestPos = 0.0f; + for (int b=0; b<BINS-1; b++) + { + float t = float(b+1)/float(BINS); + float ct = lerp(time_range.lower,time_range.upper,t); + const float center_time = set.align_time(ct); + if (center_time <= time_range.lower) continue; + if (center_time >= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* calculate sah */ + const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size(); + float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size(); + if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time + if (unlikely(rCount == 0)) sah1 = 0.0f; + const float sah = sah0+sah1; + if (sah < bestSAH) { + bestSAH = sah; + bestPos = center_time; + } + } + return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos); + } + + public: + size_t count0[BINS-1]; + size_t count1[BINS-1]; + BBox bounds0[BINS-1]; + BBox bounds1[BINS-1]; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + assert(set.size() > 0); + TemporalBinInfo binner(empty); + binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef); + Split tsplit = binner.best((int)logBlockSize,set.time_range,set); + if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split + return tsplit; + } + + __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(tsplit.sah != float(inf)); + assert(tsplit.fpos > set.time_range.lower); + assert(tsplit.fpos < set.time_range.upper); + + float center_time = tsplit.fpos; + const BBox1f time_range0(set.time_range.lower,center_time); + const BBox1f time_range1(center_time,set.time_range.upper); + mvector<PrimRefMB>& prims = *set.prims; + + /* calculate primrefs for first time range */ + std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size())); + PrimRefVector lprims = new_vector.get(); + + auto reduction_func0 = [&] (const range<size_t>& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i<r.end(); i++) + { + if (likely(prims[i].time_range_overlap(time_range0))) + { + const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0); + (*lprims)[i-set.begin()] = prim; + pinfo.add_primref(prim); + } + else + { + (*lprims)[i-set.begin()] = prims[i]; + } + } + return pinfo; + }; + PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2); + + /* primrefs for first time range are in lprims[0 .. 
set.size()) */ + /* some primitives may need to be filtered out */ + if (linfo.size() != set.size()) + linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); }); + + lset = SetMB(linfo,lprims,time_range0); + + /* calculate primrefs for second time range */ + auto reduction_func1 = [&] (const range<size_t>& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i<r.end(); i++) + { + if (likely(prims[i].time_range_overlap(time_range1))) + { + const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1); + prims[i] = prim; + pinfo.add_primref(prim); + } + } + return pinfo; + }; + PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2); + rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size()); + + /* primrefs for second time range are in prims[set.begin() .. set.end()) */ + /* some primitives may need to be filtered out */ + if (rinfo.size() != set.size()) + rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); }); + + rset = SetMB(rinfo,&prims,time_range1); + + return new_vector; + } + + private: + MemoryMonitorInterface* device; // device to report memory usage to + const RecalculatePrimRef recalculatePrimRef; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/priminfo.h b/thirdparty/embree/kernels/builders/priminfo.h new file mode 100644 index 0000000000..fee515247a --- /dev/null +++ b/thirdparty/embree/kernels/builders/priminfo.h @@ -0,0 +1,362 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + // FIXME: maybe there's a better place for this util fct + __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2) + { + const Vec3fa e0 = v1-v0; + const Vec3fa e1 = v2-v0; + const Vec3fa d = cross(e0,e1); + return fabs(d.x) + fabs(d.y) + fabs(d.z); + } + + //namespace isa + //{ + template<typename BBox> + class CentGeom + { + public: + __forceinline CentGeom () {} + + __forceinline CentGeom (EmptyTy) + : geomBounds(empty), centBounds(empty) {} + + __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) + : geomBounds(geomBounds), centBounds(centBounds) {} + + template<typename PrimRef> + __forceinline void extend_primref(const PrimRef& prim) + { + BBox bounds; Vec3fa center; + prim.binBoundsAndCenter(bounds,center); + geomBounds.extend(bounds); + centBounds.extend(center); + } + + template<typename PrimRef> + __forceinline void extend_center2(const PrimRef& prim) + { + BBox3fa bounds = prim.bounds(); + geomBounds.extend(bounds); + centBounds.extend(bounds.center2()); + } + + __forceinline void extend(const BBox& geomBounds_) { + geomBounds.extend(geomBounds_); + centBounds.extend(center2(geomBounds_)); + } + + __forceinline void merge(const CentGeom& other) + { + geomBounds.extend(other.geomBounds); + centBounds.extend(other.centBounds); + } + + static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) { + CentGeom r = a; r.merge(b); return r; + } + + public: + BBox geomBounds; //!< geometry bounds of primitives + BBox3fa centBounds; //!< centroid bounds of primitives + }; + + 
typedef CentGeom<BBox3fa> CentGeomBBox3fa; + + /*! stores bounding information for a set of primitives */ + template<typename BBox> + class PrimInfoT : public CentGeom<BBox> + { + public: + using CentGeom<BBox>::geomBounds; + using CentGeom<BBox>::centBounds; + + __forceinline PrimInfoT () {} + + __forceinline PrimInfoT (EmptyTy) + : CentGeom<BBox>(empty), begin(0), end(0) {} + + __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {} + + template<typename PrimRef> + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom<BBox>::extend_primref(prim); + end++; + } + + template<typename PrimRef> + __forceinline void add_center2(const PrimRef& prim) { + CentGeom<BBox>::extend_center2(prim); + end++; + } + + template<typename PrimRef> + __forceinline void add_center2(const PrimRef& prim, const size_t i) { + CentGeom<BBox>::extend_center2(prim); + end+=i; + } + + /*__forceinline void add(const BBox& geomBounds_) { + CentGeom<BBox>::extend(geomBounds_); + end++; + } + + __forceinline void add(const BBox& geomBounds_, const size_t i) { + CentGeom<BBox>::extend(geomBounds_); + end+=i; + }*/ + + __forceinline void merge(const PrimInfoT& other) + { + CentGeom<BBox>::merge(other); + begin += other.begin; + end += other.end; + } + + static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) { + PrimInfoT r = a; r.merge(b); return r; + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return end-begin; + } + + __forceinline float halfArea() { + return expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + //return halfArea(geomBounds)*blocks(num); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + //return halfArea(geomBounds)*float((num+3) >> 2); + //return halfArea(geomBounds)*blocks(num); + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) { + return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}"; + } + + public: + size_t begin,end; //!< number of primitives + }; + + typedef PrimInfoT<BBox3fa> PrimInfo; + //typedef PrimInfoT<LBBox3fa> PrimInfoMB; + + /*! 
stores bounding information for a set of primitives */ + template<typename BBox> + class PrimInfoMBT : public CentGeom<BBox> + { + public: + using CentGeom<BBox>::geomBounds; + using CentGeom<BBox>::centBounds; + + __forceinline PrimInfoMBT () { + } + + __forceinline PrimInfoMBT (EmptyTy) + : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + __forceinline PrimInfoMBT (size_t begin, size_t end) + : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + template<typename PrimRef> + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom<BBox>::extend_primref(prim); + time_range.extend(prim.time_range); + object_range._end++; + num_time_segments += prim.size(); + if (max_num_time_segments < prim.totalTimeSegments()) { + max_num_time_segments = prim.totalTimeSegments(); + max_time_range = prim.time_range; + } + } + + __forceinline void merge(const PrimInfoMBT& other) + { + CentGeom<BBox>::merge(other); + time_range.extend(other.time_range); + object_range._begin += other.object_range.begin(); + object_range._end += other.object_range.end(); + num_time_segments += other.num_time_segments; + if (max_num_time_segments < other.max_num_time_segments) { + max_num_time_segments = other.max_num_time_segments; + max_time_range = other.max_time_range; + } + } + + static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { + PrimInfoMBT r = a; r.merge(b); return r; + } + + __forceinline size_t begin() const { + return object_range.begin(); + } + + __forceinline size_t end() const { + return object_range.end(); + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return object_range.size(); + } + + __forceinline float halfArea() const { + return time_range.size()*expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); + } + + __forceinline float leafSAH(size_t block_shift) const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift); + } + + __forceinline float align_time(float ct) const + { + //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments); + float t0 = (ct-max_time_range.lower)/max_time_range.size(); + float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); + return t1*max_time_range.size()+max_time_range.lower; + } + + /*! 
stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) + { + return cout << "PrimInfo { " << + "object_range = " << pinfo.object_range << + ", time_range = " << pinfo.time_range << + ", time_segments = " << pinfo.num_time_segments << + ", geomBounds = " << pinfo.geomBounds << + ", centBounds = " << pinfo.centBounds << + "}"; + } + + public: + range<size_t> object_range; //!< primitive range + size_t num_time_segments; //!< total number of time segments of all added primrefs + size_t max_num_time_segments; //!< maximum number of time segments of a primitive + BBox1f max_time_range; //!< time range of primitive with max_num_time_segments + BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB + }; + + typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB; + + struct SetMB : public PrimInfoMB + { + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + typedef mvector<PrimRefMB>* PrimRefVector; + + __forceinline SetMB() {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims) + : PrimInfoMB(pinfo_i), prims(prims) {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + object_range = object_range_in; + time_range = intersect(time_range,time_range_in); + } + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + time_range = intersect(time_range,time_range_in); + } + + void deterministic_order() const + { + /* required as parallel partition destroys original primitive order */ + PrimRefMB* prim = prims->data(); + std::sort(&prim[object_range.begin()],&prim[object_range.end()]); + } + + template<typename RecalculatePrimRef> + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const + { + auto reduce = [&](const range<size_t>& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template<typename RecalculatePrimRef> + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto reduce = [&](const range<size_t>& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template<typename RecalculatePrimRef> + const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB 
+ { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space); + pinfo.add_primref(ref1); + }; + return pinfo; + }; + + const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, + PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2); + + return SetMB(pinfo,prims,object_range,time_range); + } + + public: + PrimRefVector prims; + }; +//} +} diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp new file mode 100644 index 0000000000..d279dc4993 --- /dev/null +++ b/thirdparty/embree/kernels/builders/primrefgen.cpp @@ -0,0 +1,312 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primrefgen.h" +#include "primrefgen_presplit.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState<PrimInfo> pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,true); 
+ + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1) + { + ParallelForForPrefixSumState<PrimInfoMB> pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + } + + /* the BVH starts with that time range, even though primitives might have smaller/larger time range */ + pinfo.time_range = t0t1; + return pinfo; + } + + template<typename Mesh> + size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor) + { + size_t numPrimitives = morton.size(); + + /* compute scene bounds */ + std::pair<size_t,BBox3fa> cb_empty(0,empty); + auto cb = parallel_reduce + ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa> + { + size_t num = 0; + BBox3fa bounds = empty; + + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa prim_bounds = empty; + if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue; + bounds.extend(center2(prim_bounds)); + num++; + } + return std::make_pair(num,bounds); + }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) { + return std::make_pair(a.first + b.first,merge(a.second,b.second)); + }); + + + size_t numPrimitivesGen = cb.first; + const BBox3fa centBounds = cb.second; + + /* compute morton codes */ + if (likely(numPrimitivesGen == numPrimitives)) + { + /* fast path if all primitives were valid */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t 
j=r.begin(); j<r.end(); j++) + generator(mesh->bounds(j),unsigned(j)); + }); + } + else + { + /* slow path, fallback in case some primitives were invalid */ + ParallelPrefixSumState<size_t> pstate; + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (unlikely(!mesh->buildBounds(j,&bounds))) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus<size_t>()); + + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(j,&bounds)) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus<size_t>()); + } + return numPrimitivesGen; + } + + // ==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + // special variants for grid meshes + +// -- GODOT start -- +#if defined(EMBREE_GEOMETRY_GRID) +// -- GODOT end -- + PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) + { + PrimInfo pinfo(empty); + size_t numPrimitives = 0; + + /* first run to get #primitives */ + + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,false> iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j); + if (!mesh->valid(j)) continue; + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + 
}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + assert(pinfo.size() == numPrimitives); + return pinfo; + } + + PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) + { + unsigned int geomID_ = std::numeric_limits<unsigned int>::max (); + + PrimInfo pinfo(empty); + size_t numPrimitives = 0; + + ParallelPrefixSumState<PrimInfo> pstate; + /* iterate over all grids in a single mesh */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,geomID_,unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo + { + + size_t p_index = base.size(); + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,geomID_,unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + return pinfo; + } +// -- GODOT start -- +#endif +// -- GODOT end -- + + // ==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + } +} diff --git a/thirdparty/embree/kernels/builders/primrefgen.h b/thirdparty/embree/kernels/builders/primrefgen.h new file mode 100644 index 0000000000..c09a848ba3 --- /dev/null +++ b/thirdparty/embree/kernels/builders/primrefgen.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" 
+#include "../common/primref.h" +#include "../common/primref_mb.h" +#include "priminfo.h" +#include "bvh_builder_morton.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0); + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)); + + template<typename Mesh> + size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor); + + /* special variants for grids */ + PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids); + + PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids); + + } +} + diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h new file mode 100644 index 0000000000..8cd251ddd2 --- /dev/null +++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h @@ -0,0 +1,371 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../builders/primrefgen.h" +#include "../builders/heuristic_spatial.h" +#include "../builders/splitter.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define DBG_PRESPLIT(x) +#define CHECK_PRESPLIT(x) + +#define GRID_SIZE 1024 +#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 +#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG) +#define PRIORITY_CUTOFF_THRESHOLD 1.0f +#define PRIORITY_SPLIT_POS_WEIGHT 1.5f + +namespace embree +{ + namespace isa + { + + struct PresplitItem + { + union { + float priority; + unsigned int data; + }; + unsigned int index; + + __forceinline operator unsigned() const + { + return reinterpret_cast<const unsigned&>(priority); + } + __forceinline bool operator < (const PresplitItem& item) const + { + return (priority < item.priority); + } + + template<typename Mesh> + __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) + { + const unsigned int geomID = ref.geomID(); + const unsigned int primID = ref.primID(); + const float area_aabb = area(ref.bounds()); + const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); + const unsigned int diff = 31 - lzcnt(mc.x^mc.y); + assert(area_prim <= area_aabb); + //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); + const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); + assert(priority >= 0.0f && priority < FLT_LARGE); + return priority; + } + + + }; + + inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { + return cout << "index " << item.index << " priority " << item.priority; + }; + + template<typename SplitterFactory> + void splitPrimitive(SplitterFactory 
&Splitter, + const PrimRef &prim, + const unsigned int geomID, + const unsigned int primID, + const unsigned int split_level, + const Vec3fa &grid_base, + const float grid_scale, + const float grid_extend, + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], + unsigned int& numSubPrims) + { + assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + if (split_level == 0) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + } + else + { + const Vec3fa lower = prim.lower; + const Vec3fa upper = prim.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + + /* if all bits are equal then we cannot split */ + if(unlikely(lower_code == upper_code)) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + return; + } + + /* compute octree level and dimension to perform the split in */ + const unsigned int diff = 31 - lzcnt(lower_code^upper_code); + const unsigned int level = diff / 3; + const unsigned int dim = diff % 3; + + /* now we compute the grid position of the split */ + const unsigned int isplit = iupper[dim] & ~((1<<level)-1); + + /* compute world space position of split */ + const float inv_grid_size = 1.0f / GRID_SIZE; + const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend; + + assert(prim.lower[dim] <= fsplit && + prim.upper[dim] >= fsplit); + + /* split primitive */ + const auto splitter = Splitter(prim); + BBox3fa left,right; + splitter(prim.bounds(),dim,fsplit,left,right); + assert(!left.empty()); + assert(!right.empty()); + + + splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + } + } + + + template<typename Mesh, typename SplitterFactory> + PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState<PrimInfo> pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float 
grid_scale, const PrimRef &ref) + { + const Vec3fa lower = ref.lower; + const Vec3fa upper = ref.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + return Vec2i(lower_code,upper_code); + } + + template<typename Mesh, typename SplitterFactory> + PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + static const size_t MIN_STEP_SIZE = 128; + + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + + /* use correct number of primitives */ + size_t numPrimitives = pinfo.size(); + const size_t alloc_numPrimitives = prims.size(); + const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; + + /* set up primitive splitter */ + SplitterFactory Splitter(scene); + + + DBG_PRESPLIT( + const size_t org_numPrimitives = pinfo.size(); + PRINT(numPrimitives); + PRINT(alloc_numPrimitives); + PRINT(numSplitPrimitivesBudget); + ); + + /* allocate double buffer presplit items */ + const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; + PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + + /* compute grid */ + const Vec3fa grid_base = pinfo.geomBounds.lower; + const Vec3fa grid_diag = pinfo.geomBounds.size(); + const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); + const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; + + /* init presplit items and get total sum */ + const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float { + float sum = 0.0f; + for (size_t i=r.begin(); i<r.end(); i++) + { + presplitItem[i].index = (unsigned int)i; + const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]); + /* if all bits are equal then we cannot split */ + presplitItem[i].priority = (mc.x != mc.y) ? 
PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f; + /* FIXME: sum undeterministic */ + sum += presplitItem[i].priority; + } + return sum; + },[](const float& a, const float& b) -> float { return a+b; }); + + /* compute number of splits per primitive */ + const float inv_psum = 1.0f / psum; + parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { + for (size_t i=r.begin(); i<r.end(); i++) + { + if (presplitItem[i].priority > 0.0f) + { + const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; + if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims + { + presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); + //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + } + else + presplitItem[i].priority = 0.0f; + } + } + }); + + auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; + size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); + + /* anything to split ? */ + if (center < numPrimitives) + { + const size_t numPrimitivesToSplit = numPrimitives - center; + assert(presplitItem[center].priority >= 1.0f); + + /* sort presplit items in ascending order */ + radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); + + CHECK_PRESPLIT( + parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { + for (size_t i=r.begin(); i<r.end(); i++) + assert(presplitItem[i-1].priority <= presplitItem[i].priority); + }); + ); + + unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem; + unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; + + /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ + const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { + size_t sum = 0; + for (size_t i=t.begin(); i<t.end(); i++) + { + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; + assert(presplitItem[i].priority >= 1.0f); + const unsigned int primrefID = presplitItem[i].index; + const float prio = presplitItem[i].priority; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = (unsigned int)prio; + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + assert(numSubPrims); + numSubPrims--; // can reuse slot + sum+=numSubPrims; + presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; + primOffset0[i-center] = numSubPrims; + } + return sum; + },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); + + /* if we are over budget, need to shrink the range */ + if (totalNumSubPrims > numSplitPrimitivesBudget) + { + size_t new_center = numPrimitives-1; + size_t sum = 0; + for (;new_center>=center;new_center--) + { + const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; + if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; + sum += 
numSubPrims; + } + new_center++; + center = new_center; + } + + /* parallel prefix sum to compute offsets for storing sub-primitives */ + const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); + + /* iterate over range, and split primitives into sub primitives and append them to prims array */ + parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { + for (size_t j=rn.begin(); j<rn.end(); j++) + { + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; + const unsigned int primrefID = presplitItem[j].index; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1); + + assert(split_levels); + assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + const size_t newID = numPrimitives + primOffset1[j-center]; + assert(newID+numSubPrims <= alloc_numPrimitives); + prims[primrefID] = subPrims[0]; + for (size_t i=1;i<numSubPrims;i++) + prims[newID+i-1] = subPrims[i]; + } + }); + + numPrimitives += offset; + DBG_PRESPLIT( + PRINT(pinfo.size()); + PRINT(numPrimitives); + PRINT((float)numPrimitives/org_numPrimitives)); + } + + /* recompute centroid bounding boxes */ + pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo { + PrimInfo p(empty); + for (size_t j=r.begin(); j<r.end(); j++) + p.add_center2(prims[j]); + return p; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + + /* free double buffer presplit items */ + alignedFree(tmp_presplitItem); + alignedFree(presplitItem); + return pinfo; + } + } +} diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h new file mode 100644 index 0000000000..f7720bd284 --- /dev/null +++ b/thirdparty/embree/kernels/builders/splitter.h @@ -0,0 +1,191 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" + +namespace embree +{ + namespace isa + { + template<size_t N> + __forceinline void splitPolygon(const BBox3fa& bounds, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + const Vec3fa (&inv_length)[N], + BBox3fa& left_o, + BBox3fa& right_o) + { + BBox3fa left = empty, right = empty; + /* clip triangle to left and right box by processing all edges */ + for (size_t i=0; i<N; i++) + { + const Vec3fa &v0 = v[i]; + const Vec3fa &v1 = v[i+1]; + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + if (v0d <= pos) left. 
extend(v0); // this point is on left side + if (v0d >= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + left_o = intersect(left,bounds); + right_o = intersect(right,bounds); + } + + template<size_t N> + __forceinline void splitPolygon(const PrimRef& prim, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + PrimRef& left_o, + PrimRef& right_o) + { + BBox3fa left = empty, right = empty; + for (size_t i=0; i<N; i++) + { + const Vec3fa &v0 = v[i]; + const Vec3fa &v1 = v[i+1]; + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + if (v0d <= pos) left. extend(v0); // this point is on left side + if (v0d >= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const float inv_length = 1.0f/(v1d-v0d); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID()); + new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID()); + } + + struct TriangleSplitter + { + __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask ); + TriangleMesh::Triangle tri = mesh->triangle(prim.primID()); + v[0] = mesh->vertex(tri.v[0]); + v[1] = mesh->vertex(tri.v[1]); + v[2] = mesh->vertex(tri.v[2]); + v[3] = mesh->vertex(tri.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<3>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[4]; + Vec3fa inv_length[3]; + }; + + struct TriangleSplitterFactory + { + __forceinline TriangleSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline TriangleSplitter operator() (const PrimRef& prim) const { + return TriangleSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + struct QuadSplitter + { + __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask ); + QuadMesh::Quad quad = mesh->quad(prim.primID()); + v[0] = mesh->vertex(quad.v[0]); + v[1] = mesh->vertex(quad.v[1]); + v[2] = mesh->vertex(quad.v[2]); + v[3] = mesh->vertex(quad.v[3]); + v[4] = mesh->vertex(quad.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); + inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); + } + + __forceinline void operator() (const PrimRef& 
prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<4>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[5]; + Vec3fa inv_length[4]; + }; + + struct QuadSplitterFactory + { + __forceinline QuadSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline QuadSplitter operator() (const PrimRef& prim) const { + return QuadSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + + struct DummySplitter + { + __forceinline DummySplitter(const Scene* scene, const PrimRef& prim) + { + } + }; + + struct DummySplitterFactory + { + __forceinline DummySplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline DummySplitter operator() (const PrimRef& prim) const { + return DummySplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + } +} + diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp new file mode 100644 index 0000000000..a84295f0da --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh.cpp @@ -0,0 +1,190 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" + +namespace embree +{ + template<int N> + BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene) + : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN), + primTy(&primTy), device(scene->device), scene(scene), + root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0) + { + } + + template<int N> + BVHN<N>::~BVHN () + { + for (size_t i=0; i<objects.size(); i++) + delete objects[i]; + } + + template<int N> + void BVHN<N>::clear() + { + set(BVHN::emptyNode,empty,0); + alloc.clear(); + } + + template<int N> + void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives) + { + this->root = root; + this->bounds = bounds; + this->numPrimitives = numPrimitives; + } + + template<int N> + void BVHN<N>::clearBarrier(NodeRef& node) + { + if (node.isBarrier()) + node.clearBarrier(); + else if (!node.isLeaf()) { + BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH + for (size_t c=0; c<N; c++) + clearBarrier(n->child(c)); + } + } + + template<int N> + void BVHN<N>::layoutLargeNodes(size_t num) + { +#if defined(__64BIT__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues + struct NodeArea + { + __forceinline NodeArea() {} + + __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds) + : node(&node), A(node.isLeaf() ? 
float(neg_inf) : area(bounds)) {} + + __forceinline bool operator< (const NodeArea& other) const { + return this->A < other.A; + } + + NodeRef* node; + float A; + }; + std::vector<NodeArea> lst; + lst.reserve(num); + lst.push_back(NodeArea(root,empty)); + + while (lst.size() < num) + { + std::pop_heap(lst.begin(), lst.end()); + NodeArea n = lst.back(); lst.pop_back(); + if (!n.node->isAABBNode()) break; + AABBNode* node = n.node->getAABBNode(); + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVHN::emptyNode) continue; + lst.push_back(NodeArea(node->child(i),node->bounds(i))); + std::push_heap(lst.begin(), lst.end()); + } + } + + for (size_t i=0; i<lst.size(); i++) + lst[i].node->setBarrier(); + + root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator()); +#endif + } + + template<int N> + typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator) + { + if (node.isBarrier()) { + node.clearBarrier(); + return node; + } + else if (node.isAABBNode()) + { + AABBNode* oldnode = node.getAABBNode(); + AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment); + *newnode = *oldnode; + for (size_t c=0; c<N; c++) + newnode->child(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator); + return encodeNode(newnode); + } + else return node; + } + + template<int N> + double BVHN<N>::preBuild(const std::string& builderName) + { + if (builderName == "") + return inf; + + if (device->verbosity(2)) + { + Lock<MutexSys> lock(g_printMutex); + std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush; + } + + double t0 = 0.0; + if (device->benchmark || device->verbosity(2)) t0 = getSeconds(); + return t0; + } + + template<int N> + void BVHN<N>::postBuild(double t0) + { + if (t0 == double(inf)) + return; + + double dt = 0.0; + if (device->benchmark || device->verbosity(2)) + dt = getSeconds()-t0; + + std::unique_ptr<BVHNStatistics<N>> stat; + + /* print statistics */ + if (device->verbosity(2)) + { + if (!stat) stat.reset(new BVHNStatistics<N>(this)); + const size_t usedBytes = alloc.getUsedBytes(); + Lock<MutexSys> lock(g_printMutex); + std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl; + + if (device->verbosity(2)) + std::cout << stat->str(); + + if (device->verbosity(2)) + { + FastAllocator::AllStatistics stat(&alloc); + for (size_t i=0; i<objects.size(); i++) + if (objects[i]) + stat = stat + FastAllocator::AllStatistics(&objects[i]->alloc); + + stat.print(numPrimitives); + } + + if (device->verbosity(3)) + { + alloc.print_blocks(); + for (size_t i=0; i<objects.size(); i++) + if (objects[i]) + objects[i]->alloc.print_blocks(); + } + + std::cout << std::flush; + } + + /* benchmark mode */ + if (device->benchmark) + { + if (!stat) stat.reset(new BVHNStatistics<N>(this)); + Lock<MutexSys> lock(g_printMutex); + std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush; + } + } + +#if defined(__AVX__) + template class BVHN<8>; +#endif + +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) + template class BVHN<4>; +#endif +} + diff --git 
a/thirdparty/embree/kernels/bvh/bvh.h b/thirdparty/embree/kernels/bvh/bvh.h new file mode 100644 index 0000000000..565eec5a58 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh.h @@ -0,0 +1,235 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* include all node types */ +#include "bvh_node_aabb.h" +#include "bvh_node_aabb_mb.h" +#include "bvh_node_aabb_mb4d.h" +#include "bvh_node_obb.h" +#include "bvh_node_obb_mb.h" +#include "bvh_node_qaabb.h" + +namespace embree +{ + /*! flags used to enable specific node types in intersectors */ + enum BVHNodeFlags + { + BVH_FLAG_ALIGNED_NODE = 0x00001, + BVH_FLAG_ALIGNED_NODE_MB = 0x00010, + BVH_FLAG_UNALIGNED_NODE = 0x00100, + BVH_FLAG_UNALIGNED_NODE_MB = 0x01000, + BVH_FLAG_QUANTIZED_NODE = 0x100000, + BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000, + + /* short versions */ + BVH_AN1 = BVH_FLAG_ALIGNED_NODE, + BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB, + BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_UN1 = BVH_FLAG_UNALIGNED_NODE, + BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB, + BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE, + BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_QN1 = BVH_FLAG_QUANTIZED_NODE + }; + + /*! Multi BVH with N children. Each node stores the bounding box of + * it's N children as well as N child references. */ + template<int N> + class BVHN : public AccelData + { + ALIGNED_CLASS_(16); + public: + + /*! forward declaration of node ref type */ + typedef NodeRefPtr<N> NodeRef; + typedef BaseNode_t<NodeRef,N> BaseNode; + typedef AABBNode_t<NodeRef,N> AABBNode; + typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB; + typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D; + typedef OBBNode_t<NodeRef,N> OBBNode; + typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB; + typedef QuantizedBaseNode_t<N> QuantizedBaseNode; + typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB; + typedef QuantizedNode_t<NodeRef,N> QuantizedNode; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! Empty node */ + static const size_t emptyNode = NodeRef::emptyNode; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = NodeRef::invalidNode; + static const size_t popRay = NodeRef::popRay; + + /*! Maximum depth of the BVH. */ + static const size_t maxBuildDepth = 32; + static const size_t maxBuildDepthLeaf = maxBuildDepth+8; + static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks; + + public: + + /*! Builder interface to create allocator */ + struct CreateAlloc : public FastAllocator::Create { + __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} + }; + + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + public: + + /*! BVHN default constructor. */ + BVHN (const PrimitiveType& primTy, Scene* scene); + + /*! BVHN destruction */ + ~BVHN (); + + /*! clears the acceleration structure */ + void clear(); + + /*! 
sets BVH members after build */ + void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives); + + /*! Clears the barrier bits of a subtree. */ + void clearBarrier(NodeRef& node); + + /*! lays out num large nodes of the BVH */ + void layoutLargeNodes(size_t num); + NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator); + + /*! called by all builders before build starts */ + double preBuild(const std::string& builderName); + + /*! called by all builders after build ended */ + void postBuild(double t0); + + /*! allocator class */ + struct Allocator { + BVHN* bvh; + Allocator (BVHN* bvh) : bvh(bvh) {} + __forceinline void* operator() (size_t bytes) const { + return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); + } + }; + + /*! post build cleanup */ + void cleanup() { + alloc.cleanup(); + } + + public: + + /*! Encodes a node */ + static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); } + static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); } + + public: + + /*! Prefetches the node this reference points to */ + __forceinline static void prefetch(const NodeRef ref, int types=0) + { +#if defined(__AVX512PF__) // MIC + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL2(((char*)ref.ptr)+2*64); + prefetchL2(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* KNL still needs L2 prefetches for large nodes */ + prefetchL2(((char*)ref.ptr)+4*64); + prefetchL2(((char*)ref.ptr)+5*64); + prefetchL2(((char*)ref.ptr)+6*64); + prefetchL2(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + prefetchL2(((char*)ref.ptr)+2*64); + } +#else + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL1(((char*)ref.ptr)+2*64); + prefetchL1(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* deactivate for large nodes on Xeon, as it introduces regressions */ + //prefetchL1(((char*)ref.ptr)+4*64); + //prefetchL1(((char*)ref.ptr)+5*64); + //prefetchL1(((char*)ref.ptr)+6*64); + //prefetchL1(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + prefetchL1(((char*)ref.ptr)+2*64); + } +#endif + } + + __forceinline static void prefetchW(const NodeRef ref, int types=0) + { + embree::prefetchEX(((char*)ref.ptr)+0*64); + embree::prefetchEX(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+2*64); + embree::prefetchEX(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > 
BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+4*64); + embree::prefetchEX(((char*)ref.ptr)+5*64); + embree::prefetchEX(((char*)ref.ptr)+6*64); + embree::prefetchEX(((char*)ref.ptr)+7*64); + } + } + + /*! bvh type information */ + public: + const PrimitiveType* primTy; //!< primitive type stored in the BVH + + /*! bvh data */ + public: + Device* device; //!< device pointer + Scene* scene; //!< scene pointer + NodeRef root; //!< root node + FastAllocator alloc; //!< allocator used to allocate nodes + + /*! statistics data */ + public: + size_t numPrimitives; //!< number of primitives the BVH is built over + size_t numVertices; //!< number of vertices the BVH references + + /*! data arrays for special builders */ + public: + std::vector<BVHN*> objects; + vector_t<char,aligned_allocator<char,32>> subdiv_patches; + }; + + typedef BVHN<4> BVH4; + typedef BVHN<8> BVH8; +} diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp new file mode 100644 index 0000000000..890d5e7b7c --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp @@ -0,0 +1,1325 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh4_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); +
DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + 
DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + 
DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + 
DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + + BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void BVH4Factory::selectBuilders(int features) + { + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelInstanceSAH)); + + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderSAH)); + 
IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4SubdivPatch1BuilderSAH)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4SubdivPatch1MBBuilderSAH)); + } + + void BVH4Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4iMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8iMB)); + + /* select intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4iIntersector1Pluecker)); + + 
IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector1Moeller)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,QBVH4Triangle4iIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,QBVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector1)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector1)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector4HybridMoeller)); + 
IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector4)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector4)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector4Chunk)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector8HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector8HybridMoeller)); + 
IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector8)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector8)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridMBIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iIntersector16HybridMoeller)); + 
IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512(features,BVH4SubdivPatch1Intersector16)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512(features,BVH4SubdivPatch1MBIntersector16)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH4VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH4VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridMBIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + 
intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::ROBUST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker(); + 
intersectors.intersectorN = BVH4Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = 
BVH4Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller(); + 
intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH4VirtualIntersectorStream(); +#endif + intersectors.collider = BVH4ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH4InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1Intersector1(); +#if defined (EMBREE_RAY_PACKETS) + 
intersectors.intersector4 = BVH4SubdivPatch1Intersector4(); + intersectors.intersector8 = BVH4SubdivPatch1Intersector8(); + intersectors.intersector16 = BVH4SubdivPatch1Intersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1MBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1MBIntersector4(); + intersectors.intersector8 = BVH4SubdivPatch1MBIntersector8(); + intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8i>"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4v::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4iMB>"); + + return 
new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4v::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = 
BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4i>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default" ) { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4iMB>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + 
case BuildVariant::STATIC : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4vMB::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4vMB>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4vMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "sah" ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->quad_builder == "dynamic" ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + 
case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4<Quad4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel); + Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel); + Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH4* accel = new BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene) + { + BVH4* accel = new BVH4(Object::type,scene); + 
Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel); + Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + // Builder* builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH4GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH4GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new 
BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4<GridMesh>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4MB<GridMesh>"); + return new AccelInstance(accel,builder,intersectors); + } + +} diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.h b/thirdparty/embree/kernels/bvh/bvh4_factory.h new file mode 100644 index 0000000000..30973971a4 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh4_factory.h @@ -0,0 +1,316 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! BVH4 instantiations */ + class BVH4Factory : public BVHFactory + { + public: + BVH4Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH4Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST); + Accel* BVH4Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4QuantizedTriangle4i(Scene* scene); + Accel* BVH4QuantizedQuad4i(Scene* scene); + + Accel* BVH4SubdivPatch1(Scene* scene); + Accel* 
BVH4SubdivPatch1MB(Scene* scene); + + Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4UserGeometryMB(Scene* scene); + + Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh); + Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh); + Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh); + Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh); + Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + private: + + DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + 
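+    // Note: the DEFINE_SYMBOL2 entries in this private section declare the per-width
+    // intersector entry points (single-ray, 4/8/16-wide packet, and stream variants)
+    // from which the BVH4*Intersectors() helpers above assemble each Accel::Intersectors
+    // set; the packet/stream entries are only wired up when EMBREE_RAY_PACKETS is defined.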
DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + 
DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + 
DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + 
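+    // Note: each DEFINE_ISA_FUNCTION in this section declares a builder entry point;
+    // the concrete, ISA-specific implementation is bound in selectBuilders() from the
+    // CPU features passed to the factory constructor. COMMA stands in for ',' so that
+    // multi-parameter signatures survive macro expansion.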
DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // spatial scene builder + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp new file mode 100644 index 0000000000..d4521af241 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp @@ -0,0 +1,1165 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8 + +#if defined (EMBREE_TARGET_SIMD8) + +#include "bvh8_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + 
DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + 
DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + 
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA 
size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + BVH8Factory::BVH8Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void BVH8Factory::selectBuilders(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4SceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4iMBSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER 
(SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelInstanceSAH)); + } + + void BVH8Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8iMB)); + + /* select intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector1Woop)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iMBIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iMBIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Triangle4iIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Triangle4Intersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Quad4iIntersector1Pluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector4Hybrid)); + 
IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector4Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector8HybridMoeller)); + 
IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector8HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridMoellerNoFilter)); + 
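+    // Note: the 16-wide hybrid intersectors in this block are only bound from the
+    // AVX-512 ISA (SELECT_SYMBOL_INIT_AVX512), whereas the 4- and 8-wide variants are
+    // selected from the AVX/AVX2/AVX-512 builds; this whole packet/stream selection is
+    // compiled out unless EMBREE_RAY_PACKETS is defined.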
IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH8VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH8VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + + SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8Hybrid(); 
+ intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH8Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH8Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH8Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH8Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH8Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH8Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH8Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; +#define ENABLE_WOOP_TEST 0 +#if ENABLE_WOOP_TEST == 0 + //assert(ivariant == IntersectVariant::ROBUST); + intersectors.intersector1 = BVH8Triangle4vIntersector1Pluecker(); +#else + intersectors.intersector1 = BVH8Triangle4vIntersector1Woop(); +#endif + +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, 
IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); 
+#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH8Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH8Quad4vIntersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH8Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH8Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH8Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH8Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH8Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; 
+ intersectors.intersector1 = BVH8Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH8VirtualIntersectorStream(); +#endif + intersectors.collider = BVH8ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH8InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8v::type,scene); + Accel::Intersectors intersectors = 
BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant); + Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant); + Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4Intersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4v::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: 
assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { // FIXME: implement + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4vMB::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4vMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4iIntersectors(accel); + Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel); + Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "dynamic" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else if (scene->device->quad_builder == "morton" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true); + 
else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel); + Builder* builder = nullptr; + if (scene->device->quad_builder == "default" ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8<Quad4i>"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel); + Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0); + return new 
AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);; break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH8GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH8GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = nullptr; + intersectors.intersector8 = nullptr; + intersectors.intersector16 = nullptr; + intersectors.intersectorN = nullptr; +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder == "default") { + builder = BVH8GridSceneBuilderSAH(accel,scene,0); + } + else 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<GridMesh>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder_mb == "default") { + builder = BVH8GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB<GridMesh>"); + return new AccelInstance(accel,builder,intersectors); + } +} + +#endif diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.h b/thirdparty/embree/kernels/bvh/bvh8_factory.h new file mode 100644 index 0000000000..198d6f1df0 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh8_factory.h @@ -0,0 +1,280 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! BVH8 instantiations */ + class BVH8Factory : public BVHFactory + { + public: + BVH8Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant); + Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH8Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8QuantizedTriangle4i(Scene* scene); + Accel* BVH8QuantizedTriangle4(Scene* scene); + Accel* BVH8QuantizedQuad4i(Scene* scene); + + Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8UserGeometryMB(Scene* scene); + + Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* 
leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh); + Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh); + Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh); + Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh); + Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + private: + DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + 
DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + 
DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + 
DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // SAH spatial scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder.cpp b/thirdparty/embree/kernels/bvh/bvh_builder.cpp new file mode 100644 index 0000000000..161d01bb5c --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder.cpp @@ -0,0 +1,60 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder.h" + +namespace embree +{ + namespace isa + { + template<int N> + typename BVHN<N>::NodeRef 
BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRef> + (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + + template<int N> + typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRef> + (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template<int N> + typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRecordMB { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRecordMB> + (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template struct BVHNBuilderVirtual<4>; + template struct BVHNBuilderQuantizedVirtual<4>; + template struct BVHNBuilderMblurVirtual<4>; + +#if defined(__AVX__) + template struct BVHNBuilderVirtual<8>; + template struct BVHNBuilderQuantizedVirtual<8>; + template struct BVHNBuilderMblurVirtual<8>; +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder.h b/thirdparty/embree/kernels/bvh/bvh_builder.h new file mode 100644 index 0000000000..e35d052a62 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder.h @@ -0,0 +1,115 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "../builders/bvh_builder_sah.h" +#include "../builders/bvh_builder_msmblur.h" + +namespace embree +{ + namespace isa + { + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N> + struct BVHNBuilderVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); 
+ virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template<int N> + struct BVHNBuilderQuantizedVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template<int N> + struct BVHNBuilderMblurVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange); + virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange); + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp new file mode 100644 index 0000000000..4a4d8d71df --- /dev/null +++ 
b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp @@ -0,0 +1,531 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" +#include "bvh_rotate.h" +#include "../common/profile.h" +#include "../../common/algorithms/parallel_prefix_sum.h" + +#include "../builders/primrefgen.h" +#include "../builders/bvh_builder_morton.h" + +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +#if defined(__64BIT__) +# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform +#else +# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues +#endif + +namespace embree +{ + namespace isa + { + template<int N> + struct SetBVHNBounds + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + typedef typename BVH::AABBNode AABBNode; + + BVH* bvh; + __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num) + { + AABBNode* node = ref.getAABBNode(); + + BBox3fa res = empty; + for (size_t i=0; i<num; i++) { + const BBox3fa b = children[i].bounds; + res.extend(b); + node->setRef(i,children[i].ref); + node->setBounds(i,b); + } + + BBox3fx result = (BBox3fx&)res; +#if ROTATE_TREE + if (N == 4) + { + size_t n = 0; + for (size_t i=0; i<num; i++) + n += children[i].bounds.lower.a; + + if (n >= 4096) { + for (size_t i=0; i<num; i++) { + if (children[i].bounds.lower.a < 4096) { + for (int j=0; j<ROTATE_TREE; j++) + BVHNRotate<N>::rotate(node->child(i)); + node->child(i).setBarrier(); + } + } + } + result.lower.a = unsigned(n); + } +#endif + + return NodeRecord(ref,result); + } + }; + + template<int N, typename Primitive> + struct CreateMortonLeaf; + + template<int N> + struct CreateMortonLeaf<N,Triangle4> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + + 
Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = unsigned(current.size()); +#endif + return NodeRecord(ref,box_o); + } + + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Triangle4v> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Triangle4i> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 v0 = zero, v1 = zero, v2 = zero; + vuint4 vgeomID = -1, vprimID = -1; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = 
max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID[i] = geomID_; + vprimID[i] = primID; + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; + } + + for (size_t i=items; i<4; i++) + { + vgeomID[i] = vgeomID[0]; + vprimID[i] = -1; + v0[i] = 0; + v1[i] = 0; + v2[i] = 0; + } + Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Quad4v> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero; + const QuadMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const QuadMesh::Quad& tri = mesh->quad(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + const Vec3fa& p3 = mesh->vertex(tri.v[3]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + QuadMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Object> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + + /* allocate leaf node */ + Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const UserGeometry* mesh = this->mesh; + + BBox3fa bounds = empty; + for 
(size_t i=0; i<items; i++) + { + const unsigned int index = morton[start+i].index; + const unsigned int primID = index; + bounds.extend(mesh->bounds(primID)); + new (&accel[i]) Object(geomID_,primID); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + UserGeometry* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,InstancePrimitive> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items <= 1); + + /* allocate leaf node */ + InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const Instance* instance = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + bounds.extend(instance->bounds(primID)); + new (&accel[i]) InstancePrimitive(instance, geomID_); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + Instance* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<typename Mesh> + struct CalculateMeshBounds + { + __forceinline CalculateMeshBounds (Mesh* mesh) + : mesh(mesh) {} + + __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) { + return mesh->bounds(morton.index); + } + + private: + Mesh* mesh; + }; + + template<int N, typename Mesh, typename Primitive> + class BVHNMeshBuilderMorton : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + public: + + BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD) + : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} + + /* build function */ + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + morton.clear(); + } + size_t numPrimitives = mesh->size(); + numPreviousPrimitives = numPrimitives; + + /* skip build for empty scene */ + if (numPrimitives == 0) { + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* preallocate arrays */ + morton.resize(numPrimitives); + size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim); + bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused 
to sort the morton codes + bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated); + + /* create morton code array */ + BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); + size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface); + + /* create BVH */ + SetBVHNBounds<N> setBounds(bvh); + CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data()); + CalculateMeshBounds<Mesh> calculateBounds(mesh); + auto root = BVHBuilderMorton::build<NodeRecord>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create(), + setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface, + morton.data(),dest,numPrimitivesGen,settings); + + bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); + +#if ROTATE_TREE + if (N == 4) + { + for (int i=0; i<ROTATE_TREE; i++) + BVHNRotate<N>::rotate(bvh->root); + bvh->clearBarrier(bvh->root); + } +#endif + + /* clear temporary data for static geometry */ + if (bvh->scene->isStaticAccel()) { + morton.clear(); + } + bvh->cleanup(); + } + + void clear() { + morton.clear(); + } + + private: + BVH* bvh; + Mesh* mesh; + mvector<BVHBuilderMorton::BuildPrim> morton; + BVHBuilderMorton::Settings settings; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + }; + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* 
BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp new file mode 100644 index 0000000000..fad02fcc04 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp @@ -0,0 +1,543 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define PROFILE 0 +#define PROFILE_RUNS 20 + +namespace embree +{ + namespace isa + { + template<int N, typename Primitive> + struct CreateLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + + template<int N, typename Primitive> + struct CreateLeafQuantized + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + /************************************************************************************/ + /************************************************************************************/ + 
/************************************************************************************/ + /************************************************************************************/ + + template<int N, typename Primitive> + struct BVHNBuilderSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector<PrimRef> prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max (); + bool primrefarrayalloc; + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, + const Geometry::GTypeMask gtype, bool primrefarrayalloc = false) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), + settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {} + + BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + /* create primref array */ + if (primrefarrayalloc) { + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + } + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + prims.resize(numPrimitives); + + PrimInfo pinfo = mesh ? 
+ createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,numPrimitives,prims,bvh->scene->progressInterface); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + +#if PROFILE + }); +#endif + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N, typename Primitive> + struct BVHNBuilderSAHQuantized : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector<PrimRef> prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {} + + BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + prims.clear(); + bvh->clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + /* create primref array */ + prims.resize(numPrimitives); + PrimInfo pinfo = mesh ? 
+ createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,numPrimitives,prims,bvh->scene->progressInterface); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* call BVH builder */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!! +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + + template<int N, typename Primitive> + struct CreateLeafGrid + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); //Primitive::blocks(n); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridQBVHN<N>* accel = (SubGridQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs); + + for (size_t g=0;g<num_geomIDs;g++) + { + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData& sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + bounds[pos] = prims[start+i].bounds(); + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridQBVHN<N>(x,y,primID,bounds,geomIDs[g],pos); + } + + return node; + } + + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + + + template<int N> + struct BVHNBuilderSAHGrid : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + 
GridMesh* mesh; + mvector<PrimRef> prims; + mvector<SubGridBuildData> sgrids; + GeneralBVHBuilder::Settings settings; + const unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} + + BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false); + numPreviousPrimitives = numGridPrimitives; + + + PrimInfo pinfo = mesh ? createPrimRefArrayGrids(mesh,prims,sgrids) : createPrimRefArrayGrids(scene,prims,sgrids); + const size_t numPrimitives = pinfo.size(); + /* no primitives */ + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + sgrids.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? 
"" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + + /* create primref array */ + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + sgrids.clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary array */ + sgrids.clear(); + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH4Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + + Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new 
BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH8Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + Builder* BVH8QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8QuantizedTriangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + + + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH4QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH8QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + 
+#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + + Builder* BVH4VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type); + } +#if defined(__AVX__) + + Builder* BVH8VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype); + } +#if defined(__AVX__) + Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); } + Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct + +#if defined(__AVX__) + Builder* BVH8GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); } + Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct +#endif +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp new file mode 100644 index 0000000000..d163a80ab1 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp @@ -0,0 +1,705 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/bvh_builder_msmblur.h" + +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include 
"../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + + +namespace embree +{ + namespace isa + { + +#if 0 + template<int N, typename Primitive> + struct CreateMBlurLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(set.size()); + size_t start = set.begin(); + for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa allBounds = empty; + for (size_t i=0; i<items; i++) + allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time)); + + return NodeRecordMB(node,allBounds); + } + + BVH* bvh; + PrimRef* prims; + size_t time; + }; +#endif + + template<int N, typename Mesh, typename Primitive> + struct CreateMSMBlurLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(current.prims.size()); + size_t start = current.prims.begin(); + size_t end = current.prims.end(); + for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteNodeAlignment); + NodeRef node = bvh->encodeLeaf((char*)accel,items); + LBBox3fa allBounds = empty; + for (size_t i=0; i<items; i++) + allBounds.extend(accel[i].fillMB(current.prims.prims->data(), start, current.prims.end(), bvh->scene, current.prims.time_range)); + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + BVH* bvh; + }; + + /* Motion blur BVH with 4D nodes and internal time splits */ + template<int N, typename Mesh, typename Primitive> + struct BVHNBuilderMBlurSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + typedef typename BVHN<N>::NodeRecordMB NodeRecordMB; + typedef typename BVHN<N>::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + const Geometry::GTypeMask gtype_; + + BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), 
maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype_,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + + /*if (numTimeSegments == 1) + buildSingleSegment(numPrimitives); + else*/ + buildMultiSegment(numPrimitives); + +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf. + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRef> prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,numPrimitives,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build<NodeRecordMB> + (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(), + CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRefMB> prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,numPrimitives,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = Primitive::singleTimeSegment; + settings.singleThreadThreshold = 
bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device, + RecalculatePrimRef<Mesh>(scene), + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh), + bvh->scene->progressInterface, + settings); + + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct GridRecalculatePrimRef + { + Scene* scene; + const SubGridBuildData * const sgrids; + + __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids) + : scene(scene), sgrids(sgrids) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get<GridMesh>(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range); + const unsigned num_time_segments = mesh->numTimeSegments(); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get<GridMesh>(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + return mesh->linearBounds(mesh->grid(primID),x,y,time_range); + } + + }; + + template<int N> + struct CreateMSMBlurLeafGrid + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = current.prims.size(); + const size_t start = current.prims.begin(); + + const PrimRefMB* prims = current.prims.prims->data(); + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) 
alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;g<num_geomIDs;g++) + { + const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]); + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + const size_t x = sgrid_bd.x(); + const size_t y = sgrid_bd.y(); + LBBox3fa newBounds = mesh->linearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range); + allBounds.extend(newBounds); + bounds0[pos] = newBounds.bounds0; + bounds1[pos] = newBounds.bounds1; + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos); + } + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + +#if 0 + template<int N> + struct CreateLeafGridMB + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) + : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;g<num_geomIDs;g++) + { + const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]); + + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + const size_t x = sgrid_bd.x(); + const size_t y = sgrid_bd.y(); + bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]); + bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]); + assert(valid0); + assert(valid1); + allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos])); + pos++; + } + new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos); + } + return 
NodeRecordMB(node,allBounds); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; +#endif + + + /* Motion blur BVH with 4D nodes and internal time splits */ + template<int N> + struct BVHNBuilderMBlurSAHGrid : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + typedef typename BVHN<N>::NodeRecordMB NodeRecordMB; + typedef typename BVHN<N>::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + mvector<SubGridBuildData> sgrids; + + + BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {} + + + PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,true> iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j,range<size_t>(0,1))) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,unsigned(geomID),unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + size_t numPrimitives = pinfo.size(); + if (numPrimitives == 0) return pinfo; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + const GridMesh::Grid &g = mesh->grid(j); + if (!mesh->valid(j,range<size_t>(0,1))) continue; + + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState<PrimInfoMB> pstate; + Scene::Iterator<GridMesh,true> iter(scene); + + pstate.init(iter,size_t(1024)); + /* iterate over all meshes in the scene */ + PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, 
size_t k, size_t /*geomID*/) -> PrimInfoMB { + + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; + LBBox3fa bounds(empty); + PrimInfoMB gridMB(0,mesh->getNumSubGrids(j)); + pinfoMB.merge(gridMB); + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + size_t numPrimitives = pinfoMB.size(); + if (numPrimitives == 0) return pinfoMB; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + + k = base.size(); + size_t p_index = k; + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; + const GridMesh::Grid &g = mesh->grid(j); + + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + const PrimRefMB prim(mesh->linearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index)); + pinfoMB.add_primref(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + assert(pinfoMB.size() == numPrimitives); + pinfoMB.time_range = t0t1; + return pinfoMB; + } + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid"); + + //const size_t numTimeSteps = scene->getNumTimeSteps<GridMesh,true>(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + //if (numTimeSegments == 1) + // buildSingleSegment(numPrimitives); + //else + buildMultiSegment(numPrimitives); + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRef> prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + //TODO: check leaf_bytes + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build<NodeRecordMB> + (typename BVH::CreateAlloc(bvh), + 
typename BVH::AABBNodeMB::Create(), + typename BVH::AABBNodeMB::Set(), + CreateLeafGridMB<N>(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRefMB> prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + + + GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data()); + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + //FIXME: check leaf_bytes + //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN<N>)); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = false; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device, + recalculatePrimRef, + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + settings); + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#if defined(__AVX__) + Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#if defined(__AVX__) + Builder* BVH8Quad4iMBSceneBuilderSAH (void* 
bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#if defined(__AVX__) + Builder* BVH8VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } +#if defined(__AVX__) + Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); } +#if defined(__AVX__) + Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); } +#endif +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp new file mode 100644 index 0000000000..a4e55d7484 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp @@ -0,0 +1,201 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" + +#include "../builders/primrefgen.h" +#include "../builders/primrefgen_presplit.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +namespace embree +{ + namespace isa + { + template<int N, typename Primitive> + struct CreateLeafSpatial + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + template<int N, typename Mesh, typename Primitive, typename Splitter> + struct BVHNBuilderFastSpatialSAH 
: public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + BVH* bvh; + Scene* scene; + Mesh* mesh; + mvector<PrimRef> prims0; + GeneralBVHBuilder::Settings settings; + const float splitFactor; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications) {} + + BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false); + numPreviousPrimitives = numOriginalPrimitives; + if (numOriginalPrimitives == 0) { + prims0.clear(); + bvh->clear(); + return; + } + + const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>(); + const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))); + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH")); + + /* create primref array */ + const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives)); + prims0.resize(numSplitPrimitives); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + NodeRef root(0); + PrimInfo pinfo; + + + if (likely(usePreSplits)) + { + /* spatial presplit SAH BVH builder */ + pinfo = mesh ? + createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) : + createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings); + } + else + { + /* standard spatial split SAH BVH builder */ + pinfo = mesh ? 
+ createPrimRefArray(mesh,geomID_,numSplitPrimitives,prims0,bvh->scene->progressInterface) : + createPrimRefArray(scene,Mesh::geom_type,false,numSplitPrimitives,prims0,bvh->scene->progressInterface); + + Splitter splitter(scene); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + CreateLeafSpatial<N,Primitive>(bvh), + splitter, + bvh->scene->progressInterface, + prims0.data(), + numSplitPrimitives, + pinfo,settings); + + /* ==================== */ + } + + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims0.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims0.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + + Builder* BVH4Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif + +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp new file mode 100644 index 
0000000000..5d45ed3748 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp @@ -0,0 +1,369 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder_twolevel.h" +#include "bvh_statistics.h" +#include "../builders/bvh_builder_sah.h" +#include "../common/scene_line_segments.h" +#include "../common/scene_triangle_mesh.h" +#include "../common/scene_quad_mesh.h" + +#define PROFILE 0 + +namespace embree +{ + namespace isa + { + template<int N, typename Mesh, typename Primitive> + BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold) + : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {} + + template<int N, typename Mesh, typename Primitive> + BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () { + } + + // =========================================================================== + // =========================================================================== + // =========================================================================== + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build() + { + /* delete some objects */ + size_t num = scene->size(); + if (num < bvh->objects.size()) { + parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + builders[i].reset(); + delete bvh->objects[i]; bvh->objects[i] = nullptr; + } + }); + } + +#if PROFILE + while(1) +#endif + { + /* reset memory allocator */ + bvh->alloc.reset(); + + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype,false); + + if (numPrimitives == 0) { + prims.resize(0); + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* calculate the size of the entire BVH */ + const size_t numLeafBlocks = Primitive::blocks(numPrimitives); + const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N; + const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel"); + + /* resize object array if scene got larger */ + if (bvh->objects.size() < num) bvh->objects.resize(num); + if (builders.size() < num) builders.resize(num); + resizeRefsList (); + nextRef.store(0); + + /* create acceleration structures */ + parallel_for(size_t(0), num, [&] (const range<size_t>& r) + { + for (size_t objectID=r.begin(); objectID<r.end(); objectID++) + { + Mesh* mesh = scene->getSafe<Mesh>(objectID); + + /* ignore meshes we do not support */ + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + + if (isSmallGeometry(mesh)) { + setupSmallBuildRefBuilder (objectID, mesh); + } else { + setupLargeBuildRefBuilder (objectID, mesh); + } + } + }); + + /* parallel build of acceleration structures */ + parallel_for(size_t(0), num, [&] (const range<size_t>& r) + { + for (size_t objectID=r.begin(); objectID<r.end(); objectID++) + { + /* ignore if no triangle mesh or not enabled */ + Mesh* mesh = scene->getSafe<Mesh>(objectID); + if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) + continue; + + builders[objectID]->attachBuildRefs (this); + } + }); + + +#if PROFILE + double d0 = getSeconds(); +#endif + /* fast path for 
single geometry scenes */ + if (nextRef == 1) { + bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives); + } + + else + { + /* open all large nodes */ + refs.resize(nextRef); + + /* this probably needs some more tuning */ + const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR)); + +#if !ENABLE_DIRECT_SAH_MERGE_BUILDER + +#if ENABLE_OPEN_SEQUENTIAL + open_sequential(extSize); +#endif + /* compute PrimRefs */ + prims.resize(refs.size()); +#endif + + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + pinfo.add_center2(refs[i]); + } + return pinfo; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); + +#else + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + pinfo.add_center2(refs[i]); + prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node); + } + return pinfo; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); +#endif + + /* skip if all objects where empty */ + if (pinfo.size() == 0) + bvh->set(BVH::emptyNode,empty,0); + + /* otherwise build toplevel hierarchy */ + else + { + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(N); + settings.minLeafSize = 1; + settings.maxLeafSize = 1; + settings.travCost = 1.0f; + settings.intCost = 1.0f; + settings.singleThreadThreshold = singleThreadThreshold; + +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + refs.resize(extSize); + + NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) refs[range.begin()].node; + }, + [&] (BuildRef &bref, BuildRef *refs) -> size_t { + return openBuildRef(bref,refs); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + refs.data(),extSize,pinfo,settings); +#else + NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) prims[range.begin()].ID(); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + prims.data(),pinfo,settings); +#endif + + + bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives); + } + } + } + + bvh->alloc.cleanup(); + bvh->postBuild(t0); +#if PROFILE + double d1 = getSeconds(); + std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl; +#endif + } + + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID) + { + if (geomID >= bvh->objects.size()) return; + if (builders[geomID]) builders[geomID].reset(); + delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr; + } + + template<int N, typename Mesh, typename 
Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear() + { + for (size_t i=0; i<bvh->objects.size(); i++) + if (bvh->objects[i]) bvh->objects[i]->clear(); + + for (size_t i=0; i<builders.size(); i++) + if (builders[i]) builders[i].reset(); + + refs.clear(); + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize) + { + if (refs.size() == 0) + return; + + refs.reserve(extSize); + +#if 1 + for (size_t i=0;i<refs.size();i++) + { + NodeRef ref = refs[i].node; + if (ref.isAABBNode()) + BVH::prefetch(ref); + } +#endif + + std::make_heap(refs.begin(),refs.end()); + while (refs.size()+N-1 <= extSize) + { + std::pop_heap (refs.begin(),refs.end()); + NodeRef ref = refs.back().node; + if (ref.isLeaf()) break; + refs.pop_back(); + + AABBNode* node = ref.getAABBNode(); + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVH::emptyNode) continue; + refs.push_back(BuildRef(node->bounds(i),node->child(i))); + +#if 1 + NodeRef ref_pre = node->child(i); + if (ref_pre.isAABBNode()) + ref_pre.prefetch(); +#endif + std::push_heap (refs.begin(),refs.end()); + } + } + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/) + { + if (builders[objectID] == nullptr || // new mesh + dynamic_cast<RefBuilderSmall*>(builders[objectID].get()) == nullptr) // size change resulted in large->small change + { + builders[objectID].reset (new RefBuilderSmall(objectID)); + } + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh) + { + if (bvh->objects[objectID] == nullptr || // new mesh + builders[objectID]->meshQualityChanged (mesh->quality) || // changed build quality + dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr) // size change resulted in small->large change + { + Builder* builder = nullptr; + delete bvh->objects[objectID]; + createMeshAccel(objectID, builder); + builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality)); + } + } + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, 
Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#if defined(__AVX__) +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h new file mode 100644 index 0000000000..dc7ec7d278 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h @@ -0,0 +1,263 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <type_traits> + +#include "bvh_builder_twolevel_internal.h" +#include "bvh.h" +#include "../common/primref.h" +#include "../builders/priminfo.h" +#include "../builders/primrefgen.h" + +/* new open/merge builder */ +#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1 +#define ENABLE_OPEN_SEQUENTIAL 0 +#define SPLIT_MEMORY_RESERVE_FACTOR 1000 +#define SPLIT_MEMORY_RESERVE_SCALE 2 +#define SPLIT_MIN_EXT_SPACE 1000 + +namespace embree +{ + namespace isa + { + template<int N, typename Mesh, typename Primitive> + class BVHNBuilderTwoLevel : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + __forceinline static bool isSmallGeometry(Mesh* mesh) { + return mesh->size() <= 4; + } + + public: + + typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + + struct BuildRef : public PrimRef + { + public: + __forceinline BuildRef () {} + + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node) + : PrimRef(bounds,(size_t)node), node(node) + { + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + /* used by the open/merge bvh builder */ + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives) + : PrimRef(bounds,geomID,numPrimitives), node(node) + { + /* important for relative buildref ordering */ + if (node.isLeaf()) + bounds_area = 
0.0f; + else + bounds_area = area(this->bounds()); + } + + __forceinline size_t size() const { + return primID(); + } + + friend bool operator< (const BuildRef& a, const BuildRef& b) { + return a.bounds_area < b.bounds_area; + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }"; + } + + __forceinline unsigned int numPrimitives() const { return primID(); } + + public: + NodeRef node; + float bounds_area; + }; + + + __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) { + if (bref.node.isLeaf()) + { + refs[0] = bref; + return 1; + } + NodeRef ref = bref.node; + unsigned int geomID = bref.geomID(); + unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); + AABBNode* node = ref.getAABBNode(); + size_t n = 0; + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVH::emptyNode) continue; + refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims); + n++; + } + assert(n > 1); + return n; + } + + /*! Constructor. */ + BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD); + + /*! Destructor */ + ~BVHNBuilderTwoLevel (); + + /*! builder entry point */ + void build(); + void deleteGeometry(size_t geomID); + void clear(); + + void open_sequential(const size_t extSize); + + private: + + class RefBuilderBase { + public: + virtual ~RefBuilderBase () {} + virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0; + virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0; + }; + + class RefBuilderSmall : public RefBuilderBase { + public: + + RefBuilderSmall (size_t objectID) + : objectID_ (objectID) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) { + + Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_); + size_t meshSize = mesh->size(); + assert(isSmallGeometry(mesh)); + + mvector<PrimRef> prefs(topBuilder->scene->device, meshSize); + auto pinfo = createPrimRefArray(mesh,objectID_,meshSize,prefs,topBuilder->bvh->scene->progressInterface); + + size_t begin=0; + while (begin < pinfo.size()) + { + Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1); + accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene); + + /* create build primitive */ +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node); +#endif + } + assert(begin == pinfo.size()); + } + + bool meshQualityChanged (RTCBuildQuality /*currQuality*/) { + return false; + } + + size_t objectID_; + }; + + class RefBuilderLarge : public RefBuilderBase { + public: + + RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality) + : objectID_ (objectID), builder_ (builder), quality_ (quality) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) + { + BVH* object = topBuilder->getBVH(objectID_); assert(object); + + /* build object if it got modified */ + if 
(topBuilder->isGeometryModified(objectID_)) + builder_->build(); + + /* create build primitive */ + if (!object->getBounds().empty()) + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + Mesh* mesh = topBuilder->getMesh(objectID_); + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size()); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root); +#endif + } + } + + bool meshQualityChanged (RTCBuildQuality currQuality) { + return currQuality != quality_; + } + + private: + size_t objectID_; + Ref<Builder> builder_; + RTCBuildQuality quality_; + }; + + void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh); + void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh); + + BVH* getBVH (size_t objectID) { + return this->bvh->objects[objectID]; + } + Mesh* getMesh (size_t objectID) { + return this->scene->template getSafe<Mesh>(objectID); + } + bool isGeometryModified (size_t objectID) { + return this->scene->isGeometryModified(objectID); + } + + void resizeRefsList () + { + size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0), + [this](const range<size_t>& r)->size_t { + size_t c = 0; + for (auto i=r.begin(); i<r.end(); ++i) { + Mesh* mesh = scene->getSafe<Mesh>(i); + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + size_t meshSize = mesh->size(); + c += isSmallGeometry(mesh) ? Primitive::blocks(meshSize) : 1; + } + return c; + }, + std::plus<size_t>() + ); + + if (refs.size() < num) { + refs.resize(num); + } + } + + void createMeshAccel (size_t geomID, Builder*& builder) + { + bvh->objects[geomID] = new BVH(Primitive::type,scene); + BVH* accel = bvh->objects[geomID]; + auto mesh = scene->getSafe<Mesh>(geomID); + if (nullptr == mesh) { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type"); + return; + } + + __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder); + } + + using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>; + + BuilderList builders; + BVH* bvh; + Scene* scene; + mvector<BuildRef> refs; + mvector<PrimRef> prims; + std::atomic<int> nextRef; + const size_t singleThreadThreshold; + Geometry::GTypeMask gtype; + bool useMortonBuilder_ = false; + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h new file mode 100644 index 0000000000..023b52b780 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h @@ -0,0 +1,267 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA 
TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + 
DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + + namespace isa + { + + namespace __internal_two_level_builder__ { + + template<int N, typename Mesh, typename Primitive> + struct MortonBuilder {}; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct 
MortonBuilder<8,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct SAHBuilder {}; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct RefitBuilder {}; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + 
template<> + struct RefitBuilder<4,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct MeshBuilder { + MeshBuilder () {} + void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) { + if(useMortonBuilder) { + builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); + return; + } + switch (mesh->quality) { + case RTC_BUILD_QUALITY_LOW: builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_MEDIUM: + case RTC_BUILD_QUALITY_HIGH: builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_REFIT: builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality"); + } + } + }; + } + } +}
\ No newline at end of file diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.cpp b/thirdparty/embree/kernels/bvh/bvh_collider.cpp new file mode 100644 index 0000000000..9428c0b88e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_collider.cpp @@ -0,0 +1,375 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_collider.h" +#include "../geometry/triangle_triangle_intersector.h" + +namespace embree +{ + namespace isa + { +#define CSTAT(x) + + size_t parallel_depth_threshold = 3; + CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0)); + CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0)); + CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0)); + + struct Collision + { + __forceinline Collision() {} + + __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1) + : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {} + + unsigned geomID0; + unsigned primID0; + unsigned geomID1; + unsigned primID1; + }; + + template<int N> + __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),node1.lower_x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),node1.lower_y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),node1.lower_z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),node1.upper_x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),node1.upper_y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),node1.upper_z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template<int N> + __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),box1.lower.x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),box1.lower.y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),box1.lower.z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),box1.upper.x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),box1.upper.y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template<int N> + __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x[i]),box1.upper.x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y[i]),box1.upper.y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z[i]),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1) + { + 
CSTAT(bvh_collide_prim_intersections1++); + const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0); + const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1); + const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0); + const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1); + + /* special culling for scene intersection with itself */ + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore self intersections */ + if (primID0 == primID1) + return false; + } + CSTAT(bvh_collide_prim_intersections2++); + + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore intersection with topological neighbors */ + const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]); + if (any(vint4(tri1.v[0]) == t0)) return false; + if (any(vint4(tri1.v[1]) == t0)) return false; + if (any(vint4(tri1.v[2]) == t0)) return false; + } + CSTAT(bvh_collide_prim_intersections3++); + + const Vec3fa a0 = mesh0->vertex(tri0.v[0]); + const Vec3fa a1 = mesh0->vertex(tri0.v[1]); + const Vec3fa a2 = mesh0->vertex(tri0.v[2]); + const Vec3fa b0 = mesh1->vertex(tri1.v[0]); + const Vec3fa b1 = mesh1->vertex(tri1.v[1]); + const Vec3fa b2 = mesh1->vertex(tri1.v[2]); + + return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2); + } + + template<int N> + __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1) + { + Collision collisions[16]; + size_t num_collisions = 0; + + size_t N0; Object* leaf0 = (Object*) node0.leaf(N0); + size_t N1; Object* leaf1 = (Object*) node1.leaf(N1); + for (size_t i=0; i<N0; i++) { + for (size_t j=0; j<N1; j++) { + const unsigned geomID0 = leaf0[i].geomID(); + const unsigned primID0 = leaf0[i].primID(); + const unsigned geomID1 = leaf1[j].geomID(); + const unsigned primID1 = leaf1[j].primID(); + if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue; + collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1); + if (num_collisions == 16) { + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + num_collisions = 0; + } + } + } + if (num_collisions) + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + } + + template<int N> + void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1) + { + CSTAT(bvh_collide_traversal_steps++); + if (unlikely(ref0.isLeaf())) { + if (unlikely(ref1.isLeaf())) { + CSTAT(bvh_collide_leaf_pairs++); + processLeaf(ref0,ref1); + return; + } else goto recurse_node1; + + } else { + if (unlikely(ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(bounds0) > area(bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + AABBNode* node0 = ref0.getAABBNode(); + size_t mask = overlap<N>(bounds1,*node0); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i<N; i++) { +#if 0 + if (depth0 < parallel_depth_threshold) + { + parallel_for(size_t(N), [&] ( size_t i ) { + if (mask & ( 1 << i)) { + BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + } + return; + } + + { + recurse_node1: + AABBNode* node1 = 
ref1.getAABBNode(); + size_t mask = overlap<N>(bounds0,*node1); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i<N; i++) { +#if 0 + if (depth1 < parallel_depth_threshold) + { + parallel_for(size_t(N), [&] ( size_t i ) { + if (mask & ( 1 << i)) { + BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + } + return; + } + } + + template<int N> + void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs) + { + if (unlikely(job.ref0.isLeaf())) { + if (unlikely(job.ref1.isLeaf())) { + jobs.push_back(job); + return; + } else goto recurse_node1; + } else { + if (unlikely(job.ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(job.bounds0) > area(job.bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + const AABBNode* node0 = job.ref0.getAABBNode(); + size_t mask = overlap<N>(job.bounds1,*node0); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1)); + } + return; + } + + { + recurse_node1: + const AABBNode* node1 = job.ref1.getAABBNode(); + size_t mask = overlap<N>(job.bounds0,*node1); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1)); + } + return; + } + } + + template<int N> + void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1) + { + CSTAT(bvh_collide_traversal_steps = 0); + CSTAT(bvh_collide_leaf_pairs = 0); + CSTAT(bvh_collide_leaf_iterations = 0); + CSTAT(bvh_collide_prim_intersections1 = 0); + CSTAT(bvh_collide_prim_intersections2 = 0); + CSTAT(bvh_collide_prim_intersections3 = 0); + CSTAT(bvh_collide_prim_intersections4 = 0); + CSTAT(bvh_collide_prim_intersections5 = 0); + CSTAT(bvh_collide_prim_intersections = 0); +#if 0 + collide_recurse(ref0,bounds0,ref1,bounds1,0,0); +#else + const int M = 2048; + jobvector jobs[2]; + jobs[0].reserve(M); + jobs[1].reserve(M); + jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0)); + int source = 0; + int target = 1; + + /* try to split job until job list is full */ + while (jobs[source].size()+8 <= M) + { + for (size_t i=0; i<jobs[source].size(); i++) + { + const CollideJob& job = jobs[source][i]; + size_t remaining = jobs[source].size()-i; + if (jobs[target].size()+remaining+8 > M) { + jobs[target].push_back(job); + } else { + split(job,jobs[target]); + } + } + + /* stop splitting jobs if we reached only leaves and cannot make progress anymore */ + if (jobs[target].size() == jobs[source].size()) + break; + + jobs[source].resize(0); + std::swap(source,target); + } + + /* parallel processing of all jobs */ + parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) { + CollideJob& j = jobs[source][i]; + collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1); + }); + + +#endif + CSTAT(PRINT(bvh_collide_traversal_steps)); + CSTAT(PRINT(bvh_collide_leaf_pairs)); + CSTAT(PRINT(bvh_collide_leaf_iterations)); + CSTAT(PRINT(bvh_collide_prim_intersections1)); + CSTAT(PRINT(bvh_collide_prim_intersections2)); + 
CSTAT(PRINT(bvh_collide_prim_intersections3)); + CSTAT(PRINT(bvh_collide_prim_intersections4)); + CSTAT(PRINT(bvh_collide_prim_intersections5)); + CSTAT(PRINT(bvh_collide_prim_intersections)); + } + + template<int N> + void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr) + { + BVHNColliderUserGeom<N>(bvh0->scene,bvh1->scene,callback,userPtr). + collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds()); + } + +#if defined (EMBREE_LOWEST_ISA) + struct collision_regression_test : public RegressionTest + { + collision_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f), + Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa( 2,0.5f,0) + 
Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; + return passed; + } + }; + + collision_regression_test collision_regression("collision_regression_test"); +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Collider Definitions + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); + +#if defined(__AVX__) + DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.h b/thirdparty/embree/kernels/bvh/bvh_collider.h new file mode 100644 index 0000000000..3c42f211c1 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_collider.h @@ -0,0 +1,72 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/trianglev.h" +#include "../geometry/object.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNCollider + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + struct CollideJob + { + CollideJob () {} + + CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0, + NodeRef ref1, const BBox3fa& bounds1, size_t depth1) + : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {} + + NodeRef ref0; + BBox3fa bounds0; + size_t depth0; + NodeRef ref1; + BBox3fa bounds1; + size_t depth1; + }; + + typedef vector_t<CollideJob, aligned_allocator<CollideJob,16>> jobvector; + + void split(const CollideJob& job, jobvector& jobs); + + public: + __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {} + + public: + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; + void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1); + void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1); + + protected: + Scene* scene0; + Scene* scene1; + RTCCollideFunc callback; + void* userPtr; + }; + + template<int N> + class BVHNColliderUserGeom : public BVHNCollider<N> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : BVHNCollider<N>(scene0,scene1,callback,userPtr) {} + + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1); + public: + static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_factory.h b/thirdparty/embree/kernels/bvh/bvh_factory.h new file mode 100644 index 0000000000..453d455bd9 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_factory.h @@ -0,0 +1,21 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../common/isa.h" +#include "../common/accel.h" +#include "../common/scene.h" +#include 
"../geometry/curve_intersector_virtual.h" + +namespace embree +{ + /*! BVH instantiations */ + class BVHFactory + { + public: + enum class BuildVariant { STATIC, DYNAMIC, HIGH_QUALITY }; + enum class IntersectVariant { FAST, ROBUST }; + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp new file mode 100644 index 0000000000..9594f402c3 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.h" +#include "node_intersector1.h" +#include "bvh_traverser1.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + template<int N, int types, bool robust, typename PrimitiveIntersector1> + void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This, + RayHit& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + StackItemT<NodeRef> stack[stackSize]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + if (bvh->root == BVH::emptyNode) + return; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay<N,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, types> nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > ray.tfar)) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(normal.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto 
pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node); + tray.tfar = ray.tfar; + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This, + Ray& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* early out for already occluded rays */ + if (unlikely(ray.tfar < 0.0f)) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + NodeRef stack[stackSize]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSize; + stack[0] = bvh->root; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay<N,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, types> nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(shadow.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) { + ray.tfar = neg_inf; + break; + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + *stackPtr = (NodeRef)lazy_node; + stackPtr++; + } + } + } + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + struct PointQueryDispatch + { + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + 
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return false; + + /* stack state */ + StackItemT<NodeRef> stack[stackSize]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + /* verify correct input */ + assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f)); + + /* load the point query into SIMD registers */ + TravPointQuery<N> tquery(query->p, context->query_radius); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N,types> nodeTraverser; + + bool changed = false; + float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > cull_radius)) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(point_query.trav_nodes,1,1,1); + bool nodeIntersected; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask); + } else { + nodeIntersected = BVHNNodePointQueryAABB1 <N, types>::pointQuery(cur, tquery, query->time, tNear, mask); + } + if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(point_query.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node)) + { + changed = true; + tquery.rad = context->query_radius; + cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? 
query->radius * query->radius + : dot(context->query_radius, context->query_radius); + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + return changed; + } + }; + + /* disable point queries for not yet supported geometry types */ + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery( + const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + return PointQueryDispatch<N, types, robust, PrimitiveIntersector1>::pointQuery(This, query, context); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.h b/thirdparty/embree/kernels/bvh/bvh_intersector1.h new file mode 100644 index 0000000000..2df3d6eddb --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + /*! BVH single ray intersector. 
*/ + template<int N, int types, bool robust, typename PrimitiveIntersector1> + class BVHNIntersector1 + { + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + public: + static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context); + static void occluded (const Accel::Intersectors* This, Ray& ray, IntersectContext* context); + static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp new file mode 100644 index 0000000000..831d613367 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp @@ -0,0 +1,61 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.cpp" + +namespace embree +{ + namespace isa + { + int getISA() { + return VerifyMultiTargetLinking::getISA(); + } + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector1 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA 
true COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >)); + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h 
b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h new file mode 100644 index 0000000000..50ebf375c4 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h @@ -0,0 +1,58 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" +#include "node_intersector_frustum.h" + +namespace embree +{ + namespace isa + { + template<int K, bool robust> + struct TravRayK; + + /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */ + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true> + class BVHNIntersectorKHybrid + { + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersectorK::Precalculations Precalculations; + typedef typename PrimitiveIntersectorK::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth; + + static const size_t switchThresholdIncoherent = \ + (K==4) ? 3 : + (K==8) ? ((N==4) ? 5 : 7) : + (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal + 0; + + private: + static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); + static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); + + public: + static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); + static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); + + static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); + static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); + + }; + + /*! BVH packet intersector. */ + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK> + class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {}; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h new file mode 100644 index 0000000000..717f559677 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h @@ -0,0 +1,270 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector_packet_stream.h" +#include "node_intersector_frustum.h" +#include "bvh_traverser_stream.h" + +namespace embree +{ + namespace isa + { + /*! BVH ray stream intersector. 
*/ + template<int N, int types, bool robust, typename PrimitiveIntersector> + class BVHNIntersectorStream + { + /* shortcuts for frequently used types */ + template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>; + template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + + template<int K> + __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays, + TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant) + { + const size_t numPackets = (numOctantRays+K-1)/K; + + Vec3vf<K> tmp_min_rdir(pos_inf); + Vec3vf<K> tmp_max_rdir(neg_inf); + Vec3vf<K> tmp_min_org(pos_inf); + Vec3vf<K> tmp_max_org(neg_inf); + vfloat<K> tmp_min_dist(pos_inf); + vfloat<K> tmp_max_dist(neg_inf); + + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat<K> tnear = inputPackets[i]->tnear(); + const vfloat<K> tfar = inputPackets[i]->tfar; + vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + m_valid &= inputPackets[i]->valid(); +#endif + + m_active |= (size_t)movemask(m_valid) << (i*K); + + vfloat<K> packet_min_dist = max(tnear, 0.0f); + vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); + tmp_min_dist = min(tmp_min_dist, packet_min_dist); + tmp_max_dist = max(tmp_max_dist, packet_max_dist); + + const Vec3vf<K>& org = inputPackets[i]->org; + const Vec3vf<K>& dir = inputPackets[i]->dir; + + new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist); + + tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf))); + tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf))); + tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf))); + tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf))); + } + + m_active &= (numOctantRays == (8 * sizeof(size_t))) ? 
(size_t)-1 : (((size_t)1 << numOctantRays)-1); + + + const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), + reduce_min(tmp_min_rdir.y), + reduce_min(tmp_min_rdir.z)); + + const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), + reduce_max(tmp_max_rdir.y), + reduce_max(tmp_max_rdir.z)); + + const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), + reduce_min(tmp_min_org.y), + reduce_min(tmp_min_org.z)); + + const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), + reduce_max(tmp_max_org.y), + reduce_max(tmp_max_org.z)); + + commonOctant = + (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && + (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && + (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); + + const float frustum_min_dist = reduce_min(tmp_min_dist); + const float frustum_max_dist = reduce_max(tmp_max_dist); + + frustum.init(reduced_min_origin, reduced_max_origin, + reduced_min_rdir, reduced_max_rdir, + frustum_min_dist, frustum_max_dist, + N); + + return m_active; + } + + template<int K> + __forceinline static size_t intersectAABBNodePacket(size_t m_active, + const TravRayKStream<K,robust>* packets, + const AABBNode* __restrict__ node, + size_t boxID, + const NearFarPrecalculations& nf) + { + assert(m_active); + const size_t startPacketID = bsf(m_active) / K; + const size_t endPacketID = bsr(m_active) / K; + size_t m_trav_active = 0; + for (size_t i = startPacketID; i <= endPacketID; i++) + { + const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf); + m_trav_active |= m_hit << (i*K); + } + return m_trav_active; + } + + template<int K> + __forceinline static size_t traverseCoherentStream(size_t m_active, + TravRayKStream<K, robust>* packets, + const AABBNode* __restrict__ node, + const Frustum<robust>& frustum, + size_t* maskK, + vfloat<N>& dist) + { + size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist); + const size_t first_index = bsf(m_active); + const size_t first_packetID = first_index / K; + const size_t first_rayID = first_index % K; + size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf); + + /* this make traversal independent of the ordering of rays */ + size_t m_node = m_node_hit ^ m_first_hit; + while (unlikely(m_node)) + { + const size_t boxID = bscf(m_node); + const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); + m_node_hit ^= m_current ? 
(size_t)0 : ((size_t)1 << boxID); + maskK[boxID] = m_current; + } + return m_node_hit; + } + + // TODO: explicit 16-wide path for KNL + template<int K> + __forceinline static vint<N> traverseIncoherentStream(size_t m_active, + TravRayKStreamFast<K>* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint<N> vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream<K,robust> &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint<N> bitmask(shiftTable[rayID]); + const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); + const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); + const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); + const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); + + const vbool<N> hit_mask = tNear <= tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint<N>(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif + } while(m_active); + return vmask; + } + + template<int K> + __forceinline static vint<N> traverseIncoherentStream(size_t m_active, + TravRayKStreamRobust<K>* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint<N> vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream<K,robust> &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint<N> bitmask(shiftTable[rayID]); + const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i]; + const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i]; + const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat<N> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i]; + const vfloat<N> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i]; + const vfloat<N> tFarZ = (bmaxZ - p.org.z[i]) * 
p.rdir.z[i]; + const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); + const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); + const float round_down = 1.0f-2.0f*float(ulp); + const float round_up = 1.0f+2.0f*float(ulp); + const vbool<N> hit_mask = round_down*tNear <= round_up*tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint<N>(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif + } while(m_active); + return vmask; + } + + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth; + + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template<int K> + static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + }; + + + /*! BVH ray stream intersector with direct fallback to packets. */ + template<int N> + class BVHNIntersectorStreamPacketFallback + { + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template<int K> + static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h new file mode 100644 index 0000000000..e7df7c2ae2 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/ray.h" +#include "../common/scene.h" + +namespace embree +{ + namespace isa + { + class RayStreamFilter + { + public: + static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context); + static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context); + static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context); + + static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context); + static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context); + static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context); + + private: + template<int K, bool intersect> + static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context); + + template<int K, bool intersect> 
+ static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context); + + template<int K, bool intersect> + static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + + template<int K, bool intersect> + static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context); + }; + } +}; diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h new file mode 100644 index 0000000000..57530692bc --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h @@ -0,0 +1,213 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN AABBNode */ + template<typename NodeRef, int N> + struct AABBNode_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const { + node.getAABBNode()->setRef(i,child); + node.getAABBNode()->setBounds(i,bounds); + } + }; + + struct Create2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear(); + for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds()); + return NodeRef::encodeNode(node); + } + }; + + struct Set2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + return ref; + } + }; + + struct Set3 + { + Set3 (FastAllocator* allocator, PrimRef* prims) + : allocator(allocator), prims(prims) {} + + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + + if (unlikely(precord.alloc_barrier)) + { + PrimRef* begin = &prims[precord.prims.begin()]; + PrimRef* end = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!! + size_t bytes = (size_t)end - (size_t)begin; + allocator->addBlock(begin,bytes); + } + + return ref; + } + + FastAllocator* const allocator; + PrimRef* const prims; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = pos_inf; + upper_x = upper_y = upper_z = neg_inf; + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Sets bounding box of child. 
*/ + __forceinline void setBounds(size_t i, const BBox3fa& bounds) + { + assert(i < N); + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) { + setBounds(i,bounds); + children[i] = ref; + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z)); + const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z)); + return BBox3fa(lower,upper); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]); + const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extend(size_t i) const { + return bounds(i).size(); + } + + /*! Returns bounds of all children (implemented later as specializations) */ + __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const; + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i<N && j<N); + std::swap(children[i],children[j]); + std::swap(lower_x[i],lower_x[j]); + std::swap(lower_y[i],lower_y[j]); + std::swap(lower_z[i],lower_z[j]); + std::swap(upper_x[i],upper_x[j]); + std::swap(upper_y[i],upper_y[j]); + std::swap(upper_z[i],upper_z[j]); + } + + /*! swap the children of two nodes */ + __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j) + { + assert(i<N && j<N); + std::swap(a->children[i],b->children[j]); + std::swap(a->lower_x[i],b->lower_x[j]); + std::swap(a->lower_y[i],b->lower_y[j]); + std::swap(a->lower_z[i],b->lower_z[j]); + std::swap(a->upper_x[i],b->upper_x[j]); + std::swap(a->upper_y[i],b->upper_y[j]); + std::swap(a->upper_z[i],b->upper_z[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNode_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; i<j; i++) { + if (a->child(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n) + { + o << "AABBNode { " << embree_endl; + o << " lower_x " << n.lower_x << embree_endl; + o << " upper_x " << n.upper_x << embree_endl; + o << " lower_y " << n.lower_y << embree_endl; + o << " upper_y " << n.upper_y << embree_endl; + o << " lower_z " << n.lower_z << embree_endl; + o << " upper_z " << n.upper_z << embree_endl; + o << " children = "; + for (size_t i=0; i<N; i++) o << n.children[i] << " "; + o << embree_endl; + o << "}" << embree_endl; + return o; + } + + public: + vfloat<N> lower_x; //!< X dimension of lower bounds of all N children. 
+ vfloat<N> upper_x; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children. + }; + + template<> + __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const { + transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower); + transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper); + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h new file mode 100644 index 0000000000..c4cea7d8ba --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h @@ -0,0 +1,247 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Motion Blur AABBNode */ + template<typename NodeRef, int N> + struct AABBNodeMB_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + struct Create + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + template<typename BuildRecord> + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; i<num; i++) { + node->setRef(i,children[i].ref); + node->setBounds(i,children[i].lbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + }; + + struct SetTimeRange + { + __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {} + + template<typename BuildRecord> + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; i<num; i++) { + node->setRef(i, children[i].ref); + node->setBounds(i, children[i].lbounds, tbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + + BBox1f tbounds; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = vfloat<N>(pos_inf); + upper_x = upper_y = upper_z = vfloat<N>(neg_inf); + lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f); + upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, NodeRef ref) { + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i) + { + /*! 
for empty bounds we have to avoid inf-inf=nan */ + BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX))); + BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX))); + bounds0 = bounds0.enlarge_by(4.0f*float(ulp)); + bounds1 = bounds1.enlarge_by(4.0f*float(ulp)); + Vec3fa dlower = bounds1.lower-bounds0.lower; + Vec3fa dupper = bounds1.upper-bounds0.upper; + + lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z; + upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z; + + lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z; + upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds) { + setBounds(i, bounds.bounds0, bounds.bounds1); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) { + setBounds(i, bounds.global(tbounds)); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) { + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + children[i] = ref; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) + { + setRef(i, child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Return bounding box for time 0 */ + __forceinline BBox3fa bounds0(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]), + Vec3fa(upper_x[i],upper_y[i],upper_z[i])); + } + + /*! Return bounding box for time 1 */ + __forceinline BBox3fa bounds1(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]), + Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i])); + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)), + reduce_min(min(lower_y,lower_y+lower_dy)), + reduce_min(min(lower_z,lower_z+lower_dz))), + Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)), + reduce_max(max(upper_y,upper_y+upper_dy)), + reduce_max(max(upper_z,upper_z+upper_dz)))); + } + + /*! Return bounding box of child i */ + __forceinline BBox3fa bounds(size_t i) const { + return merge(bounds0(i),bounds1(i)); + } + + /*! Return linear bounding box of child i */ + __forceinline LBBox3fa lbounds(size_t i) const { + return LBBox3fa(bounds0(i),bounds1(i)); + } + + /*! Return bounding box of child i at specified time */ + __forceinline BBox3fa bounds(size_t i, float time) const { + return lerp(bounds0(i),bounds1(i),time); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return lbounds(i).expectedHalfArea(); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const { + return lbounds(i).expectedHalfArea(t0t1); + } + + /*! 
swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i<N && j<N); + std::swap(children[i],children[j]); + + std::swap(lower_x[i],lower_x[j]); + std::swap(upper_x[i],upper_x[j]); + std::swap(lower_y[i],lower_y[j]); + std::swap(upper_y[i],upper_y[j]); + std::swap(lower_z[i],lower_z[j]); + std::swap(upper_z[i],upper_z[j]); + + std::swap(lower_dx[i],lower_dx[j]); + std::swap(upper_dx[i],upper_dx[j]); + std::swap(lower_dy[i],lower_dy[j]); + std::swap(upper_dy[i],upper_dy[j]); + std::swap(lower_dz[i],lower_dz[j]); + std::swap(upper_dz[i],upper_dz[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNodeMB_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; i<j; i++) { + if (a->child(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n) + { + cout << "AABBNodeMB {" << embree_endl; + for (size_t i=0; i<N; i++) + { + const BBox3fa b0 = n.bounds0(i); + const BBox3fa b1 = n.bounds1(i); + cout << " child" << i << " { " << embree_endl; + cout << " bounds0 = " << b0 << ", " << embree_endl; + cout << " bounds1 = " << b1 << ", " << embree_endl; + cout << " }"; + } + cout << "}"; + return cout; + } + + public: + vfloat<N> lower_x; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_x; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children. + + vfloat<N> lower_dx; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_dx; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_dy; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_dy; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_dz; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_dz; //!< Z dimension of upper bounds of all N children. + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h new file mode 100644 index 0000000000..46a81d7581 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h @@ -0,0 +1,107 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_aabb_mb.h" + +namespace embree +{ + /*! 
Aligned 4D Motion Blur Node */ + template<typename NodeRef, int N> + struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + using AABBNodeMB_t<NodeRef,N>::set; + + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + struct Create + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const + { + if (hasTimeSplits) + { + AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + else + { + AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + } + }; + + struct Set + { + template<typename BuildRecord> + __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const + { + if (likely(ref.isAABBNodeMB())) { + for (size_t i=0; i<num; i++) + ref.getAABBNodeMB()->set(i, children[i]); + } else { + for (size_t i=0; i<num; i++) + ref.getAABBNodeMB4D()->set(i, children[i]); + } + } + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_t = vfloat<N>(pos_inf); + upper_t = vfloat<N>(neg_inf); + AABBNodeMB_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) + { + AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds)); + lower_t[i] = tbounds.lower; + upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) { + AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i)); + } + + /*! returns time range for specified child */ + __forceinline BBox1f timeRange(size_t i) const { + return BBox1f(lower_t[i],upper_t[i]); + } + + /*! 
stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) + { + cout << "AABBNodeMB4D {" << embree_endl; + for (size_t i=0; i<N; i++) + { + const BBox3fa b0 = n.bounds0(i); + const BBox3fa b1 = n.bounds1(i); + cout << " child" << i << " { " << embree_endl; + cout << " bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl; + cout << " bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl; + cout << " time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl; + cout << " }"; + } + cout << "}"; + return cout; + } + + public: + vfloat<N> lower_t; //!< time dimension of lower bounds of all N children + vfloat<N> upper_t; //!< time dimension of upper bounds of all N children + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_base.h b/thirdparty/embree/kernels/bvh/bvh_node_base.h new file mode 100644 index 0000000000..a5570a7b9e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_base.h @@ -0,0 +1,43 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_ref.h" + +namespace embree +{ + + /*! BVHN Base Node */ + template<typename NodeRef, int N> + struct BaseNode_t + { + /*! Clears the node. */ + __forceinline void clear() + { + for (size_t i=0; i<N; i++) + children[i] = NodeRef::emptyNode; + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! verifies the node */ + __forceinline bool verify() const + { + for (size_t i=0; i<N; i++) { + if (child(i) == NodeRef::emptyNode) { + for (; i<N; i++) { + if (child(i) != NodeRef::emptyNode) + return false; + } + break; + } + } + return true; + } + + NodeRef children[N]; //!< Pointer to the N children (can be a node or leaf) + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_obb.h b/thirdparty/embree/kernels/bvh/bvh_node_obb.h new file mode 100644 index 0000000000..e6b500691e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_obb.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Node with unaligned bounds */ + template<typename NodeRef, int N> + struct OBBNode_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const { + node.ungetAABBNode()->setRef(i,child); + node.ungetAABBNode()->setBounds(i,bounds); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + naabb.l.vx = Vec3fa(nan); + naabb.l.vy = Vec3fa(nan); + naabb.l.vz = Vec3fa(nan); + naabb.p = Vec3fa(nan); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box. 
*/ + __forceinline void setBounds(size_t i, const OBBox3fa& b) + { + assert(i < N); + + AffineSpace3fa space = b.space; + space.p -= b.bounds.lower; + space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; + + naabb.l.vx.x[i] = space.l.vx.x; + naabb.l.vx.y[i] = space.l.vx.y; + naabb.l.vx.z[i] = space.l.vx.z; + + naabb.l.vy.x[i] = space.l.vy.x; + naabb.l.vy.y[i] = space.l.vy.y; + naabb.l.vy.z[i] = space.l.vy.z; + + naabb.l.vz.x[i] = space.l.vz.x; + naabb.l.vz.y[i] = space.l.vz.y; + naabb.l.vz.z[i] = space.l.vz.z; + + naabb.p.x[i] = space.p.x; + naabb.p.y[i] = space.p.y; + naabb.p.z[i] = space.p.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent(size_t i) const { + assert(i<N); + const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]); + const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]); + const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n) + { + o << "UnAABBNode { " << n.naabb << " } " << embree_endl; + return o; + } + + public: + AffineSpace3vf<N> naabb; //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space) + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h new file mode 100644 index 0000000000..c06b1aea5e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h @@ -0,0 +1,90 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + template<typename NodeRef, int N> + struct OBBNodeMB_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const { + node.ungetAABBNodeMB()->setRef(i,child); + node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt)); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + space0 = one; + //b0.lower = b0.upper = Vec3fa(nan); + b1.lower = b1.upper = Vec3fa(nan); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets space and bounding boxes. */ + __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) { + setBounds(i,space,lbounds.bounds0,lbounds.bounds1); + } + + /*! Sets space and bounding boxes. 
*/ + __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c) + { + assert(i < N); + + AffineSpace3fa space = s0; + space.p -= a.lower; + Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); + space = AffineSpace3fa::scale(scale)*space; + BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); + BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale); + + space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z; + space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z; + space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z; + space0.p .x[i] = space.p .x; space0.p .y[i] = space.p .y; space0.p .z[i] = space.p .z; + + /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z; + b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/ + + b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z; + b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent0(size_t i) const { + assert(i < N); + const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]); + const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]); + const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + public: + AffineSpace3vf<N> space0; + //BBox3vf<N> b0; // these are the unit bounds + BBox3vf<N> b1; + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h new file mode 100644 index 0000000000..2afc8c98e7 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h @@ -0,0 +1,265 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN Quantized Node */ + template<int N> + struct __aligned(8) QuantizedBaseNode_t + { + typedef unsigned char T; + static const T MIN_QUAN = 0; + static const T MAX_QUAN = 255; + + /*! Clears the node. */ + __forceinline void clear() { + for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN; + for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN; + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x), + madd(scale.y,(float)lower_y[i],start.y), + madd(scale.z,(float)lower_z[i],start.z)); + const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x), + madd(scale.y,(float)upper_y[i],start.y), + madd(scale.z,(float)upper_z[i],start.z)); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. 
*/ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + static __forceinline void init_dim(const vfloat<N> &lower, + const vfloat<N> &upper, + T lower_quant[N], + T upper_quant[N], + float &start, + float &scale) + { + /* quantize bounds */ + const vbool<N> m_valid = lower != vfloat<N>(pos_inf); + const float minF = reduce_min(lower); + const float maxF = reduce_max(upper); + float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); + float decode_scale = diff / float(MAX_QUAN); + if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero + assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); + const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; + vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); + vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); + + /* lower/upper correction */ + vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower; + vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper; + ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); + iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); + + /* disable invalid lanes */ + ilower = select(m_valid,ilower,MAX_QUAN); + iupper = select(m_valid,iupper,MIN_QUAN); + + /* store as uchar to memory */ + vint<N>::store(lower_quant,ilower); + vint<N>::store(upper_quant,iupper); + start = minF; + scale = decode_scale; + +#if defined(DEBUG) + vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); + vfloat<N> extract_upper( vint<N>::loadu(upper_quant) ); + vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF); + vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF); + assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); + assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); +#endif + } + + __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) + { + init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); + init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); + init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); + } + + __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } + +#if defined(__AVX512F__) // KNL + __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } +#endif + __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } + + __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } + + __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } + + __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } + + __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } + + __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } + + template <int M> + __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } + +#if defined(__AVX512F__) 
+ __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } + __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } + __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } +#endif + + union { + struct { + T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children + T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children + T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children + T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children + T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children + T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children + }; + T all_planes[6*N]; + }; + + Vec3f start; + Vec3f scale; + + friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) + { + o << "QuantizedBaseNode { " << embree_endl; + o << " start " << n.start << embree_endl; + o << " scale " << n.scale << embree_endl; + o << " lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl; + o << " upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl; + o << " lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl; + o << " upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl; + o << " lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl; + o << " upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl; + o << "}" << embree_endl; + return o; + } + + }; + + template<typename NodeRef, int N> + struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> + { + using BaseNode_t<NodeRef,N>::children; + using QuantizedBaseNode_t<N>::lower_x; + using QuantizedBaseNode_t<N>::upper_x; + using QuantizedBaseNode_t<N>::lower_y; + using QuantizedBaseNode_t<N>::upper_y; + using QuantizedBaseNode_t<N>::lower_z; + using QuantizedBaseNode_t<N>::upper_z; + using QuantizedBaseNode_t<N>::start; + using QuantizedBaseNode_t<N>::scale; + using QuantizedBaseNode_t<N>::init_dim; + + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + struct Create2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const + { + __aligned(64) AABBNode_t<NodeRef,N> node; + node.clear(); + for (size_t i=0; i<n; i++) { + node.setBounds(i,children[i].bounds()); + } + QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment); + qnode->init(node); + + return (size_t)qnode | NodeRef::tyQuantizedNode; + } + }; + + struct Set2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + QuantizedNode_t* node = ref.quantizedNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + return ref; + } + }; + + __forceinline void init(AABBNode_t<NodeRef,N>& node) + { + for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode; + init_dim(node); + } + + }; + + /*! 
BVHN Quantized Node */ + template<int N> + struct __aligned(8) QuantizedBaseNodeMB_t + { + QuantizedBaseNode_t<N> node0; + QuantizedBaseNode_t<N> node1; + + /*! Clears the node. */ + __forceinline void clear() { + node0.clear(); + node1.clear(); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + BBox3fa bounds0 = node0.bounds(i); + BBox3fa bounds1 = node1.bounds(i); + bounds0.extend(bounds1); + return bounds0; + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + __forceinline vbool<N> validMask() const { return node0.validMask(); } + + template<typename T> + __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } + + + template<int M> + __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); } + + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_ref.h b/thirdparty/embree/kernels/bvh/bvh_node_ref.h new file mode 100644 index 0000000000..6f6da758de --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_ref.h @@ -0,0 +1,242 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/alloc.h" +#include "../common/accel.h" +#include "../common/device.h" +#include "../common/scene.h" +#include "../geometry/primitive.h" +#include "../common/ray.h" + +namespace embree +{ + /* BVH node reference with bounds */ + template<typename NodeRef> + struct BVHNodeRecord + { + __forceinline BVHNodeRecord() {} + 
__forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {} + + NodeRef ref; + BBox3fx bounds; + }; + + template<typename NodeRef> + struct BVHNodeRecordMB + { + __forceinline BVHNodeRecordMB() {} + __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {} + + NodeRef ref; + LBBox3fa lbounds; + }; + + template<typename NodeRef> + struct BVHNodeRecordMB4D + { + __forceinline BVHNodeRecordMB4D() {} + __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {} + + NodeRef ref; + LBBox3fa lbounds; + BBox1f dt; + }; + + template<typename NodeRef, int N> struct BaseNode_t; + template<typename NodeRef, int N> struct AABBNode_t; + template<typename NodeRef, int N> struct AABBNodeMB_t; + template<typename NodeRef, int N> struct AABBNodeMB4D_t; + template<typename NodeRef, int N> struct OBBNode_t; + template<typename NodeRef, int N> struct OBBNodeMB_t; + template<typename NodeRef, int N> struct QuantizedNode_t; + template<typename NodeRef, int N> struct QuantizedNodeMB_t; + + /*! Pointer that points to a node or a list of primitives */ + template<int N> + struct NodeRefPtr + { + //template<int NN> friend class BVHN; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! highest address bit is used as barrier for some algorithms */ + static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1)); + + /*! Masks the bits that store the number of items per leaf. */ + static const size_t align_mask = byteAlignment-1; + static const size_t items_mask = byteAlignment-1; + + /*! different supported node types */ + static const size_t tyAABBNode = 0; + static const size_t tyAABBNodeMB = 1; + static const size_t tyAABBNodeMB4D = 6; + static const size_t tyOBBNode = 2; + static const size_t tyOBBNodeMB = 3; + static const size_t tyQuantizedNode = 5; + static const size_t tyLeaf = 8; + + /*! Empty node */ + static const size_t emptyNode = tyLeaf; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0); + static const size_t popRay = (((size_t)-1) & (~items_mask)) | (tyLeaf+1); + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = items_mask-tyLeaf; + + /*! Default constructor */ + __forceinline NodeRefPtr () {} + + /*! Construction from integer */ + __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {} + + /*! Cast to size_t */ + __forceinline operator size_t() const { return ptr; } + + /*! Sets the barrier bit. */ + __forceinline void setBarrier() { +#if defined(__64BIT__) + assert(!isBarrier()); + ptr |= barrier_mask; +#else + assert(false); +#endif + } + + /*! Clears the barrier bit. */ + __forceinline void clearBarrier() { +#if defined(__64BIT__) + ptr &= ~barrier_mask; +#else + assert(false); +#endif + } + + /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */ + __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; } + + /*! checks if this is a leaf */ + __forceinline size_t isLeaf() const { return ptr & tyLeaf; } + + /*! returns node type */ + __forceinline int type() const { return ptr & (size_t)align_mask; } + + /*! 
checks if this is a node */ + __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; } + + /*! checks if this is a motion blur node */ + __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; } + + /*! checks if this is a 4D motion blur node */ + __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; } + + /*! checks if this is a node with unaligned bounding boxes */ + __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; } + + /*! checks if this is a motion blur node with unaligned bounding boxes */ + __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; } + + /*! checks if this is a quantized node */ + __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; } + + /*! Encodes a node */ + static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB4D); + } + + /*! Encodes an unaligned node */ + static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) { + return NodeRefPtr((size_t) node | tyOBBNode); + } + + /*! Encodes an unaligned motion blur node */ + static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) { + return NodeRefPtr((size_t) node | tyOBBNodeMB); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) { + assert(!((size_t)tri & align_mask)); + assert(num <= maxLeafBlocks); + return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks))); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) { + assert(!((size_t)ptr & align_mask)); + return NodeRefPtr((size_t)ptr | (tyLeaf+ty)); + } + + /*! returns base node pointer */ + __forceinline BaseNode_t<NodeRefPtr,N>* baseNode() + { + assert(!isLeaf()); + return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); + } + __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const + { + assert(!isLeaf()); + return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); + } + + /*! returns node pointer */ + __forceinline AABBNode_t<NodeRefPtr,N>* getAABBNode() { assert(isAABBNode()); return ( AABBNode_t<NodeRefPtr,N>*)ptr; } + __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; } + + /*! returns motion blur node pointer */ + __forceinline AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() { assert(isAABBNodeMB() || isAABBNodeMB4D()); return ( AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! 
returns 4D motion blur node pointer */ + __forceinline AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() { assert(isAABBNodeMB4D()); return ( AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned node pointer */ + __forceinline OBBNode_t<NodeRefPtr,N>* ungetAABBNode() { assert(isOBBNode()); return ( OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned motion blur node pointer */ + __forceinline OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() { assert(isOBBNodeMB()); return ( OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns quantized node pointer */ + __forceinline QuantizedNode_t<NodeRefPtr,N>* quantizedNode() { assert(isQuantizedNode()); return ( QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); } + __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); } + + /*! returns leaf pointer */ + __forceinline char* leaf(size_t& num) const { + assert(isLeaf()); + num = (ptr & (size_t)items_mask)-tyLeaf; + return (char*)(ptr & ~(size_t)align_mask); + } + + /*! clear all bit flags */ + __forceinline void clearFlags() { + ptr &= ~(size_t)align_mask; + } + + /*! 
returns the wideness */ + __forceinline size_t getN() const { return N; } + + public: + size_t ptr; + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.cpp b/thirdparty/embree/kernels/bvh/bvh_refit.cpp new file mode 100644 index 0000000000..bf5c8538ba --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_refit.cpp @@ -0,0 +1,247 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_refit.h" +#include "bvh_statistics.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + namespace isa + { + static const size_t SINGLE_THREAD_THRESHOLD = 4*1024; + + template<int N> + __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b) + { + size_t sa = *(size_t*)&a->node()->lower_x; + size_t sb = *(size_t*)&b->node()->lower_x; + return sa < sb; + } + + template<int N> + BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds) + : bvh(bvh), leafBounds(leafBounds), numSubTrees(0) + { + } + + template<int N> + void BVHNRefitter<N>::refit() + { + if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) { + bvh->bounds = LBBox3fa(recurse_bottom(bvh->root)); + } + else + { + BBox3fa subTreeBounds[MAX_NUM_SUB_TREES]; + numSubTrees = 0; + gather_subtree_refs(bvh->root,numSubTrees,0); + if (numSubTrees) + parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + NodeRef& ref = subTrees[i]; + subTreeBounds[i] = recurse_bottom(ref); + } + }); + + numSubTrees = 0; + bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0)); + } + } + + template<int N> + void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + subTrees[subtrees++] = ref; + return; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + for (size_t i=0; i<N; i++) { + NodeRef& child = node->child(i); + if (unlikely(child == BVH::emptyNode)) continue; + gather_subtree_refs(child,subtrees,depth+1); + } + } + } + + template<int N> + BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + assert(subTrees[subtrees] == ref); + return subTreeBounds[subtrees++]; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + BBox3fa bounds[N]; + + for (size_t i=0; i<N; i++) + { + NodeRef& child = node->child(i); + + if (unlikely(child == BVH::emptyNode)) + bounds[i] = BBox3fa(empty); + else + bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1); + } + + BBox3vf<N> boundsT = transpose<N>(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge<N>(bounds); + } + else + return leafBounds.leafBounds(ref); + } + + // ========================================================= + // ========================================================= + // ========================================================= + + + 
template<int N> + BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref) + { + /* this is a leaf node */ + if (unlikely(ref.isLeaf())) + return leafBounds.leafBounds(ref); + + /* recurse if this is an internal node */ + AABBNode* node = ref.getAABBNode(); + + /* enable exclusive prefetch for >= AVX platforms */ +#if defined(__AVX__) + BVH::prefetchW(ref); +#endif + BBox3fa bounds[N]; + + for (size_t i=0; i<N; i++) + if (unlikely(node->child(i) == BVH::emptyNode)) + { + bounds[i] = BBox3fa(empty); + } + else + bounds[i] = recurse_bottom(node->child(i)); + + /* AOS to SOA transform */ + BBox3vf<N> boundsT = transpose<N>(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge<N>(bounds); + } + + template<int N, typename Mesh, typename Primitive> + BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode) + : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*(typename BVHNRefitter<N>::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {} + + template<int N, typename Mesh, typename Primitive> + void BVHNRefitT<N,Mesh,Primitive>::clear() + { + if (builder) + builder->clear(); + } + + template<int N, typename Mesh, typename Primitive> + void BVHNRefitT<N,Mesh,Primitive>::build() + { + if (mesh->topologyChanged(topologyVersion)) { + topologyVersion = mesh->getTopologyVersion(); + builder->build(); + } + else + refitter->refit(); + } + + template class BVHNRefitter<4>; +#if defined(__AVX__) + template class BVHNRefitter<8>; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH4Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH8Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new 
BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif + +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } +#endif +#endif + + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.h b/thirdparty/embree/kernels/bvh/bvh_refit.h new file mode 100644 index 0000000000..09bb3d8da5 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_refit.h @@ -0,0 +1,95 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNRefitter + { + public: + + /*! Type shortcuts */ + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + struct LeafBoundsInterface { + virtual const BBox3fa leafBounds(NodeRef& ref) const = 0; + }; + + public: + + /*! Constructor. */ + BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds); + + /*! 
refits the BVH */ + void refit(); + + private: + /* single-threaded subtree extraction based on BVH depth */ + void gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth = 0); + + /* single-threaded top-level refit */ + BBox3fa refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth = 0); + + /* single-threaded subtree refit */ + BBox3fa recurse_bottom(NodeRef& ref); + + public: + BVH* bvh; //!< BVH to refit + const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves + + static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4 : (N==8) ? 3 : 3; + static const size_t MAX_NUM_SUB_TREES = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH + size_t numSubTrees; + NodeRef subTrees[MAX_NUM_SUB_TREES]; + }; + + template<int N, typename Mesh, typename Primitive> + class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface + { + public: + + /*! Type shortcuts */ + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + public: + BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode); + + virtual void build(); + + virtual void clear(); + + virtual const BBox3fa leafBounds (NodeRef& ref) const + { + size_t num; char* prim = ref.leaf(num); + if (unlikely(ref == BVH::emptyNode)) return empty; + + BBox3fa bounds = empty; + for (size_t i=0; i<num; i++) + bounds.extend(((Primitive*)prim)[i].update(mesh)); + return bounds; + } + + private: + BVH* bvh; + std::unique_ptr<Builder> builder; + std::unique_ptr<BVHNRefitter<N>> refitter; + Mesh* mesh; + unsigned int topologyVersion; + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree/kernels/bvh/bvh_rotate.cpp new file mode 100644 index 0000000000..460bd60c62 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_rotate.cpp @@ -0,0 +1,127 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_rotate.h" + +namespace embree +{ + namespace isa + { + /*! Computes half surface area of box. */ + __forceinline float halfArea3f(const BBox<vfloat4>& box) { + const vfloat4 d = box.size(); + const vfloat4 a = d*shuffle<1,2,0,3>(d); + return a[0]+a[1]+a[2]; + } + + size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth) + { + /*! nothing to rotate if we reached a leaf node. */ + if (parentRef.isBarrier()) return 0; + if (parentRef.isLeaf()) return 0; + AABBNode* parent = parentRef.getAABBNode(); + + /*! rotate all children first */ + vint4 cdepth; + for (size_t c=0; c<4; c++) + cdepth[c] = (int)rotate(parent->child(c),depth+1); + + /* compute current areas of all children */ + vfloat4 sizeX = parent->upper_x-parent->lower_x; + vfloat4 sizeY = parent->upper_y-parent->lower_y; + vfloat4 sizeZ = parent->upper_z-parent->lower_z; + vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ); + + /*! get node bounds */ + BBox<vfloat4> child1_0,child1_1,child1_2,child1_3; + parent->bounds(child1_0,child1_1,child1_2,child1_3); + + /*! Find best rotation. We pick a first child (child1) and a sub-child + (child2child) of a different second child (child2), and swap child1 + and child2child. We perform the best such swap. */ + float bestArea = 0; + size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; + for (size_t c2=0; c2<4; c2++) + { + /*! 
ignore leaf nodes as we cannot descent into them */ + if (parent->child(c2).isBarrier()) continue; + if (parent->child(c2).isLeaf()) continue; + AABBNode* child2 = parent->child(c2).getAABBNode(); + + /*! transpose child bounds */ + BBox<vfloat4> child2c0,child2c1,child2c2,child2c3; + child2->bounds(child2c0,child2c1,child2c2,child2c3); + + /*! put child1_0 at each child2 position */ + float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3)); + float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3)); + float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3)); + float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0)); + vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03); + vfloat4 min0 = vreduce_min(cost0); + int pos0 = (int)bsf(movemask(min0 == cost0)); + + /*! put child1_1 at each child2 position */ + float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3)); + float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3)); + float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3)); + float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1)); + vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13); + vfloat4 min1 = vreduce_min(cost1); + int pos1 = (int)bsf(movemask(min1 == cost1)); + + /*! put child1_2 at each child2 position */ + float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3)); + float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3)); + float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3)); + float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2)); + vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23); + vfloat4 min2 = vreduce_min(cost2); + int pos2 = (int)bsf(movemask(min2 == cost2)); + + /*! put child1_3 at each child2 position */ + float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3)); + float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3)); + float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3)); + float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3)); + vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33); + vfloat4 min3 = vreduce_min(cost3); + int pos3 = (int)bsf(movemask(min3 == cost3)); + + /*! find best other child */ + vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]); + int pos[4] = { pos0,pos1,pos2,pos3 }; + const size_t mbd = BVH4::maxBuildDepth; + vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints + valid &= vint4(int(c2)) != vint4(step); + if (none(valid)) continue; + size_t c1 = select_min(valid,area0123); + float area = area0123[c1]; + if (c1 == c2) continue; // can happen if bounds are NANs + + /*! accept a swap when it reduces cost and is not swapping a node with itself */ + if (area < bestArea) { + bestArea = area; + bestChild1 = c1; + bestChild2 = c2; + bestChild2Child = pos[c1]; + } + } + + /*! if we did not find a swap that improves the SAH then do nothing */ + if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth); + + /*! perform the best found tree rotation */ + AABBNode* child2 = parent->child(bestChild2).getAABBNode(); + AABBNode::swap(parent,bestChild1,child2,bestChild2Child); + parent->setBounds(bestChild2,child2->bounds()); + AABBNode::compact(parent); + AABBNode::compact(child2); + + /*! 
This returned depth is conservative as the child that was + * pulled up in the tree could have been on the critical path. */ + cdepth[bestChild1]++; // bestChild1 was pushed down one level + return 1+reduce_max(cdepth); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_rotate.h b/thirdparty/embree/kernels/bvh/bvh_rotate.h new file mode 100644 index 0000000000..61ef64a679 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_rotate.h @@ -0,0 +1,37 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNRotate + { + typedef typename BVHN<N>::NodeRef NodeRef; + + public: + static const bool enabled = false; + + static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; } + static __forceinline void restructure(NodeRef ref, size_t depth = 1) {} + }; + + /* BVH4 tree rotations */ + template<> + class BVHNRotate<4> + { + typedef BVH4::AABBNode AABBNode; + typedef BVH4::NodeRef NodeRef; + + public: + static const bool enabled = true; + + static size_t rotate(NodeRef parentRef, size_t depth = 1); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp new file mode 100644 index 0000000000..d857ff7d95 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp @@ -0,0 +1,168 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_statistics.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + template<int N> + BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh) + { + double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea()); + stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f)); + } + + template<int N> + std::string BVHNStatistics<N>::str() + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << " primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl; + size_t totalBytes = stat.bytes(bvh); + double totalSAH = stat.sah(bvh); + stream << " total : sah = " << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), "; + stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl; + if (stat.statAABBNodes.numNodes ) stream << " getAABBNodes : " << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodes.numNodes ) stream << " ungetAABBNodes : " << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB.numNodes ) stream << " getAABBNodesMB : " << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB4D.numNodes) stream << " getAABBNodesMB4D : " << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodesMB.numNodes) stream << " ungetAABBNodesMB : " << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statQuantizedNodes.numNodes ) stream << " quantizedNodes : " << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " leaves : " << 
stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " histogram : " << stat.statLeaf.histToString() << std::endl; + return stream.str(); + } + + template<int N> + typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1) + { + Statistics s; + assert(t0t1.size() > 0.0f); + double dt = max(0.0f,t0t1.size()); + if (node.isAABBNode()) + { + AABBNode* n = node.getAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extend(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodes.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodes.numNodes++; + s.statAABBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNode()) + { + OBBNode* n = node.ungetAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodes.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodes.numNodes++; + s.statOBBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB()) + { + AABBNodeMB* n = node.getAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1)); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB.numNodes++; + s.statAABBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB4D()) + { + AABBNodeMB4D* n = node.getAABBNodeMB4D(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); + assert(!t0t1i.empty()); + const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i); + Statistics s = statistics(n->child(i),Ai,t0t1i); + s.statAABBNodesMB4D.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB4D.numNodes++; + s.statAABBNodesMB4D.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNodeMB()) + { + OBBNodeMB* n = node.ungetAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent0(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodesMB.numNodes++; + s.statOBBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isQuantizedNode()) + { + QuantizedNode* n = node.quantizedNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statQuantizedNodes.numChildren++; + return s; + }, Statistics::add); + s.statQuantizedNodes.numNodes++; + s.statQuantizedNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isLeaf()) + { + size_t num; const char* tri = node.leaf(num); + if (num) + { + for (size_t i=0; i<num; i++) + { + const size_t bytes = bvh->primTy->getBytes(tri); + s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri); + s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri); + 
s.statLeaf.numBytes += bytes; + tri+=bytes; + } + s.statLeaf.numLeaves++; + s.statLeaf.numPrimBlocks += num; + s.statLeaf.leafSAH += dt*A*num; + if (num-1 < Statistics::LeafStat::NHIST) { + s.statLeaf.numPrimBlocksHistogram[num-1]++; + } + } + } + else { + // -- GODOT start -- + // throw std::runtime_error("not supported node type in bvh_statistics"); + abort(); + // -- GODOT end -- + } + return s; + } + +#if defined(__AVX__) + template class BVHNStatistics<8>; +#endif + +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) + template class BVHNStatistics<4>; +#endif +} diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.h b/thirdparty/embree/kernels/bvh/bvh_statistics.h new file mode 100644 index 0000000000..a28e115f1c --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.h @@ -0,0 +1,285 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include <sstream> + +namespace embree +{ + template<int N> + class BVHNStatistics + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::OBBNode OBBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + typedef typename BVH::OBBNodeMB OBBNodeMB; + typedef typename BVH::QuantizedNode QuantizedNode; + + typedef typename BVH::NodeRef NodeRef; + + struct Statistics + { + template<typename Node> + struct NodeStat + { + NodeStat ( double nodeSAH = 0, + size_t numNodes = 0, + size_t numChildren = 0) + : nodeSAH(nodeSAH), + numNodes(numNodes), + numChildren(numChildren) {} + + double sah(BVH* bvh) const { + return nodeSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes() const { + return numNodes*sizeof(Node); + } + + size_t size() const { + return numNodes; + } + + double fillRateNom () const { return double(numChildren); } + double fillRateDen () const { return double(numNodes*N); } + double fillRate () const { return fillRateNom()/fillRateDen(); } + + __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b) + { + return NodeStat(a.nodeSAH + b.nodeSAH, + a.numNodes+b.numNodes, + a.numChildren+b.numChildren); + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives); + return stream.str(); + } + + public: + double nodeSAH; + size_t numNodes; + size_t numChildren; + }; + + struct LeafStat + { + static const int NHIST = 8; + + LeafStat ( double leafSAH = 0.0f, + size_t numLeaves = 0, + size_t numPrimsActive = 0, + size_t numPrimsTotal = 0, + size_t numPrimBlocks = 0, + size_t numBytes = 0) + : leafSAH(leafSAH), + numLeaves(numLeaves), + numPrimsActive(numPrimsActive), + numPrimsTotal(numPrimsTotal), + numPrimBlocks(numPrimBlocks), + numBytes(numBytes) + { + for (size_t i=0; i<NHIST; i++) + 
numPrimBlocksHistogram[i] = 0; + } + + double sah(BVH* bvh) const { + return leafSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes(BVH* bvh) const { + return numBytes; + } + + size_t size() const { + return numLeaves; + } + + double fillRateNom (BVH* bvh) const { return double(numPrimsActive); } + double fillRateDen (BVH* bvh) const { return double(numPrimsTotal); } + double fillRate (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); } + + __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b) + { + LeafStat stat(a.leafSAH + b.leafSAH, + a.numLeaves+b.numLeaves, + a.numPrimsActive+b.numPrimsActive, + a.numPrimsTotal+b.numPrimsTotal, + a.numPrimBlocks+b.numPrimBlocks, + a.numBytes+b.numBytes); + for (size_t i=0; i<NHIST; i++) { + stat.numPrimBlocksHistogram[i] += a.numPrimBlocksHistogram[i]; + stat.numPrimBlocksHistogram[i] += b.numPrimBlocksHistogram[i]; + } + return stat; + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives); + return stream.str(); + } + + std::string histToString() const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + for (size_t i=0; i<NHIST; i++) + stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves) << "% "; + return stream.str(); + } + + public: + double leafSAH; //!< SAH of the leaves only + size_t numLeaves; //!< Number of leaf nodes. + size_t numPrimsActive; //!< Number of active primitives ( + size_t numPrimsTotal; //!< Number of active and inactive primitives + size_t numPrimBlocks; //!< Number of primitive blocks. + size_t numBytes; //!< Number of bytes of leaves. 
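+ // numPrimBlocksHistogram[i] counts the leaves that contain exactly i+1 primitive blocks; statistics() increments bucket num-1 for a leaf holding num blocks.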
+ size_t numPrimBlocksHistogram[8]; + }; + + public: + Statistics (size_t depth = 0, + LeafStat statLeaf = LeafStat(), + NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(), + NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(), + NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(), + NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(), + NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(), + NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>()) + + : depth(depth), + statLeaf(statLeaf), + statAABBNodes(statAABBNodes), + statOBBNodes(statOBBNodes), + statAABBNodesMB(statAABBNodesMB), + statAABBNodesMB4D(statAABBNodesMB4D), + statOBBNodesMB(statOBBNodesMB), + statQuantizedNodes(statQuantizedNodes) {} + + double sah(BVH* bvh) const + { + return statLeaf.sah(bvh) + + statAABBNodes.sah(bvh) + + statOBBNodes.sah(bvh) + + statAABBNodesMB.sah(bvh) + + statAABBNodesMB4D.sah(bvh) + + statOBBNodesMB.sah(bvh) + + statQuantizedNodes.sah(bvh); + } + + size_t bytes(BVH* bvh) const { + return statLeaf.bytes(bvh) + + statAABBNodes.bytes() + + statOBBNodes.bytes() + + statAABBNodesMB.bytes() + + statAABBNodesMB4D.bytes() + + statOBBNodesMB.bytes() + + statQuantizedNodes.bytes(); + } + + size_t size() const + { + return statLeaf.size() + + statAABBNodes.size() + + statOBBNodes.size() + + statAABBNodesMB.size() + + statAABBNodesMB4D.size() + + statOBBNodesMB.size() + + statQuantizedNodes.size(); + } + + double fillRate (BVH* bvh) const + { + double nom = statLeaf.fillRateNom(bvh) + + statAABBNodes.fillRateNom() + + statOBBNodes.fillRateNom() + + statAABBNodesMB.fillRateNom() + + statAABBNodesMB4D.fillRateNom() + + statOBBNodesMB.fillRateNom() + + statQuantizedNodes.fillRateNom(); + double den = statLeaf.fillRateDen(bvh) + + statAABBNodes.fillRateDen() + + statOBBNodes.fillRateDen() + + statAABBNodesMB.fillRateDen() + + statAABBNodesMB4D.fillRateDen() + + statOBBNodesMB.fillRateDen() + + statQuantizedNodes.fillRateDen(); + return nom/den; + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b ) + { + return Statistics(max(a.depth,b.depth), + a.statLeaf + b.statLeaf, + a.statAABBNodes + b.statAABBNodes, + a.statOBBNodes + b.statOBBNodes, + a.statAABBNodesMB + b.statAABBNodesMB, + a.statAABBNodesMB4D + b.statAABBNodesMB4D, + a.statOBBNodesMB + b.statOBBNodesMB, + a.statQuantizedNodes + b.statQuantizedNodes); + } + + static Statistics add ( const Statistics& a, const Statistics& b ) { + return a+b; + } + + public: + size_t depth; + LeafStat statLeaf; + NodeStat<AABBNode> statAABBNodes; + NodeStat<OBBNode> statOBBNodes; + NodeStat<AABBNodeMB> statAABBNodesMB; + NodeStat<AABBNodeMB4D> statAABBNodesMB4D; + NodeStat<OBBNodeMB> statOBBNodesMB; + NodeStat<QuantizedNode> statQuantizedNodes; + }; + + public: + + /* Constructor gathers statistics. */ + BVHNStatistics (BVH* bvh); + + /*! 
Convert statistics into a string */ + std::string str(); + + double sah() const { + return stat.sah(bvh); + } + + size_t bytesUsed() const { + return stat.bytes(bvh); + } + + private: + Statistics statistics(NodeRef node, const double A, const BBox1f dt); + + private: + BVH* bvh; + Statistics stat; + }; + + typedef BVHNStatistics<4> BVH4Statistics; + typedef BVHNStatistics<8> BVH8Statistics; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser1.h b/thirdparty/embree/kernels/bvh/bvh_traverser1.h new file mode 100644 index 0000000000..8ce01b57f5 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_traverser1.h @@ -0,0 +1,466 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "node_intersector1.h" +#include "../common/stack_item.h" + +#define NEW_SORTING_CODE 1 + +namespace embree +{ + namespace isa + { + /*! BVH regular node traversal for single rays. */ + template<int N, int types> + class BVHNNodeTraverser1Hit; + +#if defined(__AVX512VL__) // SKX + + template<int N> + __forceinline void isort_update(vint<N> &dist, const vint<N> &d) + { + const vint<N> dist_shift = align_shift_right<N-1>(dist,dist); + const vboolf<N> m_geq = d >= dist; + const vboolf<N> m_geq_shift = m_geq << 1; + dist = select(m_geq,d,dist); + dist = select(m_geq_shift,dist_shift,dist); + } + + template<int N> + __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) { + dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero))); + } + + __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { + return toScalar(permutex2var((__m256i)index,n0,n1)); + } + + __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { + return toScalar(permute(n,index)); + } + +#endif + + /* Specialization for BVH4. */ + template<int types> + class BVHNNodeTraverser1Hit<4, types> + { + typedef BVH4 BVH; + typedef BVH4::NodeRef NodeRef; + typedef BVH4::BaseNode BaseNode; + + + public: + /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */ + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat4& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! 
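hit distances are stored and compared as raw unsigned-int bit patterns, which preserves the ordering of the non-negative float values involved;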
two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } + +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + StackItemT<NodeRef>::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif + } + + /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */ + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat4& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + + /* Specialization for BVH8. 
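Two code paths follow: a portable one that pushes hit children onto the stack and sorts them there, and, when AVX-512VL is available, traverseClosestHitAVX512VL8, which packs each hit distance with its child index ((asInt(tNear) & 0xfffffff8) | i) so that comparing the packed integers orders children by distance while permuteExtract() recovers the child pointer from the low bits; five or more hits are handled by a small insertion-sort-style network (isort_update) over these keys.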
*/ + template<int types> + class BVHNNodeTraverser1Hit<8, types> + { + typedef BVH8 BVH; + typedef BVH8::NodeRef NodeRef; + typedef BVH8::BaseNode BaseNode; + +#if defined(__AVX512VL__) + template<class NodeRef, class BaseNode> + static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]); + const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]); + vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); + distance_i = vint8::compact((int)mask,distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + + mask &= mask-1; + if (likely(mask == 0)) return; + + /* 2 hits: order A0 B0 */ + const vint8 d0(distance_i); + const vint8 d1(shuffle<1>(distance_i)); + cur = permuteExtract(d1,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A0 = min(d0, d1); + const vint8 dist_B0 = max(d0, d1); + assert(dist_A0[0] < dist_B0[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A0,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); + stackPtr++; + return; + } + + /* 3 hits: order A1 B1 C1 */ + + const vint8 d2(shuffle<2>(distance_i)); + cur = permuteExtract(d2,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A1 = min(dist_A0,d2); + const vint8 dist_tmp_B1 = max(dist_A0,d2); + const vint8 dist_B1 = min(dist_B0,dist_tmp_B1); + const vint8 dist_C1 = max(dist_B0,dist_tmp_B1); + assert(dist_A1[0] < dist_B1[0]); + assert(dist_B1[0] < dist_C1[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A1,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear); + stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear); + stackPtr+=2; + return; + } + + /* 4 hits: order A2 B2 C2 D2 */ + + const vint8 d3(shuffle<3>(distance_i)); + cur = permuteExtract(d3,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A2 = min(dist_A1,d3); + const vint8 dist_tmp_B2 = max(dist_A1,d3); + const vint8 dist_B2 = min(dist_B1,dist_tmp_B2); + const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2); + const vint8 dist_C2 = min(dist_C1,dist_tmp_C2); + const vint8 dist_D2 = max(dist_C1,dist_tmp_C2); + assert(dist_A2[0] < dist_B2[0]); + assert(dist_B2[0] < dist_C2[0]); + assert(dist_C2[0] < dist_D2[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A2,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear); + stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear); + stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1); + *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear); + stackPtr+=3; + return; + } + + /* >=5 hits: reverse to descending order for writing to stack */ + + distance_i = align_shift_right<3>(distance_i,distance_i); + const size_t hits = 4 + popcnt(mask); + vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert + + isort_quick_update<8>(dist,dist_A2); + isort_quick_update<8>(dist,dist_B2); + isort_quick_update<8>(dist,dist_C2); + isort_quick_update<8>(dist,dist_D2); + + do { + + distance_i = 
align_shift_right<1>(distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + const vint8 new_dist(permute(distance_i,vint8(zero))); + mask &= mask-1; + isort_update<8>(dist,new_dist); + + } while(mask); + + for (size_t i=0; i<7; i++) + assert(dist[i+0]>=dist[i+1]); + + for (size_t i=0;i<hits-1;i++) + { + stackPtr->ptr = permuteExtract(dist,n0,n1); + *(float*)&stackPtr->dist = permuteExtract(dist,tNear); + dist = align_shift_right<1>(dist,dist); + stackPtr++; + } + cur = permuteExtract(dist,n0,n1); + } +#endif + + public: + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); +#if defined(__AVX512VL__) + traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#else + + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; + return; + } + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; + /*! fallback case if more than 4 children are hit */ + StackItemT<NodeRef>* stackFirst = stackPtr; + stackPtr+=4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; + const vint4 s((size_t)c,(size_t)d); + *(vint4*)stackPtr++ = s; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! 
three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + /*! fallback case if more than 4 children are hit */ + StackItemT<NodeRef>* stackFirst = stackPtr-4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif +#endif + } + + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h new file mode 100644 index 0000000000..852981e69d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h @@ -0,0 +1,149 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" + +namespace embree +{ + namespace isa + { + template<int N, int types> + class BVHNNodeTraverserStreamHitCoherent + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + + public: + template<class T> + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t& m_trav_active, + const vbool<N>& vmask, + const vfloat<N>& tNear, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + const size_t r0 = bscf(mask); + assert(r0 < 8); + cur = node->child(r0); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r0]; + assert(cur != BVH::emptyNode); + if (unlikely(mask == 0)) return; + + const unsigned int* const tNear_i = (unsigned int*)&tNear; + + /*! 
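each deferred stack entry records the far child together with its parent node and m_trav_active ray mask, so the subtree can later be resumed with the correct set of active rays;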
two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + unsigned int d0 = tNear_i[r0]; + const size_t r1 = bscf(mask); + assert(r1 < 8); + NodeRef c1 = node->child(r1); + BVHN<N>::prefetch(c1,types); + unsigned int d1 = tNear_i[r1]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + if (d0 < d1) { + assert(tNear[r1] >= 0.0f); + stackPtr->mask = tMask[r1]; + stackPtr->parent = parent; + stackPtr->child = c1; + stackPtr++; + cur = c0; + m_trav_active = tMask[r0]; + return; + } + else { + assert(tNear[r0] >= 0.0f); + stackPtr->mask = tMask[r0]; + stackPtr->parent = parent; + stackPtr->child = c0; + stackPtr++; + cur = c1; + m_trav_active = tMask[r1]; + return; + } + } + + /*! slow path for more than two hits */ + size_t hits = movemask(vmask); + const vint<N> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<N>(step), 0); + const vint<N> dist_i_sorted = usort_descending(dist_i); + const vint<N> sorted_index = dist_i_sorted & 7; + + size_t i = 0; + for (;;) + { + const unsigned int index = sorted_index[i]; + assert(index < 8); + cur = node->child(index); + m_trav_active = tMask[index]; + assert(m_trav_active); + BVHN<N>::prefetch(cur,types); + bscf(hits); + if (unlikely(hits==0)) break; + i++; + assert(cur != BVH::emptyNode); + assert(tNear[index] >= 0.0f); + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + + template<class T> + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t& m_trav_active, + const vbool<N>& vmask, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector.h b/thirdparty/embree/kernels/bvh/node_intersector.h new file mode 100644 index 0000000000..25edaf295d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector.h @@ -0,0 +1,31 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + struct NearFarPrecalculations + { + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + + __forceinline NearFarPrecalculations() {} + + __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N) + { + const size_t size = sizeof(float)*N; + nearX = (dir.x < 0.0f) ? 1*size : 0*size; + nearY = (dir.y < 0.0f) ? 3*size : 2*size; + nearZ = (dir.z < 0.0f) ? 
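/* a node stores its bounds as lower_x, upper_x, lower_y, upper_y, lower_z, upper_z, each sizeof(float)*N bytes wide; near* is the byte offset of the plane the ray reaches first on that axis and far* = near* ^ size selects the opposite plane */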
5*size : 4*size; + farX = nearX ^ size; + farY = nearY ^ size; + farZ = nearZ ^ size; + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h new file mode 100644 index 0000000000..1ec4fc63fc --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector1.h @@ -0,0 +1,1403 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + struct TravRayBase; + + /* Base (without tnear and tfar) */ + template<int N> + struct TravRayBase<N,false> + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const Vec3fa ray_rdir = rcp_safe(ray_dir); + org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); + rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z); +#if defined(__AVX2__) || defined(__ARM_NEON) + const Vec3fa ray_org_rdir = ray_org*ray_rdir; + org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#endif + nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); + nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); + nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); + farX = nearX ^ sizeof(vfloat<N>); + farY = nearY ^ sizeof(vfloat<N>); + farZ = nearZ ^ sizeof(vfloat<N>); + } + + template<int K> + __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + size_t flip = sizeof(vfloat<N>)) + { + org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); +#if defined(__AVX2__) || defined(__ARM_NEON) + org_rdir = org*rdir; +#endif + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf<N> org, dir, rdir; +#if defined(__AVX2__) || defined(__ARM_NEON) + Vec3vf<N> org_rdir; +#endif + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Base (without tnear and tfar) */ + template<int N> + struct TravRayBase<N,true> + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const float round_down = 1.0f-3.0f*float(ulp); + const float round_up = 1.0f+3.0f*float(ulp); + const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir); + const Vec3fa ray_rdir_near = round_down*ray_rdir; + const Vec3fa ray_rdir_far = round_up *ray_rdir; + org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); + rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); + rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); + + nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); + nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); + nearZ = ray_rdir_near.z >= 0.0f ? 
4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); + farX = nearX ^ sizeof(vfloat<N>); + farY = nearY ^ sizeof(vfloat<N>); + farZ = nearZ ^ sizeof(vfloat<N>); + } + + template<int K> + __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + size_t flip = sizeof(vfloat<N>)) + { + const vfloat<N> round_down = 1.0f-3.0f*float(ulp); + const vfloat<N> round_up = 1.0f+3.0f*float(ulp); + org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir_near = round_down*Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + rdir_far = round_up *Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf<N> org, dir, rdir_near, rdir_far; + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Full (with tnear and tfar) */ + template<int N, bool robust> + struct TravRay : TravRayBase<N,robust> + { + __forceinline TravRay() {} + + __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar) + : TravRayBase<N,robust>(ray_org, ray_dir), + tnear(ray_tnear), tfar(ray_tfar) {} + + template<int K> + __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + float ray_tnear, float ray_tfar, + size_t flip = sizeof(vfloat<N>)) + { + TravRayBase<N,robust>::template init<K>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip); + tnear = ray_tnear; tfar = ray_tfar; + } + + vfloat<N> tnear; + vfloat<N> tfar; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // Point Query structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + struct TravPointQuery + { + __forceinline TravPointQuery() {} + + __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad) + { + org = Vec3vf<N>(query_org.x, query_org.y, query_org.z); + rad = Vec3vf<N>(query_rad.x, query_rad.y, query_rad.z); + } + + __forceinline vfloat<N> const& tfar() const { + return rad.x; + } + + Vec3vf<N> org, rad; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // point query + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t pointQuerySphereDistAndMask( + const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, + vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ) + { + const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool<N> vmask = dist <= query.tfar()*query.tfar(); + const vbool<N> valid = minX <= maxX; + return movemask(vmask) & movemask(valid); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); + const vfloat<N> minY = vfloat<N>::load((float*)((const 
char*)&node->lower_y)); + const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z)); + const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x)); + const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y)); + const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z)); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); + const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y); + const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z); + const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x); + const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y); + const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z); + const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); + const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0])); + const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0])); + const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0])); + const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0])); + const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0])); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeSphere(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> start_x(node->start.x); + const vfloat<N> scale_x(node->scale.x); + const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> start_y(node->start.y); + const vfloat<N> scale_y(node->scale.y); + const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> start_z(node->start.z); + const vfloat<N> scale_z(node->scale.z); + const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N> minX = node->dequantizeLowerX(time); + const 
vfloat<N> maxX = node->dequantizeUpperX(time); + const vfloat<N> minY = node->dequantizeLowerY(time); + const vfloat<N> maxY = node->dequantizeUpperY(time); + const vfloat<N> minZ = node->dequantizeLowerZ(time); + const vfloat<N> maxZ = node->dequantizeUpperZ(time); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryAABBDistAndMask( + const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, + vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ) + { + const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool<N> valid = minX <= maxX; + const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | + (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) | + (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z)); + return movemask(vmask) & movemask(valid); + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); + const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y)); + const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z)); + const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x)); + const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y)); + const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z)); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); + const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y); + const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z); + const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x); + const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y); + const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z); + const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); + const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0])); + const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0])); + const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0])); + 
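// (entry [0] of each bound vector holds the bounds at the start of the time segment and entry [6] their linear change, so madd(time,[6],[0]) reconstructs the child bounds at the query time) +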
const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0])); + const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0])); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeAABB(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat<N> start_x(node->start.x); + const vfloat<N> scale_x(node->scale.x); + const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> start_y(node->start.y); + const vfloat<N> scale_y(node->scale.y); + const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> start_z(node->start.z); + const vfloat<N> scale_z(node->scale.z); + const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat<N> minX = node->dequantizeLowerX(time); + const vfloat<N> maxX = node->dequantizeUpperX(time); + const vfloat<N> minY = node->dequantizeLowerY(time); + const vfloat<N> maxY = node->dequantizeUpperY(time); + const vfloat<N> minZ = node->dequantizeLowerZ(time); + const vfloat<N> maxZ = node->dequantizeUpperZ(time); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, 
bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist); + + template<> + __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist) + { +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__aarch64__) + const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); + const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist) + { +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = 
msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,true>& ray, vfloat<N>& dist) + { + const vfloat<N> tNearX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tFarX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* 
pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<N)-1); +#elif defined(__AVX512F__) // SKX + const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + const vbool<N> vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const 
char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); + const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + const size_t mask = movemask(tNear <= tFar); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); +#if defined (__AVX2__) || defined(__ARM_NEON) + const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__AVX2__) && !defined(__AVX512F__) + const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); + const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); +#else + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); +#endif + vbool<N> vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + 
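// Note on the robust intersectors in this file: TravRayBase<N,true> pre-scales the reciprocal direction by
+ // 1.0f-3.0f*ulp for the entry planes (rdir_near) and 1.0f+3.0f*ulp for the exit planes (rdir_far). In scalar
+ // form each slab test therefore computes roughly
+ //   tNear = (lower - org) * rdir_near;   tFar = (upper - org) * rdir_far;
+ // with the interval widened just enough that floating-point rounding cannot make the tNear <= tFar test
+ // reject a box the ray actually hits. +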
////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); + const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + vbool<N> vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist); + + template<> + __forceinline size_t intersectNode<4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,false>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = 
msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,true>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask & mvalid; + } + + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,false>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 
scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,true>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= 
tFar; + const size_t mask = movemask(vmask); + + dist = tNear; + return mask & mvalid; + } + + +#endif + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + const vboolf<N> mvalid = node->validMask(); + const vfloat<N> lower_x = node->dequantizeLowerX(time); + const vfloat<N> upper_x = node->dequantizeUpperX(time); + const vfloat<N> lower_y = node->dequantizeLowerY(time); + const vfloat<N> upper_y = node->dequantizeUpperY(time); + const vfloat<N> lower_z = node->dequantizeLowerZ(time); + const vfloat<N> upper_z = node->dequantizeUpperZ(time); +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat<N> tminX = mini(tNearX,tFarX); + const vfloat<N> tmaxX = maxi(tNearX,tFarX); + const vfloat<N> tminY = mini(tNearY,tFarY); + const vfloat<N> tmaxY = maxi(tNearY,tFarY); + const vfloat<N> tminZ = mini(tNearZ,tFarZ); + const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ); + const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) // SKX + const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + const vboolf<N> mvalid = node->validMask(); + const vfloat<N> lower_x = node->dequantizeLowerX(time); + const vfloat<N> upper_x = node->dequantizeUpperX(time); + const vfloat<N> lower_y = node->dequantizeLowerY(time); + const vfloat<N> upper_y = node->dequantizeUpperY(time); + const vfloat<N> lower_z = node->dequantizeLowerZ(time); + const vfloat<N> upper_z = node->dequantizeUpperZ(time); + const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat<N> tminX = mini(tNearX,tFarX); + const vfloat<N> tmaxX = maxi(tNearX,tFarX); + const vfloat<N> tminY = mini(tNearY,tFarY); + const vfloat<N> tmaxY = maxi(tNearY,tFarY); + const vfloat<N> tminZ = mini(tNearZ,tFarZ); + const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ); + const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if 
defined(__AVX512F__) // SKX + const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist) + { + const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir); + //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir; + const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir); + const Vec3vf<N> org = xfmPoint(node->naabb,ray.org); + const Vec3vf<N> tLowerXYZ = org * nrdir; // (Vec3fa(zero) - org) * rdir; + const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir; + + const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp)); + } + const vbool<N> vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,robust>& ray, const float time, vfloat<N>& dist) + { + const AffineSpace3vf<N> xfm = node->space0; + const Vec3vf<N> b0_lower = zero; + const Vec3vf<N> b0_upper = one; + const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time)); + const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time)); + + const BBox3vf<N> bounds(lower,upper); + const Vec3vf<N> dir = xfmVector(xfm,ray.dir); + const Vec3vf<N> rdir = rcp_safe(dir); + const Vec3vf<N> org = xfmPoint(xfm,ray.org); + + const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir; + const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir; + + const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp)); + } + const vbool<N> vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in point query raversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! 
Computes traversal information for N nodes with 1 point query */ + template<int N, int types> + struct BVHNNodePointQuerySphere1; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeSphereMB4D(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_QN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNQuantizedBaseNodePointQuerySphere1 + { + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + return pointQueryNodeSphere(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + return 
pointQueryNodeSphere(node,query,time,dist); + } + }; + + /*! Computes traversal information for N nodes with 1 point query */ + template<int N, int types> + struct BVHNNodePointQueryAABB1; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeAABBMB4D(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_QN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNQuantizedBaseNodePointQueryAABB1 + { + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + return pointQueryNodeAABB(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + return 
pointQueryNodeAABB(node,query,time,dist); + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with 1 ray */ + template<int N, int types, bool robust> + struct BVHNNodeIntersector1; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNode(), ray, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4D<N>(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4DRobust<N>(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1_UN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNode(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1_UN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_UN2, false> + { + static __forceinline bool 
intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4D(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4DRobust(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_QN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_QN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + /*! 
Intersects N nodes with K rays */ + template<int N, bool robust> + struct BVHNQuantizedBaseNodeIntersector1; + + template<int N> + struct BVHNQuantizedBaseNodeIntersector1<N, false> + { + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,false>& ray, vfloat<N>& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + template<int N> + struct BVHNQuantizedBaseNodeIntersector1<N, true> + { + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,true>& ray, vfloat<N>& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h new file mode 100644 index 0000000000..1f7215e5df --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h @@ -0,0 +1,241 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Frustum structure used in hybrid and stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /* + Optimized frustum test. We calculate t=(p-org)/dir in ray/box + intersection. We assume the rays are split by octant, thus + dir intervals are either positive or negative in each + dimension. 
+ + Case 1: dir.min >= 0 && dir.max >= 0: + t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + + Case 2: dir.min < 0 && dir.max < 0: + t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + */ + + template<bool robust> + struct Frustum; + + /* Fast variant */ + template<> + struct Frustum<false> + { + __forceinline Frustum() {} + + template<int K> + __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + + min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); + max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); + + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template<int K> + __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + + Vec3fa min_org_rdir; + Vec3fa max_org_rdir; + + float min_dist; + float max_dist; + }; + + typedef Frustum<false> FrustumFast; + + /* Robust variant */ + template<> + struct Frustum<true> + { + __forceinline Frustum() {} + + template<int K> + __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, 
pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + + min_org = select(pos_rdir, reduced_max_org, reduced_min_org); + max_org = select(pos_rdir, reduced_min_org, reduced_max_org); + + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template<int K> + __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + + Vec3fa min_org; + Vec3fa max_org; + + float min_dist; + float max_dist; + }; + + typedef Frustum<true> FrustumRobust; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node, + const FrustumFast& frustum, vfloat<N>& dist) + { + const vfloat<N> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat<N> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat<N> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat<N> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); + + const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x)); + const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y)); + const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z)); + const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x)); + const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y)); + const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z)); + + const vfloat<N> fmin = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist)); + dist = fmin; + const vfloat<N> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist)); + const vbool<N> vmask_node_hit = fmin <= fmax; + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + 
////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node, + const FrustumRobust& frustum, vfloat<N>& dist) + { + const vfloat<N> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat<N> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat<N> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat<N> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); + + const vfloat<N> fminX = (bminX - vfloat<N>(frustum.min_org.x)) * vfloat<N>(frustum.min_rdir.x); + const vfloat<N> fminY = (bminY - vfloat<N>(frustum.min_org.y)) * vfloat<N>(frustum.min_rdir.y); + const vfloat<N> fminZ = (bminZ - vfloat<N>(frustum.min_org.z)) * vfloat<N>(frustum.min_rdir.z); + const vfloat<N> fmaxX = (bmaxX - vfloat<N>(frustum.max_org.x)) * vfloat<N>(frustum.max_rdir.x); + const vfloat<N> fmaxY = (bmaxY - vfloat<N>(frustum.max_org.y)) * vfloat<N>(frustum.max_rdir.y); + const vfloat<N> fmaxZ = (bmaxZ - vfloat<N>(frustum.max_org.z)) * vfloat<N>(frustum.max_rdir.z); + + const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const float round_up = 1.0f+2.0f*float(ulp); + const vfloat<N> fmin = max(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist)); + dist = fmin; + const vfloat<N> fmax = min(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist)); + const vbool<N> vmask_node_hit = (round_down*fmin <= round_up*fmax); + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h new file mode 100644 index 0000000000..d5498fc5db --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h @@ -0,0 +1,805 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int K, bool robust> + struct TravRayK; + + /* Fast variant */ + template<int K> + struct TravRayK<K, false> + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = rcp_safe(ray_dir); +#if defined(__AVX2__) || defined(__ARM_NEON) + org_rdir = org * rdir; +#endif + + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), 
vint<K>(5*size)); + } + } + + Vec3vf<K> org; + Vec3vf<K> dir; + Vec3vf<K> rdir; +#if defined(__AVX2__) || defined(__ARM_NEON) + Vec3vf<K> org_rdir; +#endif + Vec3vi<K> nearXYZ; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKFast = TravRayK<K, false>; + + /* Robust variant */ + template<int K> + struct TravRayK<K, true> + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); + + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); + } + } + + Vec3vf<K> org; + Vec3vf<K> dir; + Vec3vf<K> rdir; + Vec3vi<K> nearXYZ; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKRobust = TravRayK<K, true>; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i, + const TravRayKFast<K>& ray, vfloat<K>& dist) + + { + #if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, 
ray.tfar)); + #else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i, + const TravRayKRobust<K>& ray, vfloat<K>& dist) + { + // FIXME: use per instruction rounding for AVX512 + const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX512F__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, 
lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); +#if defined(__AVX512F__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); +#else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); +#endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + +#if defined(__AVX512F__) // SKX + if (K == 16) + { + const vfloat<K> lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat<K> lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const 
typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = 
(vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat<K> lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K, bool robust> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i, + const TravRayK<K,robust>& ray, vfloat<K>& dist) + { + const AffineSpace3vf<K> naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]), + Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]), + Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]), + Vec3f(node->naabb.p .x[i], node->naabb.p .y[i], node->naabb.p .z[i])); + + const Vec3vf<K> dir = xfmVector(naabb, ray.dir); + const Vec3vf<K> nrdir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1? + const Vec3vf<K> org = xfmPoint(naabb, ray.org); + + const vfloat<K> lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir; + const vfloat<K> lclipMinY = org.y * nrdir.y; + const vfloat<K> lclipMinZ = org.z * nrdir.z; + const vfloat<K> lclipMaxX = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir; + const vfloat<K> lclipMaxY = lclipMinY - nrdir.y; + const vfloat<K> lclipMaxZ = lclipMinZ - nrdir.z; + + vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp)); + } + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K, bool robust> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i, + const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const AffineSpace3vf<K> xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]), + Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]), + Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]), + Vec3f(node->space0.p .x[i], node->space0.p .y[i], node->space0.p .z[i])); + + const Vec3vf<K> b0_lower = zero; + const Vec3vf<K> b0_upper = one; + const Vec3vf<K> b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]); + const Vec3vf<K> 
b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]); + const Vec3vf<K> lower = lerp(b0_lower, b1_lower, time); + const Vec3vf<K> upper = lerp(b0_upper, b1_upper, time); + + const Vec3vf<K> dir = xfmVector(xfm, ray.dir); + const Vec3vf<K> rdir = rcp_safe(dir); + const Vec3vf<K> org = xfmPoint(xfm, ray.org); + + const vfloat<K> lclipMinX = (lower.x - org.x) * rdir.x; + const vfloat<K> lclipMinY = (lower.y - org.y) * rdir.y; + const vfloat<K> lclipMinZ = (lower.z - org.z) * rdir.z; + const vfloat<K> lclipMaxX = (upper.x - org.x) * rdir.x; + const vfloat<K> lclipMaxY = (upper.y - org.y) * rdir.y; + const vfloat<K> lclipMaxZ = (upper.z - org.z) * rdir.z; + + vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp)); + } + + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + + ////////////////////////////////////////////////////////////////////////////////////// + // QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i, + const TravRayK<K,false>& ray, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat<N> lower_x = node->dequantizeLowerX(); + const vfloat<N> upper_x = node->dequantizeUpperX(); + const vfloat<N> lower_y = node->dequantizeLowerY(); + const vfloat<N> upper_y = node->dequantizeUpperY(); + const vfloat<N> lower_z = node->dequantizeLowerZ(); + const vfloat<N> upper_z = node->dequantizeUpperZ(); + + #if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, 
lclipMaxZ)); + #if defined(__AVX512F__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i, + const TravRayK<K,true>& ray, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat<N> lower_x = node->dequantizeLowerX(); + const vfloat<N> upper_x = node->dequantizeUpperX(); + const vfloat<N> lower_y = node->dequantizeLowerY(); + const vfloat<N> upper_y = node->dequantizeUpperY(); + const vfloat<N> lower_z = node->dequantizeLowerZ(); + const vfloat<N> upper_z = node->dequantizeUpperZ(); + + const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat<K> lower_x = node->template dequantizeLowerX<K>(i,time); + const vfloat<K> upper_x = node->template dequantizeUpperX<K>(i,time); + const vfloat<K> lower_y = node->template dequantizeLowerY<K>(i,time); + const vfloat<K> upper_y = node->template dequantizeUpperY<K>(i,time); + const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time); + const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + #endif + const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const 
vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat<K> lower_x = node->template dequantizeLowerX<K>(i,time); + const vfloat<K> upper_x = node->template dequantizeUpperX<K>(i,time); + const vfloat<K> lower_y = node->template dequantizeLowerY<K>(i,time); + const vfloat<K> upper_y = node->template dequantizeUpperY<K>(i,time); + const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time); + const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time); + + const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with K rays */ + template<int N, int K, int types, bool robust> + struct BVHNNodeIntersectorK; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1, false> + { + /* vmask is both an input and an output parameter! Its initial value should be the parent node + hit mask, which is used for correctly computing the current hit mask. The parent hit mask + is actually required only for motion blur node intersections (because different rays may + have different times), so for regular nodes vmask is simply overwritten. 
*/ + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + 
const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + + /*! Intersects N nodes with K rays */ + template<int N, int K, bool robust> + struct BVHNQuantizedBaseNodeIntersectorK; + + template<int N, int K> + struct BVHNQuantizedBaseNodeIntersectorK<N, K, false> + { + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i, + const TravRayK<K,false>& ray, vfloat<K>& dist) + { + return intersectQuantizedNodeK<N,K>(node,i,ray,dist); + } + + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); + } + + }; + + template<int N, int K> + struct BVHNQuantizedBaseNodeIntersectorK<N, K, true> + { + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i, + const TravRayK<K,true>& ray, vfloat<K>& dist) + { + return intersectQuantizedNodeK<N,K>(node,i,ray,dist); + } + + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); + } + }; + + + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h new file mode 100644 index 0000000000..55b2c27231 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h @@ -0,0 +1,189 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used 
in stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int K, bool robust> + struct TravRayKStream; + + /* Fast variant */ + template<int K> + struct TravRayKStream<K, false> + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) + { + rdir = rcp_safe(ray_dir); + org_rdir = ray_org * rdir; + } + + Vec3vf<K> rdir; + Vec3vf<K> org_rdir; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKStreamFast = TravRayKStream<K, false>; + + /* Robust variant */ + template<int K> + struct TravRayKStream<K, true> + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) + { + rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); + org = ray_org; + } + + Vec3vf<K> rdir; + Vec3vf<K> org; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKStreamRobust = TravRayKStream<K, true>; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node, + const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + + const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); + const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); + const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); + const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); + const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); + const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); + const vfloat<N> rmin = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k])); + const vfloat<N> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k])); + + const vbool<N> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template<int N, int K> + __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf) + { + char* ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat<K> 
bminX = *(const float*)(ptr + nf.nearX); + const vfloat<K> bminY = *(const float*)(ptr + nf.nearY); + const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX); + const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); + const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); + + const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); + + const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear); + const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); + + const vbool<K> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node, + const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + + const vfloat<N> rminX = (bminX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]); + const vfloat<N> rminY = (bminY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]); + const vfloat<N> rminZ = (bminZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]); + const vfloat<N> rmaxX = (bmaxX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]); + const vfloat<N> rmaxY = (bmaxY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]); + const vfloat<N> rmaxZ = (bmaxZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]); + const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const vfloat<N> rmin = max(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k])); + const vfloat<N> rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k])); + + const vbool<N> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template<int N, int K> + __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf) + { + char *ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); + const vfloat<K> bminY = *(const float*)(ptr + nf.nearY); + const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX); + const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); + const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); + + const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x; + const vfloat<K> 
rminY = (bminY - ray.org.y) * ray.rdir.y; + const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z; + const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x; + const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y; + const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const vfloat<K> rmin = max(rminX, rminY, rminZ, vfloat<K>(ray.tnear)); + const vfloat<K> rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar)); + + const vbool<K> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + } +} diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h new file mode 100644 index 0000000000..cc4ea1805b --- /dev/null +++ b/thirdparty/embree/kernels/common/accel.h @@ -0,0 +1,556 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "point_query.h" +#include "context.h" + +namespace embree +{ + class Scene; + + /*! Base class for the acceleration structure data. */ + class AccelData : public RefCount + { + ALIGNED_CLASS_(16); + public: + enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 }; + + public: + AccelData (const Type type) + : bounds(empty), type(type) {} + + /*! notifies the acceleration structure about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears the acceleration structure data */ + virtual void clear() = 0; + + /*! returns normal bounds */ + __forceinline BBox3fa getBounds() const { + return bounds.bounds(); + } + + /*! returns bounds for some time */ + __forceinline BBox3fa getBounds(float t) const { + return bounds.interpolate(t); + } + + /*! returns linear bounds */ + __forceinline LBBox3fa getLinearBounds() const { + return bounds; + } + + /*! checks if acceleration structure is empty */ + __forceinline bool isEmpty() const { + return bounds.bounds0.lower.x == float(pos_inf); + } + + public: + LBBox3fa bounds; // linear bounds + Type type; + }; + + /*! Base class for all intersectable and buildable acceleration structures. */ + class Accel : public AccelData + { + ALIGNED_CLASS_(16); + public: + + struct Intersectors; + + /*! Type of collide function */ + typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr); + + /*! Type of point query function */ + typedef bool(*PointQueryFunc)(Intersectors* This, /*!< this pointer to accel */ + PointQuery* query, /*!< point query for lookup */ + PointQueryContext* context); /*!< point query context */ + + /*! Type of intersect function pointer for single rays. */ + typedef void (*IntersectFunc)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHit& ray, /*!< ray to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 4. */ + typedef void (*IntersectFunc4)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit4& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 8. */ + typedef void (*IntersectFunc8)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit8& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 16. 
*/ + typedef void (*IntersectFunc16)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit16& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHitN** ray, /*!< ray stream to intersect */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + + + /*! Type of occlusion function pointer for single rays. */ + typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */ + RTCRay& ray, /*!< ray to test occlusion */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 4. */ + typedef void (*OccludedFunc4) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay4& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 8. */ + typedef void (*OccludedFunc8) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay8& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 16. */ + typedef void (*OccludedFunc16) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay16& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayN** ray, /*!< ray stream to test occlusion */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + typedef void (*ErrorFunc) (); + + struct Collider + { + Collider (ErrorFunc error = nullptr) + : collide((CollideFunc)error), name(nullptr) {} + + Collider (CollideFunc collide, const char* name) + : collide(collide), name(name) {} + + operator bool() const { return name; } + + public: + CollideFunc collide; + const char* name; + }; + + struct Intersector1 + { + Intersector1 (ErrorFunc error = nullptr) + : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc intersect; + OccludedFunc occluded; + PointQueryFunc pointQuery; + const char* name; + }; + + struct Intersector4 + { + Intersector4 (ErrorFunc error = nullptr) + : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {} + + Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc4 intersect; + OccludedFunc4 occluded; + const char* name; + }; + + struct Intersector8 + { + Intersector8 (ErrorFunc error = nullptr) + : intersect((IntersectFunc8)error), 
occluded((OccludedFunc8)error), name(nullptr) {} + + Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc8 intersect; + OccludedFunc8 occluded; + const char* name; + }; + + struct Intersector16 + { + Intersector16 (ErrorFunc error = nullptr) + : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {} + + Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc16 intersect; + OccludedFunc16 occluded; + const char* name; + }; + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + struct Intersectors + { + Intersectors() + : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {} + + Intersectors (ErrorFunc error) + : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {} + + void print(size_t ident) + { + if (collider.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "collider = " << collider.name << std::endl; + } + if (intersector1.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector1 = " << intersector1.name << std::endl; + } + if (intersector4.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector4 = " << intersector4.name << std::endl; + } + if (intersector8.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector8 = " << intersector8.name << std::endl; + } + if (intersector16.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector16 = " << intersector16.name << std::endl; + } + if (intersectorN.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersectorN = " << intersectorN.name << std::endl; + } + } + + void select(bool filter) + { + if (intersector4_filter) { + if (filter) intersector4 = intersector4_filter; + else intersector4 = intersector4_nofilter; + } + if (intersector8_filter) { + if (filter) intersector8 = intersector8_filter; + else intersector8 = intersector8_nofilter; + } + if (intersector16_filter) { + if (filter) intersector16 = intersector16_filter; + else intersector16 = intersector16_nofilter; + } + if (intersectorN_filter) { + if (filter) intersectorN = intersectorN_filter; + else intersectorN = intersectorN_nofilter; + } + } + + __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) { + assert(intersector1.pointQuery); + return intersector1.pointQuery(this,query,context); + } + + /*! 
collides two scenes */ + __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) { + assert(collider.collide); + collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr); + } + + /*! Intersects a single ray with the scene. */ + __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) { + assert(intersector1.intersect); + intersector1.intersect(this,ray,context); + } + + /*! Intersects a packet of 4 rays with the scene. */ + __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) { + assert(intersector4.intersect); + intersector4.intersect(valid,this,ray,context); + } + + /*! Intersects a packet of 8 rays with the scene. */ + __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) { + assert(intersector8.intersect); + intersector8.intersect(valid,this,ray,context); + } + + /*! Intersects a packet of 16 rays with the scene. */ + __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) { + assert(intersector16.intersect); + intersector16.intersect(valid,this,ray,context); + } + + /*! Intersects a stream of N rays in SOA layout with the scene. */ + __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context) + { + assert(intersectorN.intersect); + intersectorN.intersect(this,rayN,N,context); + } + +#if defined(__SSE__) + __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) { + const vint<4> mask = valid.mask32(); + intersect4(&mask,(RTCRayHit4&)ray,context); + } +#endif +#if defined(__AVX__) + __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) { + const vint<8> mask = valid.mask32(); + intersect8(&mask,(RTCRayHit8&)ray,context); + } +#endif +#if defined(__AVX512F__) + __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) { + const vint<16> mask = valid.mask32(); + intersect16(&mask,(RTCRayHit16&)ray,context); + } +#endif + + template<int K> + __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context) + { + intersectN((RTCRayHitN**)rayN,N,context); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (RTCRay& ray, IntersectContext* context) { + assert(intersector1.occluded); + intersector1.occluded(this,ray,context); + } + + /*! Tests if a packet of 4 rays is occluded by the scene. */ + __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) { + assert(intersector4.occluded); + intersector4.occluded(valid,this,ray,context); + } + + /*! Tests if a packet of 8 rays is occluded by the scene. */ + __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) { + assert(intersector8.occluded); + intersector8.occluded(valid,this,ray,context); + } + + /*! Tests if a packet of 16 rays is occluded by the scene. */ + __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) { + assert(intersector16.occluded); + intersector16.occluded(valid,this,ray,context); + } + + /*! Tests if a stream of N rays in SOA layout is occluded by the scene. 
*/ + __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context) + { + assert(intersectorN.occluded); + intersectorN.occluded(this,rayN,N,context); + } + +#if defined(__SSE__) + __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) { + const vint<4> mask = valid.mask32(); + occluded4(&mask,(RTCRay4&)ray,context); + } +#endif +#if defined(__AVX__) + __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) { + const vint<8> mask = valid.mask32(); + occluded8(&mask,(RTCRay8&)ray,context); + } +#endif +#if defined(__AVX512F__) + __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) { + const vint<16> mask = valid.mask32(); + occluded16(&mask,(RTCRay16&)ray,context); + } +#endif + + template<int K> + __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context) + { + occludedN((RTCRayN**)rayN,N,context); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void intersect(RTCRay& ray, IntersectContext* context) { + occluded(ray, context); + } + + /*! Tests if a packet of K rays is occluded by the scene. */ + template<int K> + __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) { + occluded(valid, ray, context); + } + + /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */ + template<int K> + __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) { + occludedN(rayN, N, context); + } + + public: + AccelData* ptr; + void* leafIntersector; + Collider collider; + Intersector1 intersector1; + Intersector4 intersector4; + Intersector4 intersector4_filter; + Intersector4 intersector4_nofilter; + Intersector8 intersector8; + Intersector8 intersector8_filter; + Intersector8 intersector8_nofilter; + Intersector16 intersector16; + Intersector16 intersector16_filter; + Intersector16 intersector16_nofilter; + IntersectorN intersectorN; + IntersectorN intersectorN_filter; + IntersectorN intersectorN_nofilter; + }; + + public: + + /*! Construction */ + Accel (const AccelData::Type type) + : AccelData(type) {} + + /*! Construction */ + Accel (const AccelData::Type type, const Intersectors& intersectors) + : AccelData(type), intersectors(intersectors) {} + + /*! Virtual destructor */ + virtual ~Accel() {} + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! 
build acceleration structure */ + virtual void build () = 0; + + public: + Intersectors intersectors; + }; + +#define DEFINE_COLLIDER(symbol,collider) \ + Accel::Collider symbol() { \ + return Accel::Collider((Accel::CollideFunc)collider::collide, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR1(symbol,intersector) \ + Accel::Intersector1 symbol() { \ + return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \ + (Accel::OccludedFunc )intersector::occluded, \ + (Accel::PointQueryFunc)intersector::pointQuery,\ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR4(symbol,intersector) \ + Accel::Intersector4 symbol() { \ + return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \ + (Accel::OccludedFunc4)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR8(symbol,intersector) \ + Accel::Intersector8 symbol() { \ + return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \ + (Accel::OccludedFunc8)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR16(symbol,intersector) \ + Accel::Intersector16 symbol() { \ + return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \ + (Accel::OccludedFunc16)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTORN(symbol,intersector) \ + Accel::IntersectorN symbol() { \ + return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \ + (Accel::OccludedFuncN)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + + /* ray stream filter interface */ + typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit* _rayN, const size_t N, const size_t stride, IntersectContext* context); + typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context); + typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context); + typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context); + + typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay* _rayN, const size_t N, const size_t stride, IntersectContext* context); + typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context); + typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context); + typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context); + + struct RayStreamFilterFuncs + { + RayStreamFilterFuncs() + : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr), + occludedAOS(nullptr), occludedAOP(nullptr), occludedSOA(nullptr), occludedSOP(nullptr) {} + + RayStreamFilterFuncs(void (*ptr) ()) + : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr), + occludedAOS((occludedStreamAOS_func) ptr), occludedAOP((occludedStreamAOP_func) ptr), occludedSOA((occludedStreamSOA_func) ptr), occludedSOP((occludedStreamSOP_func) ptr) {} + + RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func 
intersectSOA, intersectStreamSOP_func intersectSOP, + occludedStreamAOS_func occludedAOS, occludedStreamAOP_func occludedAOP, occludedStreamSOA_func occludedSOA, occludedStreamSOP_func occludedSOP) + : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP), + occludedAOS(occludedAOS), occludedAOP(occludedAOP), occludedSOA(occludedSOA), occludedSOP(occludedSOP) {} + + public: + intersectStreamAOS_func intersectAOS; + intersectStreamAOP_func intersectAOP; + intersectStreamSOA_func intersectSOA; + intersectStreamSOP_func intersectSOP; + + occludedStreamAOS_func occludedAOS; + occludedStreamAOP_func occludedAOP; + occludedStreamSOA_func occludedSOA; + occludedStreamSOP_func occludedSOP; + }; + + typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)(); +} diff --git a/thirdparty/embree/kernels/common/accelinstance.h b/thirdparty/embree/kernels/common/accelinstance.h new file mode 100644 index 0000000000..c63ef998bd --- /dev/null +++ b/thirdparty/embree/kernels/common/accelinstance.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" +#include "builder.h" + +namespace embree +{ + class AccelInstance : public Accel + { + public: + AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors) + : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {} + + void immutable () { + builder.reset(nullptr); + } + + public: + void build () { + if (builder) builder->build(); + bounds = accel->bounds; + } + + void deleteGeometry(size_t geomID) { + if (accel ) accel->deleteGeometry(geomID); + if (builder) builder->deleteGeometry(geomID); + } + + void clear() { + if (accel) accel->clear(); + if (builder) builder->clear(); + } + + private: + std::unique_ptr<AccelData> accel; + std::unique_ptr<Builder> builder; + }; +} diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp new file mode 100644 index 0000000000..32a27c560a --- /dev/null +++ b/thirdparty/embree/kernels/common/acceln.cpp @@ -0,0 +1,232 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "acceln.h" +#include "ray.h" +#include "../../include/embree3/rtcore_ray.h" +#include "../../common/algorithms/parallel_for.h" + +namespace embree +{ + AccelN::AccelN() + : Accel(AccelData::TY_ACCELN), accels() {} + + AccelN::~AccelN() + { + for (size_t i=0; i<accels.size(); i++) + delete accels[i]; + } + + void AccelN::accels_add(Accel* accel) + { + assert(accel); + accels.push_back(accel); + } + + void AccelN::accels_init() + { + for (size_t i=0; i<accels.size(); i++) + delete accels[i]; + + accels.clear(); + } + + bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context) + { + bool changed = false; + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + changed |= This->accels[i]->intersectors.pointQuery(query,context); + return changed; + } + + void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect(ray,context); + } + + void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) + { + AccelN* This = 
(AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect4(valid,ray,context); + } + + void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect8(valid,ray,context); + } + + void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect16(valid,ray,context); + } + + void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersectN(ray,N,context); + } + + void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded(ray,context); + if (ray.tfar < 0.0f) break; + } + } + + void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded4(valid,ray,context); +#if defined(__SSE2__) + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + if (unlikely(none(valid0 & hit0))) break; +#endif + } + } + + void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded8(valid,ray,context); +#if defined(__SSE2__) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; +#endif + } + } + + void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded16(valid,ray,context); +#if defined(__SSE2__) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + vbool4 valid2 = asBool(((vint4*)valid)[2]); + vbool4 hit2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero); + vbool4 valid3 = asBool(((vint4*)valid)[3]); + vbool4 hit3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break; +#endif + } + } + + void AccelN::occludedN (Accel::Intersectors* 
This_in, RTCRayN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + size_t M = N; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.occludedN(ray,M,context); + } + + void AccelN::accels_print(size_t ident) + { + for (size_t i=0; i<accels.size(); i++) + { + for (size_t j=0; j<ident; j++) std::cout << " "; + std::cout << "accels[" << i << "]" << std::endl; + accels[i]->intersectors.print(ident+2); + } + } + + void AccelN::accels_immutable() + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->immutable(); + } + + void AccelN::accels_build () + { + /* reduce memory consumption */ + accels.shrink_to_fit(); + + /* build all acceleration structures in parallel */ + parallel_for (accels.size(), [&] (size_t i) { + accels[i]->build(); + }); + + /* create list of non-empty acceleration structures */ + bool valid1 = true; + bool valid4 = true; + bool valid8 = true; + bool valid16 = true; + for (size_t i=0; i<accels.size(); i++) { + valid1 &= (bool) accels[i]->intersectors.intersector1; + valid4 &= (bool) accels[i]->intersectors.intersector4; + valid8 &= (bool) accels[i]->intersectors.intersector8; + valid16 &= (bool) accels[i]->intersectors.intersector16; + } + + if (accels.size() == 1) { + type = accels[0]->type; // FIXME: should just assign entire Accel + bounds = accels[0]->bounds; + intersectors = accels[0]->intersectors; + } + else + { + type = AccelData::TY_ACCELN; + intersectors.ptr = this; + intersectors.intersector1 = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr); + intersectors.intersector4 = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr); + intersectors.intersector8 = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr); + intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr); + intersectors.intersectorN = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN"); + + /*! calculate bounds */ + bounds = empty; + for (size_t i=0; i<accels.size(); i++) + bounds.extend(accels[i]->bounds); + } + } + + void AccelN::accels_select(bool filter) + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->intersectors.select(filter); + } + + void AccelN::accels_deleteGeometry(size_t geomID) + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->deleteGeometry(geomID); + } + + void AccelN::accels_clear() + { + for (size_t i=0; i<accels.size(); i++) { + accels[i]->clear(); + } + } +} + diff --git a/thirdparty/embree/kernels/common/acceln.h b/thirdparty/embree/kernels/common/acceln.h new file mode 100644 index 0000000000..0445b2e811 --- /dev/null +++ b/thirdparty/embree/kernels/common/acceln.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" + +namespace embree +{ + /*! 
merges N acceleration structures together, by processing them in order */ + class AccelN : public Accel + { + public: + AccelN (); + ~AccelN(); + + public: + void accels_add(Accel* accel); + void accels_init(); + + public: + static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + + public: + static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context); + static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context); + static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context); + static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context); + static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context); + + public: + static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context); + static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context); + static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context); + static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context); + static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context); + + public: + void accels_print(size_t ident); + void accels_immutable(); + void accels_build (); + void accels_select(bool filter); + void accels_deleteGeometry(size_t geomID); + void accels_clear (); + + public: + std::vector<Accel*> accels; + }; +} diff --git a/thirdparty/embree/kernels/common/accelset.cpp b/thirdparty/embree/kernels/common/accelset.cpp new file mode 100644 index 0000000000..8c18f31776 --- /dev/null +++ b/thirdparty/embree/kernels/common/accelset.cpp @@ -0,0 +1,17 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "accelset.h" +#include "scene.h" + +namespace embree +{ + AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) + : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (ErrorFunc error) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} +} diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h new file mode 100644 index 0000000000..90b184a07b --- /dev/null +++ b/thirdparty/embree/kernels/common/accelset.h @@ -0,0 +1,248 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "builder.h" +#include "geometry.h" +#include "ray.h" +#include "hit.h" + +namespace embree +{ + struct IntersectFunctionNArguments; + struct OccludedFunctionNArguments; + + typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + + struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + 
ReportIntersectionFunc report; + }; + + struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportOcclusionFunc report; + }; + + /*! Base class for set of acceleration structures. */ + class AccelSet : public Geometry + { + public: + typedef RTCIntersectFunctionN IntersectFuncN; + typedef RTCOccludedFunctionN OccludedFuncN; + typedef void (*ErrorFunc) (); + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) ; + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name); + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + public: + + /*! construction */ + AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps); + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! build accel */ + virtual void build () = 0; + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid_non_empty(bounds(i,itime))) return false; + + return true; + } + + /*! Calculates the bounds of an item */ + __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const + { + BBox3fa box; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)itime; + args.bounds_o = (RTCBounds*)&box; + boundsFunc(&args); + return box; + } + + /*! calculates the linear bounds of the i'th item at the itime'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const + { + BBox3fa box[2]; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)(itime+0); + args.bounds_o = (RTCBounds*)&box[0]; + boundsFunc(&args); + args.timeStep = (unsigned int)(itime+1); + args.bounds_o = (RTCBounds*)&box[1]; + boundsFunc(&args); + return LBBox3fa(box[0],box[1]); + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid_non_empty(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds0; // use bounding box of first timestep to build BVH + return isvalid_non_empty(bounds); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! 
calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + public: + + /*! Intersects a single ray with the scene. */ + __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + int mask = -1; + IntersectFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + int mask = -1; + OccludedFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + /*! Intersects a packet of K rays with the scene. */ + template<int K> + __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + vint<K> mask = valid.mask32(); + IntersectFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if a packet of K rays is occluded by the scene. 
*/ + template<int K> + __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + vint<K> mask = valid.mask32(); + OccludedFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + public: + RTCBoundsFunction boundsFunc; + IntersectorN intersectorN; + }; + +#define DEFINE_SET_INTERSECTORN(symbol,intersector) \ + AccelSet::IntersectorN symbol() { \ + return AccelSet::IntersectorN(intersector::intersect, \ + intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } +} diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp new file mode 100644 index 0000000000..1a0e1aeed3 --- /dev/null +++ b/thirdparty/embree/kernels/common/alloc.cpp @@ -0,0 +1,79 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "../../common/sys/thread.h" + +namespace embree +{ + __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; + SpinLock FastAllocator::s_thread_local_allocators_lock; + std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators; + + struct fast_allocator_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic<size_t> numFailed; + std::unique_ptr<FastAllocator> alloc; + + fast_allocator_regression_test() + : RegressionTest("fast_allocator_regression_test"), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(fast_allocator_regression_test* This) + { + FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator(); + + size_t* ptrs[1000]; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + for (size_t i=0; i<1000; i++) { + ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32)); + *ptrs[i] = size_t(threadalloc.talloc0) + i; + } + for (size_t i=0; i<1000; i++) { + if (*ptrs[i] != size_t(threadalloc.talloc0) + i) + This->numFailed++; + } + This->barrier.wait(); + } + } + + bool run () + { + alloc = make_unique(new FastAllocator(nullptr,false)); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + alloc->reset(); + barrier.wait(); + barrier.wait(); + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + alloc = nullptr; + + return numFailed == 0; + } + }; + + fast_allocator_regression_test fast_allocator_regression; +} + + diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h new file mode 100644 index 0000000000..4458e35c24 --- /dev/null +++ b/thirdparty/embree/kernels/common/alloc.h @@ -0,0 +1,958 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "primref.h" + +namespace embree +{ + class FastAllocator + { + /*! 
maximum supported alignment */ + static const size_t maxAlignment = 64; + + /*! maximum allocation size */ + + /* default settings */ + //static const size_t defaultBlockSize = 4096; +#define maxAllocationSize size_t(2*1024*1024-maxAlignment) + + static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; + + public: + + struct ThreadLocal2; + enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE }; + + /*! Per thread structure holding the current memory block. */ + struct __aligned(64) ThreadLocal + { + ALIGNED_CLASS_(64); + public: + + /*! Constructor for usage with ThreadLocalData */ + __forceinline ThreadLocal (ThreadLocal2* parent) + : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} + + /*! initialize allocator */ + void init(FastAllocator* alloc) + { + ptr = nullptr; + cur = end = 0; + bytesUsed = 0; + bytesWasted = 0; + allocBlockSize = 0; + if (alloc) allocBlockSize = alloc->defaultBlockSize; + } + + /* Allocate aligned memory from the threads memory block. */ + __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) + { + /* bind the thread local allocator to the proper FastAllocator*/ + parent->bind(alloc); + + assert(align <= maxAlignment); + bytesUsed += bytes; + + /* try to allocate in local block */ + size_t ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* if allocation is too large allocate with parent allocator */ + if (4*bytes > allocBlockSize) { + return alloc->malloc(bytes,maxAlignment,false); + } + + /* get new partial block if allocation failed */ + size_t blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* get new full block if allocation failed */ + blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* should never happen as large allocations get handled specially above */ + assert(false); + return nullptr; + } + + + /*! returns amount of used bytes */ + __forceinline size_t getUsedBytes() const { return bytesUsed; } + + /*! returns amount of free bytes */ + __forceinline size_t getFreeBytes() const { return end-cur; } + + /*! returns amount of wasted bytes */ + __forceinline size_t getWastedBytes() const { return bytesWasted; } + + private: + ThreadLocal2* parent; + char* ptr; //!< pointer to memory block + size_t cur; //!< current location of the allocator + size_t end; //!< end of the memory block + size_t allocBlockSize; //!< block size for allocations + size_t bytesUsed; //!< number of total bytes allocated + size_t bytesWasted; //!< number of bytes wasted + }; + + /*! Two thread local structures. */ + struct __aligned(64) ThreadLocal2 + { + ALIGNED_CLASS_(64); + public: + + __forceinline ThreadLocal2() + : alloc(nullptr), alloc0(this), alloc1(this) {} + + /*! 
bind to fast allocator */ + __forceinline void bind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() == alloc_i) return; + Lock<SpinLock> lock(mutex); + //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind + if (alloc.load()) { + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + } + alloc0.init(alloc_i); + alloc1.init(alloc_i); + alloc.store(alloc_i); + alloc_i->join(this); + } + + /*! unbind to fast allocator */ + void unbind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() != alloc_i) return; + Lock<SpinLock> lock(mutex); + if (alloc.load() != alloc_i) return; // required as a different thread calls unbind + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + alloc0.init(nullptr); + alloc1.init(nullptr); + alloc.store(nullptr); + } + + public: + SpinLock mutex; //!< required as unbind is called from other threads + std::atomic<FastAllocator*> alloc; //!< parent allocator + ThreadLocal alloc0; + ThreadLocal alloc1; + }; + + FastAllocator (Device* device, bool osAllocation) + : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC), + primrefarray(device,0) + { + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + assert(!slotMutex[i].isLocked()); + } + } + + ~FastAllocator () { + clear(); + } + + /*! returns the device attached to this allocator */ + Device* getDevice() { + return device; + } + + void share(mvector<PrimRef>& primrefarray_i) { + primrefarray = std::move(primrefarray_i); + } + + void unshare(mvector<PrimRef>& primrefarray_o) + { + reset(); // this removes blocks that are allocated inside the shared primref array + primrefarray_o = std::move(primrefarray); + } + + /*! returns first fast thread local allocator */ + __forceinline ThreadLocal* _threadLocal() { + return &threadLocal2()->alloc0; + } + + void setOSallocation(bool flag) + { + atype = flag ? OS_MALLOC : ALIGNED_MALLOC; + } + + private: + + /*! returns both fast thread local allocators */ + __forceinline ThreadLocal2* threadLocal2() + { + ThreadLocal2* alloc = thread_local_allocator2; + if (alloc == nullptr) { + thread_local_allocator2 = alloc = new ThreadLocal2; + Lock<SpinLock> lock(s_thread_local_allocators_lock); + s_thread_local_allocators.push_back(make_unique(alloc)); + } + return alloc; + } + + public: + + __forceinline void join(ThreadLocal2* alloc) + { + Lock<SpinLock> lock(thread_local_allocators_lock); + thread_local_allocators.push_back(alloc); + } + + public: + + struct CachedAllocator + { + __forceinline CachedAllocator(void* ptr) + : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) + { + assert(ptr == nullptr); + } + + __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) + : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? 
&talloc->alloc0 : &talloc->alloc1) {} + + __forceinline operator bool () const { + return alloc != nullptr; + } + + __forceinline void* operator() (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { + return talloc1->malloc(alloc,bytes,align); + } + + public: + FastAllocator* alloc; + ThreadLocal* talloc0; + ThreadLocal* talloc1; + }; + + __forceinline CachedAllocator getCachedAllocator() { + return CachedAllocator(this,threadLocal2()); + } + + /*! Builder interface to create thread local allocator */ + struct Create + { + public: + __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} + __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } + + private: + FastAllocator* allocator; + }; + + void internal_fix_used_blocks() + { + /* move thread local blocks to global block list */ + for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + while (threadBlocks[i].load() != nullptr) { + Block* nextUsedBlock = threadBlocks[i].load()->next; + threadBlocks[i].load()->next = usedBlocks.load(); + usedBlocks = threadBlocks[i].load(); + threadBlocks[i] = nextUsedBlock; + } + threadBlocks[i] = nullptr; + } + } + + static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks + static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks + static const size_t mainAllocOverheadDynamic = 8; //! 20 means 12.5% allocation overhead through unfilled main alloc blocks + + /* calculates a single threaded threshold for the builders such + * that for small scenes the overhead of partly allocated blocks + * per thread is low */ + size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) + { + if (numPrimitives == 0 || bytesEstimated == 0) + return defaultThreshold; + + /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ + const size_t single_mode_factor = use_single_mode ? 1 : 2; + const size_t threadCount = TaskScheduler::threadCount(); + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; + + /* if we do not have to limit number of threads use optimal thresdhold */ + if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + return defaultThreshold; + + /* otherwise limit number of threads by calculating proper single thread threshold */ + else { + double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); + return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); + } + } + + __forceinline size_t alignSize(size_t i) { + return (i+127)/128*128; + } + + /*! initializes the grow size */ + __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) + { + /* we do not need single thread local allocator mode */ + use_single_mode = false; + + /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ + size_t mainAllocOverhead = fast ? 
mainAllocOverheadDynamic : mainAllocOverheadStatic; + size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); + growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); + + /* if we reached the maxAllocationSize for growSize, we can + * increase the number of allocation slots by still guaranteeing + * the mainAllocationOverhead */ + slotMask = 0x0; + + if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ + + /* set the thread local alloc block size */ + size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; + + /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ +#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL + const size_t threadCount = TaskScheduler::threadCount(); + const size_t single_mode_factor = use_single_mode ? 1 : 2; + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; + if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); + + /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ + else +#endif + defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); + + if (bytesEstimated == 0) { + maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size + defaultBlockSize = defaultBlockSizeSwitch; + } + log2_grow_size_scale = 0; + + if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; + if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; + if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; + if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; + if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; + if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; + if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; + } + + /*! initializes the allocator */ + void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) + { + internal_fix_used_blocks(); + /* distribute the allocation to multiple thread block slots */ + slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + if (bytesReserve == 0) bytesReserve = bytesAllocate; + freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); + estimatedSize = bytesEstimate; + initGrowSizeAndNumSlots(bytesEstimate,true); + } + + /*! initializes the allocator */ + void init_estimate(size_t bytesEstimate) + { + internal_fix_used_blocks(); + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + /* single allocator mode ? */ + estimatedSize = bytesEstimate; + //initGrowSizeAndNumSlots(bytesEstimate,false); + initGrowSizeAndNumSlots(bytesEstimate,false); + + } + + /*! 
frees state not required after build */ + __forceinline void cleanup() + { + internal_fix_used_blocks(); + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! resets the allocator, memory blocks get reused */ + void reset () + { + internal_fix_used_blocks(); + + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + + /* reset all used blocks and move them to begin of free block list */ + while (usedBlocks.load() != nullptr) { + usedBlocks.load()->reset_block(); + Block* nextUsedBlock = usedBlocks.load()->next; + usedBlocks.load()->next = freeBlocks.load(); + freeBlocks = usedBlocks.load(); + usedBlocks = nextUsedBlock; + } + + /* remove all shared blocks as they are re-added during build */ + freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); + + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + } + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! frees all allocated memory */ + __forceinline void clear() + { + cleanup(); + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; + if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + } + primrefarray.clear(); + } + + __forceinline size_t incGrowSizeScale() + { + size_t scale = log2_grow_size_scale.fetch_add(1)+1; + return size_t(1) << min(size_t(16),scale); + } + + /*! thread safe allocation of memory */ + void* malloc(size_t& bytes, size_t align, bool partial) + { + assert(align <= maxAlignment); + + while (true) + { + /* allocate using current block */ + size_t threadID = TaskScheduler::threadID(); + size_t slot = threadID & slotMask; + Block* myUsedBlocks = threadUsedBlocks[slot]; + if (myUsedBlocks) { + void* ptr = myUsedBlocks->malloc(device,bytes,align,partial); + if (ptr) return ptr; + } + + /* throw error if allocation is too large */ + if (bytes > maxAllocationSize) + throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large"); + + /* parallel block creation in case of no freeBlocks, avoids single global mutex */ + if (likely(freeBlocks.load() == nullptr)) + { + Lock<SpinLock> lock(slotMutex[slot]); + if (myUsedBlocks == threadUsedBlocks[slot]) { + const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); + const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); + assert(allocSize >= bytes); + threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! + // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. 
+ } + continue; + } + + /* if this fails allocate new block */ + { + Lock<SpinLock> lock(mutex); + if (myUsedBlocks == threadUsedBlocks[slot]) + { + if (freeBlocks.load() != nullptr) { + Block* nextFreeBlock = freeBlocks.load()->next; + freeBlocks.load()->next = usedBlocks; + __memory_barrier(); + usedBlocks = freeBlocks.load(); + threadUsedBlocks[slot] = freeBlocks.load(); + freeBlocks = nextFreeBlock; + } else { + const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); + usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! + } + } + } + } + } + + /*! add new block */ + void addBlock(void* ptr, ssize_t bytes) + { + Lock<SpinLock> lock(mutex); + const size_t sizeof_Header = offsetof(Block,data[0]); + void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); + size_t ofs = (size_t) aptr - (size_t) ptr; + bytes -= ofs; + if (bytes < 4096) return; // ignore empty or very small blocks + freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); + } + + /* special allocation only used from morton builder only a single time for each build */ + void* specialAlloc(size_t bytes) + { + assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); + return freeBlocks.load()->ptr(); + } + + struct Statistics + { + Statistics () + : bytesUsed(0), bytesFree(0), bytesWasted(0) {} + + Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) + : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} + + Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) + : bytesUsed(0), bytesFree(0), bytesWasted(0) + { + Block* usedBlocks = alloc->usedBlocks.load(); + Block* freeBlocks = alloc->freeBlocks.load(); + if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); + if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); + if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); + if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); + if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); + } + + std::string str(size_t numPrimitives) + { + std::stringstream str; + str.setf(std::ios::fixed, std::ios::floatfield); + str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); + return str.str(); + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b) + { + return Statistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted); + } + + size_t bytesAllocatedTotal() const { + return bytesUsed + bytesFree + bytesWasted; + } + + public: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + }; + + Statistics getStatistics(AllocationType atype, bool huge_pages = false) { + return Statistics(this,atype,huge_pages); + } + + size_t getUsedBytes() { + return bytesUsed; + } + + size_t getWastedBytes() { + return bytesWasted; + } + + struct AllStatistics + { + AllStatistics (FastAllocator* 
alloc) + + : bytesUsed(alloc->bytesUsed), + bytesFree(alloc->bytesFree), + bytesWasted(alloc->bytesWasted), + stat_all(alloc,ANY_TYPE), + stat_malloc(alloc,ALIGNED_MALLOC), + stat_4K(alloc,OS_MALLOC,false), + stat_2M(alloc,OS_MALLOC,true), + stat_shared(alloc,SHARED) {} + + AllStatistics (size_t bytesUsed, + size_t bytesFree, + size_t bytesWasted, + Statistics stat_all, + Statistics stat_malloc, + Statistics stat_4K, + Statistics stat_2M, + Statistics stat_shared) + + : bytesUsed(bytesUsed), + bytesFree(bytesFree), + bytesWasted(bytesWasted), + stat_all(stat_all), + stat_malloc(stat_malloc), + stat_4K(stat_4K), + stat_2M(stat_2M), + stat_shared(stat_shared) {} + + friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) + { + return AllStatistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted, + a.stat_all + b.stat_all, + a.stat_malloc + b.stat_malloc, + a.stat_4K + b.stat_4K, + a.stat_2M + b.stat_2M, + a.stat_shared + b.stat_shared); + } + + void print(size_t numPrimitives) + { + std::stringstream str0; + str0.setf(std::ios::fixed, std::ios::floatfield); + str0 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << " " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); + std::cout << str0.str() << std::endl; + + std::stringstream str1; + str1.setf(std::ios::fixed, std::ios::floatfield); + str1 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); + std::cout << str1.str() << std::endl; + + std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; + std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; + std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; + std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; + std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; + } + + private: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + Statistics stat_all; + Statistics stat_malloc; + Statistics stat_4K; + Statistics stat_2M; + Statistics stat_shared; + }; + + void print_blocks() + { + std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; + + std::cout << " used blocks = "; + if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + + std::cout << " free blocks = "; + if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + } + + private: + + struct Block + { + static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) + { + /* We avoid using os_malloc for small blocks as this could + * cause a risk of fragmenting the virtual address space and + * reach the limit of vm.max_map_count = 65k under Linux. 
*/ + if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize) + atype = ALIGNED_MALLOC; + + /* we need to additionally allocate some header */ + const size_t sizeof_Header = offsetof(Block,data[0]); + bytesAllocate = sizeof_Header+bytesAllocate; + bytesReserve = sizeof_Header+bytesReserve; + + /* consume full 4k pages with using os_malloc */ + if (atype == OS_MALLOC) { + bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + } + + /* either use alignedMalloc or os_malloc */ + void *ptr = nullptr; + if (atype == ALIGNED_MALLOC) + { + /* special handling for default block size */ + if (bytesAllocate == (2*PAGE_SIZE_2M)) + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + + /* give hint to transparently convert these pages to 2MB pages */ + const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); + os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block + os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); + os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block + + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + else + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + } + else if (atype == OS_MALLOC) + { + if (device) device->memoryMonitor(bytesAllocate,false); + bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); + return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + } + else + assert(false); + + return NULL; + } + + Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) + : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) + { + assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); + } + + static Block* remove_shared_blocks(Block* head) + { + Block** prev_next = &head; + for (Block* block = head; block; block = block->next) { + if (block->atype == SHARED) *prev_next = block->next; + else prev_next = &block->next; + } + return head; + } + + void clear_list(MemoryMonitorInterface* device) + { + Block* block = this; + while (block) { + Block* next = block->next; + block->clear_block(device); + block = next; + } + } + + void clear_block (MemoryMonitorInterface* device) + { + const size_t sizeof_Header = offsetof(Block,data[0]); + const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); + + if (atype == ALIGNED_MALLOC) { + alignedFree(this); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else if (atype == OS_MALLOC) { + size_t sizeof_This = sizeof_Header+reserveEnd; + os_free(this,sizeof_This,huge_pages); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else /* if (atype == SHARED) */ { + } + } + + void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) + { + size_t bytes = bytes_in; + assert(align <= maxAlignment); + bytes = (bytes+(align-1)) & ~(align-1); + if (unlikely(cur+bytes > 
reserveEnd && !partial)) return nullptr; + const size_t i = cur.fetch_add(bytes); + if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; + if (unlikely(i > reserveEnd)) return nullptr; + bytes_in = bytes = min(bytes,reserveEnd-i); + + if (i+bytes > allocEnd) { + if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); + } + return &data[i]; + } + + void* ptr() { + return &data[cur]; + } + + void reset_block () + { + allocEnd = max(allocEnd,(size_t)cur); + cur = 0; + } + + size_t getBlockUsedBytes() const { + return min(size_t(cur),reserveEnd); + } + + size_t getBlockFreeBytes() const { + return getBlockAllocatedBytes() - getBlockUsedBytes(); + } + + size_t getBlockAllocatedBytes() const { + return min(max(allocEnd,size_t(cur)),reserveEnd); + } + + size_t getBlockWastedBytes() const { + const size_t sizeof_Header = offsetof(Block,data[0]); + return sizeof_Header + wasted; + } + + size_t getBlockReservedBytes() const { + return reserveEnd; + } + + bool hasType(AllocationType atype_i, bool huge_pages_i) const + { + if (atype_i == ANY_TYPE ) return true; + else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else return atype_i == atype; + } + + size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockUsedBytes(); + } + return bytes; + } + + size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockFreeBytes(); + } + return bytes; + } + + size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockWastedBytes(); + } + return bytes; + } + + size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockAllocatedBytes(); + } + return bytes; + } + + void print_list () + { + for (const Block* block = this; block; block = block->next) + block->print_block(); + } + + void print_block() const + { + if (atype == ALIGNED_MALLOC) std::cout << "A"; + else if (atype == OS_MALLOC) std::cout << "O"; + else if (atype == SHARED) std::cout << "S"; + if (huge_pages) std::cout << "H"; + size_t bytesUsed = getBlockUsedBytes(); + size_t bytesFree = getBlockFreeBytes(); + size_t bytesWasted = getBlockWastedBytes(); + std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] "; + } + + public: + std::atomic<size_t> cur; //!< current location of the allocator + std::atomic<size_t> allocEnd; //!< end of the allocated memory region + std::atomic<size_t> reserveEnd; //!< end of the reserved memory region + Block* next; //!< pointer to next block in list + size_t wasted; //!< amount of memory wasted through block alignment + AllocationType atype; //!< allocation mode of the block + bool huge_pages; //!< whether the block uses huge pages + char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment + char data[1]; //!< here starts memory to use for allocations + }; + + private: + Device* device; + 
SpinLock mutex; + size_t slotMask; + std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; + std::atomic<Block*> usedBlocks; + std::atomic<Block*> freeBlocks; + + std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; + SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; + + bool use_single_mode; + size_t defaultBlockSize; + size_t estimatedSize; + size_t growSize; + size_t maxGrowSize; + std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove + std::atomic<size_t> bytesUsed; + std::atomic<size_t> bytesFree; + std::atomic<size_t> bytesWasted; + static __thread ThreadLocal2* thread_local_allocator2; + static SpinLock s_thread_local_allocators_lock; + static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; + SpinLock thread_local_allocators_lock; + std::vector<ThreadLocal2*> thread_local_allocators; + AllocationType atype; + mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes + }; +} diff --git a/thirdparty/embree/kernels/common/buffer.h b/thirdparty/embree/kernels/common/buffer.h new file mode 100644 index 0000000000..793012c04d --- /dev/null +++ b/thirdparty/embree/kernels/common/buffer.h @@ -0,0 +1,263 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" + +namespace embree +{ + /*! Implements an API data buffer object. This class may or may not own the data. */ + class Buffer : public RefCount + { + public: + /*! Buffer construction */ + Buffer() + : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {} + + /*! Buffer construction */ + Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr) + : device(device), numBytes(numBytes_in) + { + device->refInc(); + + if (ptr_in) + { + shared = true; + ptr = (char*)ptr_in; + } + else + { + shared = false; + alloc(); + } + } + + /*! Buffer destruction */ + ~Buffer() { + free(); + device->refDec(); + } + + /*! this class is not copyable */ + private: + Buffer(const Buffer& other) DELETED; // do not implement + Buffer& operator =(const Buffer& other) DELETED; // do not implement + + public: + /* inits and allocates the buffer */ + void create(Device* device_in, size_t numBytes_in) + { + init(device_in, numBytes_in); + alloc(); + } + + /* inits the buffer */ + void init(Device* device_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = nullptr; + numBytes = numBytes_in; + shared = false; + } + + /*! sets shared buffer */ + void set(Device* device_in, void* ptr_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = (char*)ptr_in; + if (numBytes_in != (size_t)-1) + numBytes = numBytes_in; + shared = true; + } + + /*! allocated buffer */ + void alloc() + { + if (device) + device->memoryMonitor(this->bytes(), false); + size_t b = (this->bytes()+15) & ssize_t(-16); + ptr = (char*)alignedMalloc(b,16); + } + + /*! frees the buffer */ + void free() + { + if (shared) return; + alignedFree(ptr); + if (device) + device->memoryMonitor(-ssize_t(this->bytes()), true); + ptr = nullptr; + } + + /*! gets buffer pointer */ + void* data() + { + /* report error if buffer is not existing */ + if (!device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified"); + + /* return buffer */ + return ptr; + } + + /*! returns pointer to first element */ + __forceinline char* getPtr() const { + return ptr; + } + + /*! 
returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return numBytes; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr; + } + + public: + Device* device; //!< device to report memory usage to + char* ptr; //!< pointer to buffer data + size_t numBytes; //!< number of bytes in the buffer + bool shared; //!< set if memory is shared with application + }; + + /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */ + class RawBufferView + { + public: + /*! Buffer construction */ + RawBufferView() + : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {} + + public: + /*! sets the buffer view */ + void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in) + { + if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds"); + + ptr_ofs = buffer_in->ptr + offset_in; + stride = stride_in; + num = num_in; + format = format_in; + modCounter++; + modified = true; + buffer = buffer_in; + } + + /*! returns pointer to the first element */ + __forceinline char* getPtr() const { + return ptr_ofs; + } + + /*! returns pointer to the i'th element */ + __forceinline char* getPtr(size_t i) const + { + assert(i<num); + return ptr_ofs + i*stride; + } + + /*! returns the number of elements of the buffer */ + __forceinline size_t size() const { + return num; + } + + /*! returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return num*stride; + } + + /*! returns the buffer stride */ + __forceinline unsigned getStride() const + { + assert(stride <= unsigned(inf)); + return unsigned(stride); + } + + /*! return the buffer format */ + __forceinline RTCFormat getFormat() const { + return format; + } + + /*! mark buffer as modified or unmodified */ + __forceinline void setModified() { + modCounter++; + modified = true; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isModified(unsigned int otherModCounter) const { + return modCounter > otherModCounter; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isLocalModified() const { + return modified; + } + + /*! clear local modified flag */ + __forceinline void clearLocalModified() { + modified = false; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr_ofs; + } + + /*! checks padding to 16 byte check, fails hard */ + __forceinline void checkPadding16() const + { + if (ptr_ofs && num) + volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable? + } + + public: + char* ptr_ofs; //!< base pointer plus offset + size_t stride; //!< stride of the buffer in bytes + size_t num; //!< number of elements in the buffer + RTCFormat format; //!< format of the buffer + unsigned int modCounter; //!< version ID of this buffer + bool modified; //!< local modified data + int userData; //!< special data + Ref<Buffer> buffer; //!< reference to the parent buffer + }; + + /*! A typed contiguous range of a buffer. This class does not own the buffer content. */ + template<typename T> + class BufferView : public RawBufferView + { + public: + typedef T value_type; + + /*! 
access to the ith element of the buffer */ + __forceinline T& operator [](size_t i) { assert(i<num); return *(T*)(ptr_ofs + i*stride); } + __forceinline const T& operator [](size_t i) const { assert(i<num); return *(T*)(ptr_ofs + i*stride); } + }; + + template<> + class BufferView<Vec3fa> : public RawBufferView + { + public: + typedef Vec3fa value_type; + + /*! access to the ith element of the buffer */ + __forceinline const Vec3fa operator [](size_t i) const + { + assert(i<num); + return Vec3fa(vfloat4::loadu((float*)(ptr_ofs + i*stride))); + } + + /*! writes the i'th element */ + __forceinline void store(size_t i, const Vec3fa& v) + { + assert(i<num); + vfloat4::storeu((float*)(ptr_ofs + i*stride), (vfloat4)v); + } + }; +} diff --git a/thirdparty/embree/kernels/common/builder.h b/thirdparty/embree/kernels/common/builder.h new file mode 100644 index 0000000000..07fe7b069b --- /dev/null +++ b/thirdparty/embree/kernels/common/builder.h @@ -0,0 +1,60 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "accel.h" + +namespace embree +{ +#define MODE_HIGH_QUALITY (1<<8) + + /*! virtual interface for all hierarchy builders */ + class Builder : public RefCount { + public: + + static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + /*! initiates the hierarchy builder */ + virtual void build() = 0; + + /*! notifies the builder about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears internal builder state */ + virtual void clear() = 0; + }; + + /*! virtual interface for progress monitor class */ + struct BuildProgressMonitor { + virtual void operator() (size_t dn) const = 0; + }; + + /*! build the progress monitor interface from a closure */ + template<typename Closure> + struct ProgressMonitorClosure : BuildProgressMonitor + { + public: + ProgressMonitorClosure (const Closure& closure) : closure(closure) {} + void operator() (size_t dn) const { closure(dn); } + private: + const Closure closure; + }; + template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) { + return ProgressMonitorClosure<Closure>(closure); + } + + struct LineSegments; + struct TriangleMesh; + struct QuadMesh; + struct UserGeometry; + + class Scene; + + typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder); + typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + +} diff --git a/thirdparty/embree/kernels/common/context.h b/thirdparty/embree/kernels/common/context.h new file mode 100644 index 0000000000..ccd88bdeac --- /dev/null +++ b/thirdparty/embree/kernels/common/context.h @@ -0,0 +1,131 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "rtcore.h" +#include "point_query.h" + +namespace embree +{ + class Scene; + + struct IntersectContext + { + public: + __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context) + : scene(scene), user(user_context) {} + + __forceinline bool hasContextFilter() const { + return user->filter != nullptr; + } + + __forceinline bool 
isCoherent() const { + return embree::isCoherent(user->flags); + } + + __forceinline bool isIncoherent() const { + return embree::isIncoherent(user->flags); + } + + public: + Scene* scene; + RTCIntersectContext* user; + }; + + template<int M, typename Geometry> + __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v) + { +#if RTC_MIN_WIDTH + const vfloat<M> d = length(Vec3vf<M>(v) - ray_org); + const vfloat<M> r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec4vf<M>(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + template<typename Geometry> + __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v) + { +#if RTC_MIN_WIDTH + const float d = length(Vec3fa(v) - ray_org); + const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec3ff(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + enum PointQueryType + { + POINT_QUERY_TYPE_UNDEFINED = 0, + POINT_QUERY_TYPE_SPHERE = 1, + POINT_QUERY_TYPE_AABB = 2, + }; + + typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + + struct PointQueryContext + { + public: + __forceinline PointQueryContext(Scene* scene, + PointQuery* query_ws, + PointQueryType query_type, + PointQueryFunction func, + RTCPointQueryContext* userContext, + float similarityScale, + void* userPtr) + : scene(scene) + , query_ws(query_ws) + , query_type(query_type) + , func(func) + , userContext(userContext) + , similarityScale(similarityScale) + , userPtr(userPtr) + , primID(RTC_INVALID_GEOMETRY_ID) + , geomID(RTC_INVALID_GEOMETRY_ID) + , query_radius(query_ws->radius) + { + if (query_type == POINT_QUERY_TYPE_AABB) { + assert(similarityScale == 0.f); + updateAABB(); + } + if (userContext->instStackSize == 0) { + assert(similarityScale == 1.f); + } + } + + public: + __forceinline void updateAABB() + { + if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) { + query_radius = Vec3fa(query_ws->radius); + return; + } + + const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius)); + bbox = xfmBounds(m, bbox); + query_radius = 0.5f * (bbox.upper - bbox.lower); + } + +public: + Scene* scene; + + PointQuery* query_ws; // the original world space point query + PointQueryType query_type; + PointQueryFunction func; + RTCPointQueryContext* userContext; + const float similarityScale; + + void* userPtr; + + unsigned int primID; + unsigned int geomID; + + Vec3fa query_radius; // used if the query is converted to an AABB internally + }; +} + diff --git a/thirdparty/embree/kernels/common/default.h b/thirdparty/embree/kernels/common/default.h new file mode 100644 index 0000000000..f15d61b768 --- /dev/null +++ b/thirdparty/embree/kernels/common/default.h @@ -0,0 +1,268 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" +#include "../../common/sys/thread.h" +#include "../../common/sys/alloc.h" +#include "../../common/sys/ref.h" +#include "../../common/sys/intrinsics.h" +#include "../../common/sys/atomic.h" +#include "../../common/sys/mutex.h" +#include "../../common/sys/vector.h" +#include "../../common/sys/array.h" 
+#include "../../common/sys/string.h" +#include "../../common/sys/regression.h" +#include "../../common/sys/vector.h" + +#include "../../common/math/math.h" +#include "../../common/math/transcendental.h" +#include "../../common/simd/simd.h" +#include "../../common/math/vec2.h" +#include "../../common/math/vec3.h" +#include "../../common/math/vec4.h" +#include "../../common/math/vec2fa.h" +#include "../../common/math/vec3fa.h" +#include "../../common/math/interval.h" +#include "../../common/math/bbox.h" +#include "../../common/math/obbox.h" +#include "../../common/math/lbbox.h" +#include "../../common/math/linearspace2.h" +#include "../../common/math/linearspace3.h" +#include "../../common/math/affinespace.h" +#include "../../common/math/range.h" +#include "../../common/lexers/tokenstream.h" + +#include "../../common/tasking/taskscheduler.h" + +#define COMMA , + +#include "../config.h" +#include "isa.h" +#include "stat.h" +#include "profile.h" +#include "rtcore.h" +#include "vector.h" +#include "state.h" +#include "instance_stack.h" + +#include <vector> +#include <map> +#include <algorithm> +#include <functional> +#include <utility> +#include <sstream> + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Vec2 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec2vf = Vec2<vfloat<N>>; + template<int N> using Vec2vd = Vec2<vdouble<N>>; + template<int N> using Vec2vr = Vec2<vreal<N>>; + template<int N> using Vec2vi = Vec2<vint<N>>; + template<int N> using Vec2vl = Vec2<vllong<N>>; + template<int N> using Vec2vb = Vec2<vbool<N>>; + template<int N> using Vec2vbf = Vec2<vboolf<N>>; + template<int N> using Vec2vbd = Vec2<vboold<N>>; + + typedef Vec2<vfloat4> Vec2vf4; + typedef Vec2<vdouble4> Vec2vd4; + typedef Vec2<vreal4> Vec2vr4; + typedef Vec2<vint4> Vec2vi4; + typedef Vec2<vllong4> Vec2vl4; + typedef Vec2<vbool4> Vec2vb4; + typedef Vec2<vboolf4> Vec2vbf4; + typedef Vec2<vboold4> Vec2vbd4; + + typedef Vec2<vfloat8> Vec2vf8; + typedef Vec2<vdouble8> Vec2vd8; + typedef Vec2<vreal8> Vec2vr8; + typedef Vec2<vint8> Vec2vi8; + typedef Vec2<vllong8> Vec2vl8; + typedef Vec2<vbool8> Vec2vb8; + typedef Vec2<vboolf8> Vec2vbf8; + typedef Vec2<vboold8> Vec2vbd8; + + typedef Vec2<vfloat16> Vec2vf16; + typedef Vec2<vdouble16> Vec2vd16; + typedef Vec2<vreal16> Vec2vr16; + typedef Vec2<vint16> Vec2vi16; + typedef Vec2<vllong16> Vec2vl16; + typedef Vec2<vbool16> Vec2vb16; + typedef Vec2<vboolf16> Vec2vbf16; + typedef Vec2<vboold16> Vec2vbd16; + + typedef Vec2<vfloatx> Vec2vfx; + typedef Vec2<vdoublex> Vec2vdx; + typedef Vec2<vrealx> Vec2vrx; + typedef Vec2<vintx> Vec2vix; + typedef Vec2<vllongx> Vec2vlx; + typedef Vec2<vboolx> Vec2vbx; + typedef Vec2<vboolfx> Vec2vbfx; + typedef Vec2<vbooldx> Vec2vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec3 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec3vf = Vec3<vfloat<N>>; + template<int N> using Vec3vd = Vec3<vdouble<N>>; + template<int N> using Vec3vr = Vec3<vreal<N>>; + template<int N> using Vec3vi = Vec3<vint<N>>; + template<int N> using Vec3vl = Vec3<vllong<N>>; + template<int N> using Vec3vb = Vec3<vbool<N>>; + template<int N> using Vec3vbf = Vec3<vboolf<N>>; + template<int N> using Vec3vbd = Vec3<vboold<N>>; + + typedef Vec3<vfloat4> Vec3vf4; + typedef Vec3<vdouble4> Vec3vd4; + typedef Vec3<vreal4> Vec3vr4; + 
typedef Vec3<vint4> Vec3vi4; + typedef Vec3<vllong4> Vec3vl4; + typedef Vec3<vbool4> Vec3vb4; + typedef Vec3<vboolf4> Vec3vbf4; + typedef Vec3<vboold4> Vec3vbd4; + + typedef Vec3<vfloat8> Vec3vf8; + typedef Vec3<vdouble8> Vec3vd8; + typedef Vec3<vreal8> Vec3vr8; + typedef Vec3<vint8> Vec3vi8; + typedef Vec3<vllong8> Vec3vl8; + typedef Vec3<vbool8> Vec3vb8; + typedef Vec3<vboolf8> Vec3vbf8; + typedef Vec3<vboold8> Vec3vbd8; + + typedef Vec3<vfloat16> Vec3vf16; + typedef Vec3<vdouble16> Vec3vd16; + typedef Vec3<vreal16> Vec3vr16; + typedef Vec3<vint16> Vec3vi16; + typedef Vec3<vllong16> Vec3vl16; + typedef Vec3<vbool16> Vec3vb16; + typedef Vec3<vboolf16> Vec3vbf16; + typedef Vec3<vboold16> Vec3vbd16; + + typedef Vec3<vfloatx> Vec3vfx; + typedef Vec3<vdoublex> Vec3vdx; + typedef Vec3<vrealx> Vec3vrx; + typedef Vec3<vintx> Vec3vix; + typedef Vec3<vllongx> Vec3vlx; + typedef Vec3<vboolx> Vec3vbx; + typedef Vec3<vboolfx> Vec3vbfx; + typedef Vec3<vbooldx> Vec3vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec4 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec4vf = Vec4<vfloat<N>>; + template<int N> using Vec4vd = Vec4<vdouble<N>>; + template<int N> using Vec4vr = Vec4<vreal<N>>; + template<int N> using Vec4vi = Vec4<vint<N>>; + template<int N> using Vec4vl = Vec4<vllong<N>>; + template<int N> using Vec4vb = Vec4<vbool<N>>; + template<int N> using Vec4vbf = Vec4<vboolf<N>>; + template<int N> using Vec4vbd = Vec4<vboold<N>>; + + typedef Vec4<vfloat4> Vec4vf4; + typedef Vec4<vdouble4> Vec4vd4; + typedef Vec4<vreal4> Vec4vr4; + typedef Vec4<vint4> Vec4vi4; + typedef Vec4<vllong4> Vec4vl4; + typedef Vec4<vbool4> Vec4vb4; + typedef Vec4<vboolf4> Vec4vbf4; + typedef Vec4<vboold4> Vec4vbd4; + + typedef Vec4<vfloat8> Vec4vf8; + typedef Vec4<vdouble8> Vec4vd8; + typedef Vec4<vreal8> Vec4vr8; + typedef Vec4<vint8> Vec4vi8; + typedef Vec4<vllong8> Vec4vl8; + typedef Vec4<vbool8> Vec4vb8; + typedef Vec4<vboolf8> Vec4vbf8; + typedef Vec4<vboold8> Vec4vbd8; + + typedef Vec4<vfloat16> Vec4vf16; + typedef Vec4<vdouble16> Vec4vd16; + typedef Vec4<vreal16> Vec4vr16; + typedef Vec4<vint16> Vec4vi16; + typedef Vec4<vllong16> Vec4vl16; + typedef Vec4<vbool16> Vec4vb16; + typedef Vec4<vboolf16> Vec4vbf16; + typedef Vec4<vboold16> Vec4vbd16; + + typedef Vec4<vfloatx> Vec4vfx; + typedef Vec4<vdoublex> Vec4vdx; + typedef Vec4<vrealx> Vec4vrx; + typedef Vec4<vintx> Vec4vix; + typedef Vec4<vllongx> Vec4vlx; + typedef Vec4<vboolx> Vec4vbx; + typedef Vec4<vboolfx> Vec4vbfx; + typedef Vec4<vbooldx> Vec4vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Other shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using BBox3vf = BBox<Vec3vf<N>>; + typedef BBox<Vec3vf4> BBox3vf4; + typedef BBox<Vec3vf8> BBox3vf8; + typedef BBox<Vec3vf16> BBox3vf16; + + /* calculate time segment itime and fractional time ftime */ + __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime) + { + const float timeScaled = time * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime) + { + const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const float 
itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + template<int N> + __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime) + { + const vfloat<N> timeScaled = time * numTimeSegments; + const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint<N>(itimef); + } + + template<int N> + __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime) + { + const vfloat<N> timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint<N>(itimef); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments) + { + const float round_up = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step + const float round_down = 1.0f-2.0f*float(ulp); + const int itime_lower = (int)max(floor(round_up *time_range.lower*numTimeSegments), 0.0f); + const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments); + return make_range(itime_lower, itime_upper); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments) + { + const float lower = (range.lower-time_range.lower)/time_range.size(); + const float upper = (range.upper-time_range.lower)/time_range.size(); + return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments); + } +} diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp new file mode 100644 index 0000000000..068e0c2983 --- /dev/null +++ b/thirdparty/embree/kernels/common/device.cpp @@ -0,0 +1,556 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "device.h" +#include "../hash.h" +#include "scene_triangle_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_subdiv_mesh.h" + +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +#include "../geometry/cylinder.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" + +#include "../../common/tasking/taskscheduler.h" +#include "../../common/sys/alloc.h" + +namespace embree +{ + /*! 
some global variables that can be set via rtcSetParameter1i for debugging purposes */ + ssize_t Device::debug_int0 = 0; + ssize_t Device::debug_int1 = 0; + ssize_t Device::debug_int2 = 0; + ssize_t Device::debug_int3 = 0; + + DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs); + + static MutexSys g_mutex; + static std::map<Device*,size_t> g_cache_size_map; + static std::map<Device*,size_t> g_num_threads_map; + + Device::Device (const char* cfg) + { + /* check that CPU supports lowest ISA */ + if (!hasISA(ISA)) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR); + } + + /* set default frequency level for detected CPU */ + switch (getCPUModel()) { + case CPU::UNKNOWN: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_KABY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::SANDY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::NEHALEM: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE2: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_PHI_KNIGHTS_MILL : frequency_level = FREQUENCY_SIMD512; break; + case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break; + case CPU::ARM: frequency_level = FREQUENCY_SIMD128; break; + } + + /* initialize global state */ +#if defined(EMBREE_CONFIG) + State::parseString(EMBREE_CONFIG); +#endif + State::parseString(cfg); + State::verify(); + + /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */ + if (!checkISASupport()) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA"); + } + + /*! do some internal tests */ + assert(isa::Cylinder::verify()); + + /*! enable huge page support if desired */ +#if defined(__WIN32__) + if (State::enable_selockmemoryprivilege) + State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3)); +#endif + State::hugepages_success &= os_init(State::hugepages,State::verbosity(3)); + + /*! set tessellation cache size */ + setCacheSize( State::tessellation_cache_size ); + + /*! 
enable some floating point exceptions to catch bugs */ + if (State::float_exceptions) + { + int exceptions = _MM_MASK_MASK; + //exceptions &= ~_MM_MASK_INVALID; + exceptions &= ~_MM_MASK_DENORM; + exceptions &= ~_MM_MASK_DIV_ZERO; + //exceptions &= ~_MM_MASK_OVERFLOW; + //exceptions &= ~_MM_MASK_UNDERFLOW; + //exceptions &= ~_MM_MASK_INEXACT; + _MM_SET_EXCEPTION_MASK(exceptions); + } + + /* print info header */ + if (State::verbosity(1)) + print(); + if (State::verbosity(2)) + State::print(); + + /* register all algorithms */ + bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features)); + +#if defined(EMBREE_TARGET_SIMD8) + bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features)); +#endif + + /* setup tasking system */ + initTaskingSystem(numThreads); + + /* ray stream SOA to AOS conversion */ +#if defined(EMBREE_RAY_PACKETS) + RayStreamFilterFuncsType rayStreamFilterFuncs; + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(enabled_cpu_features,rayStreamFilterFuncs); + rayStreamFilters = rayStreamFilterFuncs(); +#endif + } + + Device::~Device () + { + setCacheSize(0); + exitTaskingSystem(); + } + + std::string getEnabledTargets() + { + std::string v; +#if defined(EMBREE_TARGET_SSE2) + v += "SSE2 "; +#endif +#if defined(EMBREE_TARGET_SSE42) + v += "SSE4.2 "; +#endif +#if defined(EMBREE_TARGET_AVX) + v += "AVX "; +#endif +#if defined(EMBREE_TARGET_AVX2) + v += "AVX2 "; +#endif +#if defined(EMBREE_TARGET_AVX512) + v += "AVX512 "; +#endif + return v; + } + + std::string getEmbreeFeatures() + { + std::string v; +#if defined(EMBREE_RAY_MASK) + v += "raymasks "; +#endif +#if defined (EMBREE_BACKFACE_CULLING) + v += "backfaceculling "; +#endif +#if defined (EMBREE_BACKFACE_CULLING_CURVES) + v += "backfacecullingcurves "; +#endif +#if defined(EMBREE_FILTER_FUNCTION) + v += "intersection_filter "; +#endif +#if defined (EMBREE_COMPACT_POLYS) + v += "compact_polys "; +#endif + return v; + } + + void Device::print() + { + const int cpu_features = getCPUFeatures(); + std::cout << std::endl; + std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl; + std::cout << " Compiler : " << getCompilerName() << std::endl; + std::cout << " Build : "; +#if defined(DEBUG) + std::cout << "Debug " << std::endl; +#else + std::cout << "Release " << std::endl; +#endif + std::cout << " Platform : " << getPlatformName() << std::endl; + std::cout << " CPU : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl; + std::cout << " Threads : " << getNumberOfLogicalThreads() << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(cpu_features) << std::endl; + const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; + const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; + std::cout << " MXCSR : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl; + std::cout << " Config" << std::endl; + std::cout << " Threads : " << (numThreads ? 
toString(numThreads) : std::string("default")) << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl; + std::cout << " " << getEnabledTargets() << " (compile time enabled)" << std::endl; + std::cout << " Features: " << getEmbreeFeatures() << std::endl; + std::cout << " Tasking : "; +#if defined(TASKING_TBB) + std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " "; + #if TBB_INTERFACE_VERSION >= 12002 + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " "; + #else + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " "; + #endif +#endif +#if defined(TASKING_INTERNAL) + std::cout << "internal_tasking_system "; +#endif +#if defined(TASKING_PPL) + std::cout << "PPL "; +#endif + std::cout << std::endl; + + /* check of FTZ and DAZ flags are set in CSR */ + if (!hasFTZ || !hasDAZ) + { +#if !defined(_DEBUG) + if (State::verbosity(1)) +#endif + { + std::cout << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << " WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled " << std::endl + << " in the MXCSR control and status register. This can have a severe " << std::endl + << " performance impact. Please enable these modes for each application " << std::endl + << " thread the following way:" << std::endl + << std::endl + << " #include \"xmmintrin.h\"" << std::endl + << " #include \"pmmintrin.h\"" << std::endl + << std::endl + << " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl + << " _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << std::endl; + } + } + std::cout << std::endl; + } + + void Device::setDeviceErrorCode(RTCError error) + { + RTCError* stored_error = errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getDeviceErrorCode() + { + RTCError* stored_error = errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::setThreadErrorCode(RTCError error) + { + RTCError* stored_error = g_errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getThreadErrorCode() + { + RTCError* stored_error = g_errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::process_error(Device* device, RTCError error, const char* str) + { + /* store global error code when device construction failed */ + if (!device) + return setThreadErrorCode(error); + + /* print error when in verbose mode */ + if (device->verbosity(1)) + { + switch (error) { + case RTC_ERROR_NONE : std::cerr << "Embree: No error"; break; + case RTC_ERROR_UNKNOWN : std::cerr << "Embree: Unknown error"; break; + case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break; + case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break; + case RTC_ERROR_OUT_OF_MEMORY : std::cerr << "Embree: Out of memory"; break; + case RTC_ERROR_UNSUPPORTED_CPU : std::cerr << "Embree: Unsupported CPU"; break; + 
default : std::cerr << "Embree: Invalid error code"; break; + }; + if (str) std::cerr << ", (" << str << ")"; + std::cerr << std::endl; + } + + /* call user specified error callback */ + if (device->error_function) + device->error_function(device->error_function_userptr,error,str); + + /* record error code */ + device->setDeviceErrorCode(error); + } + + void Device::memoryMonitor(ssize_t bytes, bool post) + { + if (State::memory_monitor_function && bytes != 0) { + if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) { + if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor + throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination"); + } + } + } + } + + size_t getMaxNumThreads() + { + size_t maxNumThreads = 0; + for (std::map<Device*,size_t>::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++) + maxNumThreads = max(maxNumThreads, (*i).second); + if (maxNumThreads == 0) + maxNumThreads = std::numeric_limits<size_t>::max(); + return maxNumThreads; + } + + size_t getMaxCacheSize() + { + size_t maxCacheSize = 0; + for (std::map<Device*,size_t>::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++) + maxCacheSize = max(maxCacheSize, (*i).second); + return maxCacheSize; + } + + void Device::setCacheSize(size_t bytes) + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + Lock<MutexSys> lock(g_mutex); + if (bytes == 0) g_cache_size_map.erase(this); + else g_cache_size_map[this] = bytes; + + size_t maxCacheSize = getMaxCacheSize(); + resizeTessellationCache(maxCacheSize); +#endif + } + + void Device::initTaskingSystem(size_t numThreads) + { + Lock<MutexSys> lock(g_mutex); + if (numThreads == 0) + g_num_threads_map[this] = std::numeric_limits<size_t>::max(); + else + g_num_threads_map[this] = numThreads; + + /* create task scheduler */ + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); +#if USE_TASK_ARENA + const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount()); + const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads); + arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads)); +#endif + } + + void Device::exitTaskingSystem() + { + Lock<MutexSys> lock(g_mutex); + g_num_threads_map.erase(this); + + /* terminate tasking system */ + if (g_num_threads_map.size() == 0) { + TaskScheduler::destroy(); + } + /* or configure new number of threads */ + else { + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); + } +#if USE_TASK_ARENA + arena.reset(); +#endif + } + + void Device::setProperty(const RTCDeviceProperty prop, ssize_t val) + { + /* hidden internal properties */ + switch ((size_t)prop) + { + case 1000000: debug_int0 = val; return; + case 1000001: debug_int1 = val; return; + case 1000002: debug_int2 = val; return; + case 1000003: debug_int3 = val; return; + } + + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property"); + } + + ssize_t Device::getProperty(const RTCDeviceProperty prop) + { + size_t iprop = (size_t)prop; + + /* get name of internal regression test */ + if (iprop >= 2000000 && iprop < 3000000) + { + RegressionTest* test = getRegressionTest(iprop-2000000); + if (test) return (ssize_t) test->name.c_str(); + else return 0; + } + + /* run internal regression test */ + if (iprop >= 3000000 && iprop < 4000000) + { + RegressionTest* test = 
getRegressionTest(iprop-3000000); + if (test) return test->run(); + else return 0; + } + + /* documented properties */ + switch (prop) + { + case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR; + case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR; + case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH; + case RTC_DEVICE_PROPERTY_VERSION : return RTC_VERSION; + +#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return hasISA(SSE2); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return hasISA(AVX); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_MASK) + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING_CURVES) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0; +#endif + +#if defined(EMBREE_COMPACT_POLYS) + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0; +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0; +#endif + +#if defined(TASKING_INTERNAL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; +#endif + +#if defined(TASKING_TBB) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_USER) + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + case 
RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#else + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1; +#endif + +#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0; +#endif + + default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break; + }; + } +} diff --git a/thirdparty/embree/kernels/common/device.h b/thirdparty/embree/kernels/common/device.h new file mode 100644 index 0000000000..21c42c654d --- /dev/null +++ b/thirdparty/embree/kernels/common/device.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "state.h" +#include "accel.h" + +namespace embree +{ + class BVH4Factory; + class BVH8Factory; + + class Device : public State, public MemoryMonitorInterface + { + ALIGNED_CLASS_(16); + + public: + + /*! Device construction */ + Device (const char* cfg); + + /*! Device destruction */ + virtual ~Device (); + + /*! prints info about the device */ + void print(); + + /*! sets the error code */ + void setDeviceErrorCode(RTCError error); + + /*! returns and clears the error code */ + RTCError getDeviceErrorCode(); + + /*! sets the error code */ + static void setThreadErrorCode(RTCError error); + + /*! returns and clears the error code */ + static RTCError getThreadErrorCode(); + + /*! processes error codes, do not call directly */ + static void process_error(Device* device, RTCError error, const char* str); + + /*! invokes the memory monitor callback */ + void memoryMonitor(ssize_t bytes, bool post); + + /*! sets the size of the software cache. */ + void setCacheSize(size_t bytes); + + /*! sets a property */ + void setProperty(const RTCDeviceProperty prop, ssize_t val); + + /*! gets a property */ + ssize_t getProperty(const RTCDeviceProperty prop); + + private: + + /*! initializes the tasking system */ + void initTaskingSystem(size_t numThreads); + + /*! shuts down the tasking system */ + void exitTaskingSystem(); + + /*! 
some variables that can be set via rtcSetParameter1i for debugging purposes */ + public: + static ssize_t debug_int0; + static ssize_t debug_int1; + static ssize_t debug_int2; + static ssize_t debug_int3; + + public: + std::unique_ptr<BVH4Factory> bvh4_factory; +#if defined(EMBREE_TARGET_SIMD8) + std::unique_ptr<BVH8Factory> bvh8_factory; +#endif + +#if USE_TASK_ARENA + std::unique_ptr<tbb::task_arena> arena; +#endif + + /* ray streams filter */ + RayStreamFilterFuncs rayStreamFilters; + }; +} diff --git a/thirdparty/embree/kernels/common/geometry.cpp b/thirdparty/embree/kernels/common/geometry.cpp new file mode 100644 index 0000000000..d8d3f65a5c --- /dev/null +++ b/thirdparty/embree/kernels/common/geometry.cpp @@ -0,0 +1,259 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "geometry.h" +#include "scene.h" + +namespace embree +{ + const char* Geometry::gtype_names[Geometry::GTY_END] = + { + "flat_linear_curve", + "round_linear_curve", + "oriented_linear_curve", + "", + "flat_bezier_curve", + "round_bezier_curve", + "oriented_bezier_curve", + "", + "flat_bspline_curve", + "round_bspline_curve", + "oriented_bspline_curve", + "", + "flat_hermite_curve", + "round_hermite_curve", + "oriented_hermite_curve", + "", + "flat_catmull_rom_curve", + "round_catmull_rom_curve", + "oriented_catmull_rom_curve", + "", + "triangles", + "quads", + "grid", + "subdivs", + "", + "sphere", + "disc", + "oriented_disc", + "", + "usergeom", + "instance_cheap", + "instance_expensive", + }; + + Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) + : device(device), userPtr(nullptr), + numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), + mask(-1), + gtype(gtype), + gsubtype(GTY_SUBTYPE_DEFAULT), + quality(RTC_BUILD_QUALITY_MEDIUM), + state((unsigned)State::MODIFIED), + enabled(true), + intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr) + { + device->refInc(); + } + + Geometry::~Geometry() + { + device->refDec(); + } + + void Geometry::setNumPrimitives(unsigned int numPrimitives_in) + { + if (numPrimitives_in == numPrimitives) return; + + numPrimitives = numPrimitives_in; + + Geometry::update(); + } + + void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in) + { + if (numTimeSteps_in == numTimeSteps) { + return; + } + + numTimeSteps = numTimeSteps_in; + fnumTimeSegments = float(numTimeSteps_in-1); + + Geometry::update(); + } + + void Geometry::setTimeRange (const BBox1f range) + { + time_range = range; + Geometry::update(); + } + + void Geometry::update() + { + ++modCounter_; // FIXME: required? 
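+    // flag the geometry as modified; it must be committed again before the next
+    // scene build (see preCommit), and the counter lets the scene detect changes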
+ state = (unsigned)State::MODIFIED; + } + + void Geometry::commit() + { + ++modCounter_; + state = (unsigned)State::COMMITTED; + } + + void Geometry::preCommit() + { + if (State::MODIFIED == (State)state) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed"); + } + + void Geometry::postCommit() + { + } + + void Geometry::enable () + { + if (isEnabled()) + return; + + enabled = true; + ++modCounter_; + } + + void Geometry::disable () + { + if (isDisabled()) + return; + + enabled = false; + ++modCounter_; + } + + void Geometry::setUserData (void* ptr) + { + userPtr = ptr; + } + + void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + intersectionFilterN = filter; + } + + void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + occlusionFilterN = filter; + } + + void Geometry::setPointQueryFunction (RTCPointQueryFunction func) + { + pointQueryFunc = func; + } + + void Geometry::interpolateN(const RTCInterpolateNArguments* const args) + { + const void* valid_i = args->valid; + const unsigned* primIDs = args->primIDs; + const float* u = args->u; + const float* v = args->v; + unsigned int N = args->N; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); + const int* valid = (const int*) valid_i; + + __aligned(64) float P_tmp[256]; + __aligned(64) float dPdu_tmp[256]; + __aligned(64) float dPdv_tmp[256]; + __aligned(64) float ddPdudu_tmp[256]; + __aligned(64) float ddPdvdv_tmp[256]; + __aligned(64) float ddPdudv_tmp[256]; + + float* Pt = P ? 
P_tmp : nullptr; + float* dPdut = nullptr, *dPdvt = nullptr; + if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; } + float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr; + if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; } + + for (unsigned int i=0; i<N; i++) + { + if (valid && !valid[i]) continue; + + RTCInterpolateArguments iargs; + iargs.primID = primIDs[i]; + iargs.u = u[i]; + iargs.v = v[i]; + iargs.bufferType = bufferType; + iargs.bufferSlot = bufferSlot; + iargs.P = Pt; + iargs.dPdu = dPdut; + iargs.dPdv = dPdvt; + iargs.ddPdudu = ddPdudut; + iargs.ddPdvdv = ddPdvdvt; + iargs.ddPdudv = ddPdudvt; + iargs.valueCount = valueCount; + interpolate(&iargs); + + if (likely(P)) { + for (unsigned int j=0; j<valueCount; j++) + P[j*N+i] = Pt[j]; + } + if (likely(dPdu)) + { + for (unsigned int j=0; j<valueCount; j++) { + dPdu[j*N+i] = dPdut[j]; + dPdv[j*N+i] = dPdvt[j]; + } + } + if (likely(ddPdudu)) + { + for (unsigned int j=0; j<valueCount; j++) { + ddPdudu[j*N+i] = ddPdudut[j]; + ddPdvdv[j*N+i] = ddPdvdvt[j]; + ddPdudv[j*N+i] = ddPdudvt[j]; + } + } + } + } + + bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context) + { + assert(context->primID < size()); + + RTCPointQueryFunctionArguments args; + args.query = (RTCPointQuery*)context->query_ws; + args.userPtr = context->userPtr; + args.primID = context->primID; + args.geomID = context->geomID; + args.context = context->userContext; + args.similarityScale = context->similarityScale; + + bool update = false; + if(context->func) update |= context->func(&args); + if(pointQueryFunc) update |= pointQueryFunc(&args); + + if (update && context->userContext->instStackSize > 0) + { + // update point query + if (context->query_type == POINT_QUERY_TYPE_AABB) { + context->updateAABB(); + } else { + assert(context->similarityScale > 0.f); + query->radius = context->query_ws->radius * context->similarityScale; + } + } + return update; + } +} diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h new file mode 100644 index 0000000000..2f9f2e7c94 --- /dev/null +++ b/thirdparty/embree/kernels/common/geometry.h @@ -0,0 +1,582 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "buffer.h" +#include "../common/point_query.h" +#include "../builders/priminfo.h" + +namespace embree +{ + class Scene; + class Geometry; + + struct GeometryCounts + { + __forceinline GeometryCounts() + : numFilterFunctions(0), + numTriangles(0), numMBTriangles(0), + numQuads(0), numMBQuads(0), + numBezierCurves(0), numMBBezierCurves(0), + numLineSegments(0), numMBLineSegments(0), + numSubdivPatches(0), numMBSubdivPatches(0), + numUserGeometries(0), numMBUserGeometries(0), + numInstancesCheap(0), numMBInstancesCheap(0), + numInstancesExpensive(0), numMBInstancesExpensive(0), + numGrids(0), numMBGrids(0), + numPoints(0), numMBPoints(0) {} + + __forceinline size_t size() const { + return numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints + + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints; + } + + __forceinline unsigned int enabledGeometryTypesMask() const + { + unsigned int mask = 0; + if (numTriangles) mask |= 1 << 0; + if 
(numQuads) mask |= 1 << 1; + if (numBezierCurves+numLineSegments) mask |= 1 << 2; + if (numSubdivPatches) mask |= 1 << 3; + if (numUserGeometries) mask |= 1 << 4; + if (numInstancesCheap) mask |= 1 << 5; + if (numInstancesExpensive) mask |= 1 << 6; + if (numGrids) mask |= 1 << 7; + if (numPoints) mask |= 1 << 8; + + unsigned int maskMB = 0; + if (numMBTriangles) maskMB |= 1 << 0; + if (numMBQuads) maskMB |= 1 << 1; + if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2; + if (numMBSubdivPatches) maskMB |= 1 << 3; + if (numMBUserGeometries) maskMB |= 1 << 4; + if (numMBInstancesCheap) maskMB |= 1 << 5; + if (numMBInstancesExpensive) maskMB |= 1 << 6; + if (numMBGrids) maskMB |= 1 << 7; + if (numMBPoints) maskMB |= 1 << 8; + + return (mask<<8) + maskMB; + } + + __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const + { + GeometryCounts ret; + ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions; + ret.numTriangles = numTriangles + rhs.numTriangles; + ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles; + ret.numQuads = numQuads + rhs.numQuads; + ret.numMBQuads = numMBQuads + rhs.numMBQuads; + ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves; + ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves; + ret.numLineSegments = numLineSegments + rhs.numLineSegments; + ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments; + ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches; + ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches; + ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries; + ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries; + ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap; + ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap; + ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive; + ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive; + ret.numGrids = numGrids + rhs.numGrids; + ret.numMBGrids = numMBGrids + rhs.numMBGrids; + ret.numPoints = numPoints + rhs.numPoints; + ret.numMBPoints = numMBPoints + rhs.numMBPoints; + + return ret; + } + + size_t numFilterFunctions; //!< number of geometries with filter functions enabled + size_t numTriangles; //!< number of enabled triangles + size_t numMBTriangles; //!< number of enabled motion blured triangles + size_t numQuads; //!< number of enabled quads + size_t numMBQuads; //!< number of enabled motion blurred quads + size_t numBezierCurves; //!< number of enabled curves + size_t numMBBezierCurves; //!< number of enabled motion blurred curves + size_t numLineSegments; //!< number of enabled line segments + size_t numMBLineSegments; //!< number of enabled line motion blurred segments + size_t numSubdivPatches; //!< number of enabled subdivision patches + size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches + size_t numUserGeometries; //!< number of enabled user geometries + size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries + size_t numInstancesCheap; //!< number of enabled cheap instances + size_t numMBInstancesCheap; //!< number of enabled motion blurred cheap instances + size_t numInstancesExpensive; //!< number of enabled expensive instances + size_t numMBInstancesExpensive; //!< number of enabled motion blurred expensive instances + size_t numGrids; //!< number of enabled grid geometries + size_t numMBGrids; //!< number of enabled motion 
blurred grid geometries + size_t numPoints; //!< number of enabled points + size_t numMBPoints; //!< number of enabled motion blurred points + }; + + /*! Base class all geometries are derived from */ + class Geometry : public RefCount + { + friend class Scene; + public: + + /*! type of geometry */ + enum GType + { + GTY_FLAT_LINEAR_CURVE = 0, + GTY_ROUND_LINEAR_CURVE = 1, + GTY_ORIENTED_LINEAR_CURVE = 2, + GTY_CONE_LINEAR_CURVE = 3, + + GTY_FLAT_BEZIER_CURVE = 4, + GTY_ROUND_BEZIER_CURVE = 5, + GTY_ORIENTED_BEZIER_CURVE = 6, + + GTY_FLAT_BSPLINE_CURVE = 8, + GTY_ROUND_BSPLINE_CURVE = 9, + GTY_ORIENTED_BSPLINE_CURVE = 10, + + GTY_FLAT_HERMITE_CURVE = 12, + GTY_ROUND_HERMITE_CURVE = 13, + GTY_ORIENTED_HERMITE_CURVE = 14, + + GTY_FLAT_CATMULL_ROM_CURVE = 16, + GTY_ROUND_CATMULL_ROM_CURVE = 17, + GTY_ORIENTED_CATMULL_ROM_CURVE = 18, + + GTY_TRIANGLE_MESH = 20, + GTY_QUAD_MESH = 21, + GTY_GRID_MESH = 22, + GTY_SUBDIV_MESH = 23, + + GTY_SPHERE_POINT = 25, + GTY_DISC_POINT = 26, + GTY_ORIENTED_DISC_POINT = 27, + + GTY_USER_GEOMETRY = 29, + GTY_INSTANCE_CHEAP = 30, + GTY_INSTANCE_EXPENSIVE = 31, + GTY_END = 32, + + GTY_BASIS_LINEAR = 0, + GTY_BASIS_BEZIER = 4, + GTY_BASIS_BSPLINE = 8, + GTY_BASIS_HERMITE = 12, + GTY_BASIS_CATMULL_ROM = 16, + GTY_BASIS_MASK = 28, + + GTY_SUBTYPE_FLAT_CURVE = 0, + GTY_SUBTYPE_ROUND_CURVE = 1, + GTY_SUBTYPE_ORIENTED_CURVE = 2, + GTY_SUBTYPE_MASK = 3, + }; + + enum GSubType + { + GTY_SUBTYPE_DEFAULT= 0, + GTY_SUBTYPE_INSTANCE_LINEAR = 0, + GTY_SUBTYPE_INSTANCE_QUATERNION = 1 + }; + + enum GTypeMask + { + MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE, + MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE, + MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE, + MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE, + + MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE, + MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE, + MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE, + + MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE, + MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE, + MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE, + + MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE, + MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE, + MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE, + + MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE, + MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE, + MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE, + + MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE | + MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE | + MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE | + MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT, + MTY_DISC_POINT = 1ul << GTY_DISC_POINT, + MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT, + + MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT, + + MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS, + + MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH, + MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH, + MTY_GRID_MESH = 1ul << GTY_GRID_MESH, + MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH, + MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY, + + MTY_INSTANCE_CHEAP = 1ul << 
GTY_INSTANCE_CHEAP, + MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE, + MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE + }; + + static const char* gtype_names[GTY_END]; + + enum class State : unsigned { + MODIFIED = 0, + COMMITTED = 1, + }; + + public: + + /*! Geometry constructor */ + Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps); + + /*! Geometry destructor */ + virtual ~Geometry(); + + public: + + /*! tests if geometry is enabled */ + __forceinline bool isEnabled() const { return enabled; } + + /*! tests if geometry is disabled */ + __forceinline bool isDisabled() const { return !isEnabled(); } + + /*! tests if that geometry has some filter function set */ + __forceinline bool hasFilterFunctions () const { + return (intersectionFilterN != nullptr) || (occlusionFilterN != nullptr); + } + + /*! returns geometry type */ + __forceinline GType getType() const { return gtype; } + + /*! returns curve type */ + __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); } + + /*! returns curve basis */ + __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); } + + /*! returns geometry type mask */ + __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); } + + /*! returns number of primitives */ + __forceinline size_t size() const { return numPrimitives; } + + /*! sets the number of primitives */ + virtual void setNumPrimitives(unsigned int numPrimitives_in); + + /*! sets number of time steps */ + virtual void setNumTimeSteps (unsigned int numTimeSteps_in); + + /*! sets motion blur time range */ + void setTimeRange (const BBox1f range); + + /*! sets number of vertex attributes */ + virtual void setVertexAttributeCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets number of topologies */ + virtual void setTopologyCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets the build quality */ + void setBuildQuality(RTCBuildQuality quality_in) + { + this->quality = quality_in; + Geometry::update(); + } + + /* calculate time segment itime and fractional time ftime */ + __forceinline int timeSegment(float time, float& ftime) const { + return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime); + } + + template<int N> + __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const { + return getTimeSegment<N>(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,fnumTimeSegments); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<(int)numTimeSteps); + return time_range.lower + time_range.size()*float(i)/fnumTimeSegments; + } + + /*! for all geometries */ + public: + + /*! Enable geometry. */ + virtual void enable(); + + /*! Update geometry. */ + void update(); + + /*! commit of geometry */ + virtual void commit(); + + /*! Update geometry buffer. */ + virtual void updateBuffer(RTCBufferType type, unsigned int slot) { + update(); // update everything for geometries not supporting this call + } + + /*! Disable geometry. */ + virtual void disable(); + + /*! 
Verify the geometry */ + virtual bool verify() { return true; } + + /*! called before every build */ + virtual void preCommit(); + + /*! called after every build */ + virtual void postCommit(); + + virtual void addElementsToCount (GeometryCounts & counts) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + }; + + /*! sets constant tessellation rate for the geometry */ + virtual void setTessellationRate(float N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets the maximal curve radius scale allowed by min-width feature. */ + virtual void setMaxRadiusScale(float s) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set user data pointer. */ + virtual void setUserData(void* ptr); + + /*! Get user data pointer. */ + __forceinline void* getUserData() const { + return userPtr; + } + + /*! interpolates user data to the specified u/v location */ + virtual void interpolate(const RTCInterpolateArguments* const args) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! interpolates user data to the specified u/v locations */ + virtual void interpolateN(const RTCInterpolateNArguments* const args); + + /* point query api */ + bool pointQuery(PointQuery* query, PointQueryContext* context); + + /*! for subdivision surfaces only */ + public: + virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set displacement function. */ + virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFirstHalfEdge(unsigned int faceID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFace(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getNextHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! get fast access to first vertex buffer if applicable */ + virtual float * getCompactVertexArray () const { + return nullptr; + } + + /*! Returns the modified counter - how many times the geo has been modified */ + __forceinline unsigned int getModCounter () const { + return modCounter_; + } + + /*! for triangle meshes and bezier curves only */ + public: + + + /*! Sets ray mask. */ + virtual void setMask(unsigned mask) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets specified buffer. 
*/ + virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Gets specified buffer. */ + virtual void* getBuffer(RTCBufferType type, unsigned int slot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersection filter function for ray packets of size N. */ + virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! Set occlusion filter function for ray packets of size N. */ + virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! for instances only */ + public: + + /*! Sets the instanced scene */ + virtual void setInstancedScene(const Ref<Scene>& scene) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Returns the transformation of the instance */ + virtual AffineSpace3fa getTransform(float time) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! for user geometries only */ + public: + + /*! Set bounds function. */ + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersect function for ray packets of size N. */ + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set occlusion function for ray packets of size N. */ + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set point query function. */ + void setPointQueryFunction(RTCPointQueryFunction func); + + /*! 
returns number of time segments */ + __forceinline unsigned numTimeSegments () const { + return numTimeSteps-1; + } + + public: + + virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); + } + + virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID, size_t time) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual BBox3fa vbounds(size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + public: + __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; } + __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; } + + public: + Device* device; //!< device this geometry belongs to + + void* userPtr; //!< user pointer + unsigned int numPrimitives; //!< number of primitives of this geometry + + unsigned int numTimeSteps; //!< number of time steps + float fnumTimeSegments; //!< number of time segments (precalculation) + BBox1f time_range; //!< motion blur time range + + unsigned int mask; //!< for masking out geometry + unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild 
scenes when geo is modified + + struct { + GType gtype : 8; //!< geometry type + GSubType gsubtype : 8; //!< geometry subtype + RTCBuildQuality quality : 3; //!< build quality for geometry + unsigned state : 2; + bool enabled : 1; //!< true if geometry is enabled + }; + + RTCFilterFunctionN intersectionFilterN; + RTCFilterFunctionN occlusionFilterN; + RTCPointQueryFunction pointQueryFunc; + }; +} diff --git a/thirdparty/embree/kernels/common/hit.h b/thirdparty/embree/kernels/common/hit.h new file mode 100644 index 0000000000..fd1a9d6391 --- /dev/null +++ b/thirdparty/embree/kernels/common/hit.h @@ -0,0 +1,114 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "instance_stack.h" + +namespace embree +{ + /* Hit structure for K hits */ + template<int K> + struct HitK + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng) + : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + instance_id_stack::copy_UV<K>(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return K; } + + public: + Vec3vf<K> Ng; // geometry normal + vfloat<K> u; // barycentric u coordinate of hit + vfloat<K> v; // barycentric v coordinate of hit + vuint<K> primID; // primitive ID + vuint<K> geomID; // geometry ID + vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single hit */ + template<> + struct __aligned(16) HitK<1> + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng) + : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID) + { + instance_id_stack::copy_UU(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return 1; } + + public: + Vec3<float> Ng; // geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Shortcuts */ + typedef HitK<1> Hit; + typedef HitK<4> Hit4; + typedef HitK<8> Hit8; + typedef HitK<16> Hit16; + + /* Outputs hit to stream */ + template<int K> + __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& ray) + { + cout << "{ " << embree_endl + << " Ng = " << ray.Ng << embree_endl + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + template<typename Hit> + __forceinline void copyHitToRay(RayHit& ray, const Hit& hit) + { + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = hit.primID; + ray.geomID = hit.geomID; + instance_id_stack::copy_UU(hit.instID, ray.instID); + } + + template<int K> + __forceinline void copyHitToRay(const 
vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit) + { + vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x); + vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y); + vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z); + vfloat<K>::storeu(mask,&ray.u, hit.u); + vfloat<K>::storeu(mask,&ray.v, hit.v); + vuint<K>::storeu(mask,&ray.primID, hit.primID); + vuint<K>::storeu(mask,&ray.geomID, hit.geomID); + instance_id_stack::copy_VV<K>(hit.instID, ray.instID, mask); + } +} diff --git a/thirdparty/embree/kernels/common/instance_stack.h b/thirdparty/embree/kernels/common/instance_stack.h new file mode 100644 index 0000000000..d3c0a643f1 --- /dev/null +++ b/thirdparty/embree/kernels/common/instance_stack.h @@ -0,0 +1,179 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore.h" + +namespace embree { +namespace instance_id_stack { + +static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, + "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0."); + +/******************************************************************************* + * Instance ID stack manipulation. + * This is used from the instance intersector. + ******************************************************************************/ + +/* + * Push an instance to the stack. + */ +RTC_FORCEINLINE bool push(RTCIntersectContext* context, + unsigned instanceId) +{ +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT; + /* We assert here because instances are silently dropped when the stack is full. + This might be quite hard to find in production. */ + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[context->instStackSize++] = instanceId; + return spaceAvailable; +#else + const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID); + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[0] = instanceId; + return spaceAvailable; +#endif +} + + +/* + * Pop the last instance pushed to the stack. + * Do not call on an empty stack. + */ +RTC_FORCEINLINE void pop(RTCIntersectContext* context) +{ + assert(context); +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + assert(context->instStackSize > 0); + context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID; +#else + assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +#endif +} + +/* + * Optimized instance id stack copy. + * The copy() functions will either copy full + * stacks or copy only until the last valid element has been copied, depending + * on RTC_MAX_INSTANCE_LEVEL_COUNT. 
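Editorial note on the `copy_*` helpers that follow: they copy an instance ID stack between scalar (`U`) and SIMD (`V`) representations, and for deep stacks (`RTC_MAX_INSTANCE_LEVEL_COUNT > 4`) they stop once the first `RTC_INVALID_GEOMETRY_ID` entry has been copied, since every level above it is invalid as well. The sketch below is an illustration only, not upstream Embree code: `kInvalidId` and `kMaxLevels` are stand-ins for `RTC_INVALID_GEOMETRY_ID` and `RTC_MAX_INSTANCE_LEVEL_COUNT`.

```cpp
// Editorial sketch (not part of upstream Embree): scalar illustration of the
// "copy until the last valid element has been copied" early-out used by copy_UU.
#include <cstdio>

constexpr unsigned kInvalidId = 0xFFFFFFFFu; // stand-in for RTC_INVALID_GEOMETRY_ID
constexpr unsigned kMaxLevels = 8;           // stand-in for RTC_MAX_INSTANCE_LEVEL_COUNT

// Copy src to tgt, stopping after the first invalid entry has itself been copied.
void copy_stack(const unsigned (&src)[kMaxLevels], unsigned (&tgt)[kMaxLevels]) {
    for (unsigned l = 0; l < kMaxLevels; ++l) {
        tgt[l] = src[l];
        if (src[l] == kInvalidId)
            break; // all deeper levels of src are invalid too
    }
}

int main() {
    unsigned src[kMaxLevels] = {3, 7, kInvalidId, kInvalidId,
                                kInvalidId, kInvalidId, kInvalidId, kInvalidId};
    unsigned tgt[kMaxLevels] = {};
    copy_stack(src, tgt);
    std::printf("%u %u %u\n", tgt[0], tgt[1], tgt[2]); // 3 7 4294967295
    return 0;
}
```

Note that the first invalid entry is copied before the loop breaks, so a destination that was pre-filled with `RTC_INVALID_GEOMETRY_ID` (as the `HitK` constructor above does for `instID`) stays consistent without the copy routine knowing the stack depth.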
+ */ +RTC_FORCEINLINE void copy_UU(const unsigned* src, unsigned* tgt) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0] = src[0]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l] = src[l]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0] = src[0]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l] = src[l]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt, size_t j) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0][j] = src[0]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l][j] = src[l]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt, const vbool<K>& mask) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + vuint<K>::store(mask, tgt, src[0]); + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + vuint<K>::store(mask, tgt + l, src[l]); + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_VU(const vuint<K>* src, unsigned* tgt, size_t i) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0] = src[0][i]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l] = src[l][i]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l][i] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_VV(const vuint<K>* src, vuint<K>* tgt, size_t i, size_t j) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0][j] = src[0][i]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l][j] = src[l][i]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l][i] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_VV(const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + vuint<K>::store(mask, tgt, src[0]); + +#else + vbool<K> done = !mask; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + vuint<K>::store(mask, tgt + l, src[l]); + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) { + done |= src[l] == RTC_INVALID_GEOMETRY_ID; + if (all(done)) break; + } + } +#endif +} + +} // namespace instance_id_stack +} // namespace embree diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h new file mode 100644 index 0000000000..ae6556336c --- /dev/null +++ b/thirdparty/embree/kernels/common/isa.h @@ -0,0 +1,246 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" + +namespace embree +{ +#define DEFINE_SYMBOL2(type,name) \ + typedef type (*name##Func)(); \ + name##Func name; + +#define DECLARE_SYMBOL2(type,name) \ + namespace sse2 { extern type name(); } \ + namespace sse42 { extern type name(); } \ + namespace avx { extern type name(); } \ + namespace avx2 { extern type name(); } \ + namespace avx512 { extern type name(); } \ + void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal 
error in ISA selection for " TOSTRING(name)); } \ + type name##_error() { return type(name##_error2); } \ + type name##_zero() { return type(nullptr); } + +#define DECLARE_ISA_FUNCTION(type,symbol,args) \ + namespace sse2 { extern type symbol(args); } \ + namespace sse42 { extern type symbol(args); } \ + namespace avx { extern type symbol(args); } \ + namespace avx2 { extern type symbol(args); } \ + namespace avx512 { extern type symbol(args); } \ + inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \ + typedef type (*symbol##Ty)(args); \ + +#define DEFINE_ISA_FUNCTION(type,symbol,args) \ + typedef type (*symbol##Func)(args); \ + symbol##Func symbol; + +#define ZERO_SYMBOL(features,intersector) \ + intersector = intersector##_zero; + +#define INIT_SYMBOL(features,intersector) \ + intersector = decltype(intersector)(intersector##_error); + +#define SELECT_SYMBOL_DEFAULT(features,intersector) \ + intersector = isa::intersector; + +#if defined(__SSE__) +#if !defined(EMBREE_TARGET_SIMD4) +#define EMBREE_TARGET_SIMD4 +#endif +#endif + +#if defined(EMBREE_TARGET_SSE42) +#define SELECT_SYMBOL_SSE42(features,intersector) \ + if ((features & SSE42) == SSE42) intersector = sse42::intersector; +#else +#define SELECT_SYMBOL_SSE42(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX) || defined(__AVX__) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & ISA) == ISA) intersector = isa::intersector; +#else +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & AVX) == AVX) intersector = avx::intersector; +#endif +#else +#define SELECT_SYMBOL_AVX(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX2) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#define SELECT_SYMBOL_AVX2(features,intersector) \ + if ((features & AVX2) == AVX2) intersector = avx2::intersector; +#else +#define SELECT_SYMBOL_AVX2(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX512) +#if !defined(EMBREE_TARGET_SIMD16) +#define EMBREE_TARGET_SIMD16 +#endif +#define SELECT_SYMBOL_AVX512(features,intersector) \ + if ((features & AVX512) == AVX512) intersector = avx512::intersector; +#else +#define SELECT_SYMBOL_AVX512(features,intersector) +#endif + +#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + 
SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define 
SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512(features,intersector) \ + ZERO_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + + struct VerifyMultiTargetLinking { + static __noinline int getISA(int depth = 5) { + if (depth == 0) return ISA; + else return getISA(depth-1); + } + }; + namespace sse2 { int getISA(); }; + namespace sse42 { int getISA(); }; + namespace avx { int getISA(); }; + namespace avx2 { int getISA(); }; + namespace avx512 { int getISA(); }; +} diff --git a/thirdparty/embree/kernels/common/motion_derivative.h b/thirdparty/embree/kernels/common/motion_derivative.h new file mode 100644 index 0000000000..c619d6a675 --- /dev/null +++ b/thirdparty/embree/kernels/common/motion_derivative.h @@ -0,0 +1,325 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/math/affinespace.h" +#include "../../common/math/interval.h" + +#include <functional> + +namespace embree { + +#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f + +static void motion_derivative_coefficients(const float *p, float *coeff); + +struct MotionDerivativeCoefficients +{ + float theta; + float coeffs[3*8*7]; + + MotionDerivativeCoefficients() {} + + // xfm0 and xfm1 are interpret as quaternion decomposition + MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1) + { + // cosTheta of the two quaternions + const float cosTheta = min(1.f, max(-1.f, + xfm0.l.vx.w * xfm1.l.vx.w + + xfm0.l.vy.w * xfm1.l.vy.w + + xfm0.l.vz.w * xfm1.l.vz.w + + xfm0.p.w * xfm1.p.w)); + + theta = std::acos(cosTheta); + Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w); + if (cosTheta < 0.995f) { + // compute perpendicular quaternion + qperp.x = xfm1.p.w - cosTheta * xfm0.p.w; + qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w; + qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w; + qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w; + qperp = normalize(qperp); + } + const float p[33] = { + theta, + xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0 + xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1 + xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0 + qperp.x, qperp.y, qperp.z, qperp.w, + xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0 + xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y, + xfm0.l.vz.z, xfm0.p.z, + xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1 + xfm1.l.vy.y, 
xfm1.l.vz.y, xfm1.p.y, + xfm1.l.vz.z, xfm1.p.z + }; + motion_derivative_coefficients(p, coeffs); + } +}; + +struct MotionDerivative +{ + float twoTheta; + float c[8]; + + MotionDerivative(MotionDerivativeCoefficients const& mdc, + int dim, Vec3fa const& p0, Vec3fa const& p1) + : twoTheta(2.f*mdc.theta) + { + const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z }; + for (int i = 0; i < 8; ++i) { + c[i] = 0; + for (int j = 0; j < 7; ++j) { + c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j]; + } + } + } + + template<typename T> + struct EvalMotionDerivative + { + MotionDerivative const& md; + float offset; + + EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {} + + T operator()(T const& time) const { + return md.c[0] + md.c[1] * time + + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time) + + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time) + + offset; + } + }; + + unsigned int findRoots( + Interval1f const& interval, + float offset, + float* roots, + unsigned int maxNumRoots) + { + unsigned int numRoots = 0; + EvalMotionDerivative<Interval1f> eval(*this, offset); + findRoots(eval, interval, numRoots, roots, maxNumRoots); + return numRoots; + } + + template<typename Eval> + static void findRoots( + + Eval const& eval, + Interval1f const& interval, + unsigned int& numRoots, + float* roots, + unsigned int maxNumRoots) + { + Interval1f range = eval(interval); + if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return; + + const float split = 0.5f * (interval.upper + interval.lower); + if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f || abs(split-interval.upper) < 1e-7f) + { + // check if the root already exists + for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) { + if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON) + return; + } + if (numRoots < maxNumRoots) { + roots[numRoots++] = split; + } + if (numRoots > maxNumRoots) { + printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS + return; + } + return; + } + + findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots); + findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots); + } +}; + +/****************************************************************************** + * Code generated with sympy 1.4 * + * See http://www.sympy.org/ for more information. 
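Editorial note on `MotionDerivative::findRoots` above: it isolates the zeros of the motion derivative by recursive bisection, first evaluating each time interval in interval arithmetic (`EvalMotionDerivative<Interval1f>`) and discarding any sub-interval whose value range excludes zero; intervals that shrink below a tolerance report a root, with `MOTION_DERIVATIVE_ROOT_EPSILON` used to drop near-duplicates. The generated `motion_derivative_coefficients` table that follows supplies the `c[]` terms that this derivative is built from. The sketch below is an illustration only, not Embree code: it applies the same prune-and-bisect scheme to a simple cubic, with a hand-written range bound standing in for Embree's `Interval1f` arithmetic.

```cpp
// Editorial sketch (not upstream Embree): prune-and-bisect root isolation in the
// spirit of MotionDerivative::findRoots, applied to f(t) = t^3 - t.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct Interval { float lo, hi; };

// Range bound of f(t) = t^3 - t on [lo, hi]: sample the endpoints and the
// interior critical points +-1/sqrt(3), then take the min and max.
static Interval bound(const Interval& I) {
    auto f = [](float t) { return t * t * t - t; };
    float lo = std::min(f(I.lo), f(I.hi));
    float hi = std::max(f(I.lo), f(I.hi));
    const float c = 1.0f / std::sqrt(3.0f);
    for (float t : {-c, c})
        if (t > I.lo && t < I.hi) { lo = std::min(lo, f(t)); hi = std::max(hi, f(t)); }
    return {lo, hi};
}

// Discard intervals whose bound excludes zero; otherwise split at the midpoint
// until the interval is small enough to report as a root.
static void findRoots(const Interval& I, std::vector<float>& roots) {
    const Interval r = bound(I);
    if (r.lo > 0.0f || r.hi < 0.0f) return;            // no zero crossing possible here
    const float mid = 0.5f * (I.lo + I.hi);
    if (I.hi - I.lo < 1e-5f) {
        for (float root : roots)
            if (std::fabs(root - mid) < 1e-4f) return; // drop duplicates, like MOTION_DERIVATIVE_ROOT_EPSILON
        roots.push_back(mid);
        return;
    }
    findRoots({I.lo, mid}, roots);
    findRoots({mid, I.hi}, roots);
}

int main() {
    std::vector<float> roots;
    findRoots({-2.0f, 2.0f}, roots);                   // expect roots near -1, 0 and 1
    for (float root : roots) std::printf("%.5f\n", root);
    return 0;
}
```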
* + * * + * see * + * * + * scripts/generate_motion_derivative_coefficients.py * + * * + * for how this code is generated * + * * + ******************************************************************************/ +static void motion_derivative_coefficients(const float *p, float *coeff) +{ + coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27]; + coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24]; + coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25]; + coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26]; + coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15]; + coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16]; + coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17]; + coeff[7] = 0; + coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24]; + coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25]; + coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26]; + coeff[11] = 2*p[9]*p[9]*p[15] - 
2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24]; + coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25]; + coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26]; + coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27]; + coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24]; + coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25]; + coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26]; + coeff[18] = 
(-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15]; + coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16]; + coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17]; + coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0]; + coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24]; + coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25]; + coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26]; + coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 
2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24]; + coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25]; + coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26]; + coeff[28] = 0; + coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0]; + coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0]; + coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0]; + coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0]; + coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0]; + coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0]; + coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 
2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27]; + coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24]; + coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25]; + coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26]; + coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15]; + coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16]; + coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17]; + coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0]; + coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 
2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24]; + coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25]; + coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26]; + coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24]; + coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25]; + coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] 
+ 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26]; + coeff[49] = 0; + coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0]; + coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0]; + coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0]; + coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0]; + coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0]; + coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0]; + coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30]; + coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24]; + coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28]; + coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 
2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29]; + coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15]; + coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19]; + coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20]; + coeff[63] = 0; + coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28]; + coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29]; + coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28]; + coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29]; + coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - 
p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30]; + coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24]; + coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28]; + coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29]; + coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15]; + coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19]; + coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20]; + coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0]; + coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 
2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28]; + coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29]; + coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28]; + coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29]; + coeff[84] = 0; + coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0]; + coeff[86] = 2*(p[7]*p[11]*p[19] - 
p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0]; + coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0]; + coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0]; + coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0]; + coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0]; + coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30]; + coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24]; + coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28]; + coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 
2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29]; + coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15]; + coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19]; + coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20]; + coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0]; + coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24]; + coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28]; + coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 
2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29]; + coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24]; + coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28]; + coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29]; + coeff[105] = 0; + coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0]; + coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0]; + coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0]; + coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0]; + coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - 
p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0]; + coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0]; + coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32]; + coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24]; + coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28]; + coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31]; + coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15]; + coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19]; + coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22]; + coeff[119] = 0; + coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 
2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31]; + coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31]; + coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30]; + coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24]; + coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28]; + coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 
p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29]; + coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15]; + coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19]; + coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20]; + coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0]; + coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29]; + coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 
2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29]; + coeff[140] = 0; + coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0]; + coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0]; + coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0]; + coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0]; + coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0]; + coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + 
p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0]; + coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32]; + coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24]; + coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28]; + coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31]; + coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15]; + coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19]; + coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22]; + coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0]; + coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 
4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24]; + coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28]; + coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31]; + coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24]; + coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28]; + coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 
2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31]; + coeff[161] = 0; + coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0]; + coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0]; + coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0]; + coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0]; + coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0]; + coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0]; +} + +} // namespace embree diff --git a/thirdparty/embree/kernels/common/point_query.h b/thirdparty/embree/kernels/common/point_query.h new file mode 100644 index 0000000000..7d55c91fff --- /dev/null +++ b/thirdparty/embree/kernels/common/point_query.h @@ -0,0 +1,136 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* Point query structure for closest point query */ + template<int K> + struct RTC_ALIGN(16) PointQueryK + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. 
Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero) + : p(p), time(time), radius(radius) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool<K> valid() const + { + const vbool<K> vx = (abs(p.x) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vy = (abs(p.y) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vz = (abs(p.z) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vn = radius >= vfloat<K>(0); + const vbool<K> vf = abs(time) < vfloat<K>(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(PointQueryK<1>* ray) const; + __forceinline void get(size_t i, PointQueryK<1>& ray) const; + __forceinline void set(const PointQueryK<1>* ray); + __forceinline void set(size_t i, const PointQueryK<1>& ray); + + Vec3vf<K> p; // location of the query point + vfloat<K> time; // time for motion blur + vfloat<K> radius; // radius for the point query + }; + + /* Specialization for a single point query */ + template<> + struct RTC_ALIGN(16) PointQueryK<1> + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero) + : p(p), time(time), radius(radius) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf); + } + + Vec3f p; + float time; + float radius; + }; + + /* Converts point query packet to single point query */ + template<int K> + __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + query[i].p.x = p.x[i]; + query[i].p.y = p.y[i]; + query[i].p.z = p.z[i]; + query[i].time = time[i]; + query[i].radius = radius[i]; + } + } + + /* Extracts a single point query out of a point query packet*/ + template<int K> + __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const + { + query.p.x = p.x[i]; + query.p.y = p.y[i]; + query.p.z = p.z[i]; + query.radius = radius[i]; + query.time = time[i]; + } + + /* Converts single point query to point query packet */ + template<int K> + __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query) + { + for (size_t i = 0; i < K; i++) + { + p.x[i] = query[i].p.x; + p.y[i] = query[i].p.y; + p.z[i] = query[i].p.z; + radius[i] = query[i].radius; + time[i] = query[i].time; + } + } + + /* inserts a single point query into a point query packet element */ + template<int K> + __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query) + { + p.x[i] = query.p.x; + p.y[i] = query.p.y; + p.z[i] = query.p.z; + radius[i] = query.radius; + time[i] = query.time; + } + + /* Shortcuts */ + typedef PointQueryK<1> PointQuery; + typedef PointQueryK<4> PointQuery4; + typedef PointQueryK<8> PointQuery8; + typedef PointQueryK<16> PointQuery16; + struct PointQueryN; + + /* Outputs point query to stream */ + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query) + { + cout << "{ " << embree_endl + << " p = " << query.p << embree_endl + << " r = " << query.radius << embree_endl + << " time 
= " << query.time << embree_endl + << "}"; + return cout; + } +} diff --git a/thirdparty/embree/kernels/common/primref.h b/thirdparty/embree/kernels/common/primref.h new file mode 100644 index 0000000000..d61763487b --- /dev/null +++ b/thirdparty/embree/kernels/common/primref.h @@ -0,0 +1,138 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(32) PrimRef + { + __forceinline PrimRef () {} + +#if defined(__AVX__) + __forceinline PrimRef(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); + } + __forceinline PrimRef& operator=(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this; + } +#endif + + __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) + { + lower = Vec3fx(bounds.lower, geomID); + upper = Vec3fx(bounds.upper, primID); + } + + __forceinline PrimRef (const BBox3fa& bounds, size_t id) + { +#if defined(__64BIT__) + lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF)); + upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF)); +#else + lower = Vec3fx(bounds.lower, (unsigned)id); + upper = Vec3fx(bounds.upper, (unsigned)0); +#endif + } + + /*! calculates twice the center of the primitive */ + __forceinline const Vec3fa center2() const { + return lower+upper; + } + + /*! return the bounding box of the primitive */ + __forceinline const BBox3fa bounds() const { + return BBox3fa(lower,upper); + } + + /*! size for bin heuristic is 1 */ + __forceinline unsigned size() const { + return 1; + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = embree::center2(bounds_o); + } + + __forceinline unsigned& geomIDref() { // FIXME: remove !!!!!!! + return lower.u; + } + __forceinline unsigned& primIDref() { // FIXME: remove !!!!!!! + return upper.u; + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__64BIT__) + return size_t(lower.u) + (size_t(upper.u) << 32); +#else + return size_t(lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }"; + } + + public: + Vec3fx lower; //!< lower bounds and geomID + Vec3fx upper; //!< upper bounds and primID + }; + + /*! 
fast exchange for PrimRefs */ + __forceinline void xchg(PrimRef& a, PrimRef& b) + { +#if defined(__AVX__) + const vfloat8 aa = vfloat8::load((float*)&a); + const vfloat8 bb = vfloat8::load((float*)&b); + vfloat8::store((float*)&a,bb); + vfloat8::store((float*)&b,aa); +#else + std::swap(a,b); +#endif + } + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct SubGridBuildData { + unsigned short sx,sy; + unsigned int primID; + + __forceinline SubGridBuildData() {}; + __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {}; + + __forceinline size_t x() const { return (size_t)sx & 0x7fff; } + __forceinline size_t y() const { return (size_t)sy & 0x7fff; } + + }; +} diff --git a/thirdparty/embree/kernels/common/primref_mb.h b/thirdparty/embree/kernels/common/primref_mb.h new file mode 100644 index 0000000000..fb08a05003 --- /dev/null +++ b/thirdparty/embree/kernels/common/primref_mb.h @@ -0,0 +1,262 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +#define MBLUR_BIN_LBBOX 1 + +namespace embree +{ +#if MBLUR_BIN_LBBOX + + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct PrimRefMB + { + typedef LBBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); + lbounds.bounds0.lower.a = geomID; + lbounds.bounds0.upper.a = primID; + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__64BIT__) + lbounds.bounds0.lower.a = id & 0xFFFFFFFF; + lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.a = id; + lbounds.bounds0.upper.a = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__64BIT__) + lbounds.bounds0.lower.u = id & 0xFFFFFFFF; + lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.u = id; + lbounds.bounds0.upper.u = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + /*! returns bounds for binning */ + __forceinline LBBox3fa bounds() const { + return lbounds; + } + + /*! 
returns the number of time segments of this primref */ + __forceinline unsigned size() const { + return lbounds.bounds1.lower.a; + } + + __forceinline unsigned totalTimeSegments() const { + return lbounds.bounds1.upper.a; + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(totalTimeSegments())); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)totalTimeSegments()); + return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments()); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = binCenter(); + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lbounds.bounds0.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return lbounds.bounds0.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__64BIT__) + return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32); +#else + return size_t(lbounds.bounds0.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + LBBox3fx lbounds; + BBox1f time_range; // entire geometry time range + }; + +#else + + /*! A primitive reference stores the bounds of the primitive and its ID. 
*/ + struct __aligned(16) PrimRefMB + { + typedef BBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); + bbox.lower.a = geomID; + bbox.upper.a = primID; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__64BIT__) + bbox.lower.u = id & 0xFFFFFFFF; + bbox.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + bbox.lower.u = id; + bbox.upper.u = 0; +#endif + } + + /*! returns bounds for binning */ + __forceinline BBox3fa bounds() const { + return bbox; + } + + /*! returns the number of time segments of this primref */ + __forceinline unsigned int size() const { + return _activeTimeSegments; + } + + __forceinline unsigned int totalTimeSegments() const { + return _totalTimeSegments; + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(_totalTimeSegments)); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)_totalTimeSegments); + return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(bounds()); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = center2(bounds()); + } + + /*! returns the geometry ID */ + __forceinline unsigned int geomID() const { + return bbox.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned int primID() const { + return bbox.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__64BIT__) + return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); +#else + return size_t(bbox.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. 
*/ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + BBox3fa bbox; // bounds, geomID, primID + unsigned int _activeTimeSegments; + unsigned int _totalTimeSegments; + BBox1f time_range; // entire geometry time range + }; + +#endif +} diff --git a/thirdparty/embree/kernels/common/profile.h b/thirdparty/embree/kernels/common/profile.h new file mode 100644 index 0000000000..5ef7f6ec0f --- /dev/null +++ b/thirdparty/embree/kernels/common/profile.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! helper structure for the implementation of the profile functions below */ + struct ProfileTimer + { + static const size_t N = 20; + + ProfileTimer () {} + + ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0) + { + for (size_t i=0; i<N; i++) names[i] = nullptr; + for (size_t i=0; i<N; i++) dt_fst[i] = 0.0; + for (size_t i=0; i<N; i++) dt_min[i] = pos_inf; + for (size_t i=0; i<N; i++) dt_avg[i] = 0.0; + for (size_t i=0; i<N; i++) dt_max[i] = neg_inf; + } + + __forceinline void begin() + { + j=0; + t0 = tj = getSeconds(); + } + + __forceinline void end() { + absolute("total"); + i++; + } + + __forceinline void operator() (const char* name) { + relative(name); + } + + __forceinline void absolute (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-t0; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + __forceinline void relative (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-tj; + tj = t1; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + void print(size_t numElements) + { + for (size_t k=0; k<N; k++) + dt_avg[k] /= double(i-numSkip); + + printf(" profile [M/s]:\n"); + for (size_t j=0; j<maxJ; j++) + printf("%20s: fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n", + names[j],numElements/dt_fst[j]*1E-6,numElements/dt_max[j]*1E-6,numElements/dt_avg[j]*1E-6,numElements/dt_min[j]*1E-6); + + printf(" profile [ms]:\n"); + for (size_t j=0; j<maxJ; j++) + printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n", + names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]); + } + + void print() + { + printf(" profile:\n"); + + for (size_t k=0; k<N; k++) + dt_avg[k] /= double(i-numSkip); + + for (size_t j=0; j<maxJ; j++) { + printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n", + names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]); + } + } + + double avg() { + return dt_avg[maxJ-1]/double(i-numSkip); + } + + private: + size_t i; + size_t j; + size_t maxJ; + size_t numSkip; + double t0; + double tj; + const char* names[N]; + double dt_fst[N]; + double dt_min[N]; + double dt_avg[N]; + double dt_max[N]; + }; + + /*! 
This function executes some code block multiple times and measured sections of it. + Use the following way: + + profile(1,10,1000,[&](ProfileTimer& timer) { + // code + timer("A"); + // code + timer("B"); + }); + */ + template<typename Closure> + void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + ProfileTimer timer(numSkip); + + for (size_t i=0; i<numSkip+numIter; i++) + { + timer.begin(); + closure(timer); + timer.end(); + } + timer.print(numElements); + } + + /*! similar as the function above, but the timer object comes externally */ + template<typename Closure> + void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + timer = ProfileTimer(numSkip); + + for (size_t i=0; i<numSkip+numIter; i++) + { + timer.begin(); + closure(timer); + timer.end(); + } + timer.print(numElements); + } +} diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h new file mode 100644 index 0000000000..7b951cc1e8 --- /dev/null +++ b/thirdparty/embree/kernels/common/ray.h @@ -0,0 +1,1517 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "instance_stack.h" + +// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted + +namespace embree +{ + static const size_t MAX_INTERNAL_STREAM_SIZE = 32; + + /* Ray structure for K rays */ + template<int K> + struct RayK + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir, + const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf, + const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0) + : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool<K> valid() const + { + const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf); + const vbool<K> vf = abs(tfar) <= vfloat<K>(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(RayK<1>* ray) const; + __forceinline void get(size_t i, RayK<1>& ray) const; + __forceinline void set(const RayK<1>* ray); + __forceinline void set(size_t i, const RayK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + __forceinline vint<K> octant() const + { + return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) | + select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) | + select(dir.z < 0.0f, vint<K>(4), vint<K>(zero)); + } + + /* Ray data */ + Vec3vf<K> org; // ray origin + vfloat<K> _tnear; // start of ray segment + Vec3vf<K> dir; // ray direction + vfloat<K> _time; // time of this ray for motion blur + vfloat<K> tfar; // end of ray segment + vint<K> mask; // used to mask out objects during traversal + vint<K> id; + vint<K> flags; + + __forceinline vfloat<K>& tnear() { return _tnear; } + 
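      // Editorial note (not part of the upstream patch): tnear()/time() are exposed through
      // accessors rather than used as plain members so that generic traversal code works for
      // both layouts -- the packet form above keeps them in the dedicated _tnear/_time fields,
      // while the single-ray specialization further below packs tnear into org.w and time into
      // dir.w (see "3 floats for ray origin, 1 float for tnear").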
__forceinline vfloat<K>& time() { return _time; } + __forceinline const vfloat<K>& tnear() const { return _tnear; } + __forceinline const vfloat<K>& time() const { return _time; } + }; + + /* Ray+hit structure for K rays */ + template<int K> + struct RayHitK : RayK<K> + { + using RayK<K>::org; + using RayK<K>::_tnear; + using RayK<K>::dir; + using RayK<K>::_time; + using RayK<K>::tfar; + using RayK<K>::mask; + using RayK<K>::id; + using RayK<K>::flags; + + using RayK<K>::tnear; + using RayK<K>::time; + + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir, + const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf, + const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0) + : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK(const RayK<K>& ray) + : RayK<K>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK<K>& operator =(const RayK<K>& ray) + { + org = ray.org; + _tnear = ray._tnear; + dir = ray.dir; + _time = ray._time; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit(const vbool<K>& valid0) const + { + vbool<K> valid = valid0 & geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID); + const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf)); + const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vv = (abs(u) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE); + const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE); + const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE); + if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t"); + if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u"); + if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v"); + if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x"); + if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y"); + if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z"); + } + + __forceinline void get(RayHitK<1>* ray) const; + __forceinline void get(size_t i, RayHitK<1>& ray) const; + __forceinline void set(const RayHitK<1>* ray); + __forceinline void set(size_t i, const RayHitK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + /* Hit data */ + Vec3vf<K> Ng; // geometry normal + vfloat<K> u; // barycentric u coordinate of hit + vfloat<K> v; // barycentric v coordinate of hit + vuint<K> primID; // primitive ID + vuint<K> geomID; // geometry ID + vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single ray */ + template<> + struct RayK<1> + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. 
Near + * has to be smaller than far */ + __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf); + } + + /* Ray data */ + Vec3ff org; // 3 floats for ray origin, 1 float for tnear + //float tnear; // start of ray segment + Vec3ff dir; // 3 floats for ray direction, 1 float for time + // float time; + float tfar; // end of ray segment + int mask; // used to mask out objects during traversal + int id; // ray ID + int flags; // ray flags + + __forceinline float& tnear() { return org.w; }; + __forceinline const float& tnear() const { return org.w; }; + + __forceinline float& time() { return dir.w; }; + __forceinline const float& time() const { return dir.w; }; + + }; + + template<> + struct RayHitK<1> : RayK<1> + { + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK(const RayK<1>& ray) + : RayK<1>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK<1>& operator =(const RayK<1>& ray) + { + org = ray.org; + dir = ray.dir; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit() const + { + if (geomID == RTC_INVALID_GEOMETRY_ID) return; + const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf)); + const bool vu = (abs(u) <= FLT_LARGE); + const bool vv = (abs(u) <= FLT_LARGE); + const bool vnx = abs(Ng.x) <= FLT_LARGE; + const bool vny = abs(Ng.y) <= FLT_LARGE; + const bool vnz = abs(Ng.z) <= FLT_LARGE; + if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t"); + if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u"); + if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v"); + if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x"); + if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y"); + if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z"); + } + + /* Hit data */ + Vec3f Ng; // not normalized geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Converts ray packet to single rays */ + template<int K> + __forceinline void RayK<K>::get(RayK<1>* ray) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = tnear()[i]; + ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time() = time()[i]; + ray[i].tfar = tfar[i]; ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i]; + } + } + + 
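The `RayK<1>` specialization above has no separate `tnear`/`time` members: it packs them into the fourth component of the 4-wide origin and direction vectors (`Vec3ff`), so a single ray stays compact and load-friendly. A tiny sketch of the same packing, with a hypothetical `Float4` standing in for Embree's `Vec3ff`:

```cpp
#include <cstdio>

struct Float4 { float x, y, z, w; }; // stand-in for embree::Vec3ff (3 floats + spare lane)

struct SingleRay {
  Float4 org;  // xyz = origin,    w = tnear
  Float4 dir;  // xyz = direction, w = time
  float  tfar;

  float&       tnear()       { return org.w; }
  const float& tnear() const { return org.w; }
  float&       time()        { return dir.w; }
  const float& time()  const { return dir.w; }
};

int main() {
  SingleRay ray;
  ray.org  = {0.f, 0.f, 0.f, /*tnear*/ 0.001f};
  ray.dir  = {0.f, 0.f, 1.f, /*time*/  0.5f};
  ray.tfar = 100.f;
  std::printf("tnear=%g time=%g\n", ray.tnear(), ray.time()); // reads org.w / dir.w
}
```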
template<int K> + __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + get(i, ray[i]); + } + + /* Extracts a single ray out of a ray packet*/ + template<int K> + __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time() = time()[i]; + ray.tfar = tfar[i]; ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + } + + template<int K> + __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar = tfar[i]; ray.time() = time()[i]; + ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i]; + ray.u = u[i]; ray.v = v[i]; + ray.primID = primID[i]; ray.geomID = geomID[i]; + + instance_id_stack::copy_VU<K>(instID, ray.instID, i); + } + + /* Converts single rays to ray packet */ + template<int K> + __forceinline void RayK<K>::set(const RayK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + template<int K> + __forceinline void RayHitK<K>::set(const RayHitK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + /* inserts a single ray into a ray packet element */ + template<int K> + __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + } + + template<int K> + __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z; + u[i] = ray.u; v[i] = ray.v; + primID[i] = ray.primID; geomID[i] = ray.geomID; + + instance_id_stack::copy_UV<K>(ray.instID, instID, i); + } + + /* copies a ray packet element into another element*/ + template<int K> + __forceinline void RayK<K>::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + } + + template<int K> + __forceinline void RayHitK<K>::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source]; + u[dest] = u[source]; v[dest] = 
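`RayK<K>::get(i, ...)` and `set(i, ...)` move one lane between the SoA packet and a scalar ray by copying each component at index `i`. A standalone sketch of that per-lane transpose, without SIMD and with a hypothetical 4-wide packet:

```cpp
#include <cstdio>

constexpr int K = 4;

struct ScalarRay { float ox, oy, oz, tnear, dx, dy, dz, time, tfar; };

// Hypothetical SoA ray packet: each component is an array of K lanes,
// mirroring RayK<K> where org.x, org.y, ... are K-wide vectors.
struct RayPacket {
  float ox[K], oy[K], oz[K], tnear[K];
  float dx[K], dy[K], dz[K], time[K], tfar[K];

  // Analogue of RayK<K>::get(i, ray): extract lane i into a scalar ray.
  ScalarRay get(int i) const {
    return { ox[i], oy[i], oz[i], tnear[i], dx[i], dy[i], dz[i], time[i], tfar[i] };
  }

  // Analogue of RayK<K>::set(i, ray): insert a scalar ray into lane i.
  void set(int i, const ScalarRay& r) {
    ox[i] = r.ox; oy[i] = r.oy; oz[i] = r.oz; tnear[i] = r.tnear;
    dx[i] = r.dx; dy[i] = r.dy; dz[i] = r.dz; time[i] = r.time; tfar[i] = r.tfar;
  }
};

int main() {
  RayPacket p = {};
  p.set(2, {1, 2, 3, 0.f, 0, 0, 1, 0.f, 50.f}); // write lane 2
  ScalarRay r = p.get(2);                       // read it back
  std::printf("lane 2 origin = (%g, %g, %g), tfar = %g\n", r.ox, r.oy, r.oz, r.tfar);
}
```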
v[source]; + primID[dest] = primID[source]; geomID[dest] = geomID[source]; + + instance_id_stack::copy_VV<K>(instID, instID, source, dest); + } + + /* Shortcuts */ + typedef RayK<1> Ray; + typedef RayK<4> Ray4; + typedef RayK<8> Ray8; + typedef RayK<16> Ray16; + struct RayN; + + typedef RayHitK<1> RayHit; + typedef RayHitK<4> RayHit4; + typedef RayHitK<8> RayHit8; + typedef RayHitK<16> RayHit16; + struct RayHitN; + + template<int K, bool intersect> + struct RayTypeHelper; + + template<int K> + struct RayTypeHelper<K, true> + { + typedef RayHitK<K> Ty; + }; + + template<int K> + struct RayTypeHelper<K, false> + { + typedef RayK<K> Ty; + }; + + template<bool intersect> + using RayType = typename RayTypeHelper<1, intersect>::Ty; + + template<int K, bool intersect> + using RayTypeK = typename RayTypeHelper<K, intersect>::Ty; + + /* Outputs ray to stream */ + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray) + { + return cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << "}"; + } + + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray) + { + cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << " Ng = " << ray.Ng + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + struct RayStreamSOA + { + __forceinline RayStreamSOA(void* rays, size_t N) + : ptr((char*)rays), N(N) {} + + /* ray data access functions */ + __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; } // x coordinate of ray origin + __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; } // y coordinate of ray origin + __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin + __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment + + __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction + __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction + __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction + __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur + + __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance) + __forceinline int* mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset]; }; // used to mask out objects 
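In `RayStreamSOA`, the raw buffer holds one block of `N` floats (`4*N` bytes) per component, laid out in the order org.x, org.y, org.z, tnear, dir.x, and so on, so component `c` of ray `i` lives at byte `c*4*N + 4*i`. A small sketch of that address computation, under the assumption that every component is a 32-bit value:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Byte offset of component `c` for ray `i` in an SoA stream of N rays,
// mirroring accessors like org_y(offset) = &ptr[1*4*N + offset] with offset = 4*i.
static size_t soa_byte_offset(size_t c, size_t i, size_t N) {
  return c * 4 * N + 4 * i;
}

int main() {
  const size_t N = 8;              // rays in the stream
  const size_t numComponents = 12; // org xyz, tnear, dir xyz, time, tfar, mask, id, flags
  std::vector<char> buffer(numComponents * 4 * N, 0);

  // Write dir.z (component index 6, matching &ptr[6*4*N + ...]) of ray 3.
  const float written = -1.0f;
  std::memcpy(&buffer[soa_byte_offset(6, 3, N)], &written, sizeof(float));

  float dz = 0.0f;
  std::memcpy(&dz, &buffer[soa_byte_offset(6, 3, N)], sizeof(float));
  std::printf("dir.z of ray 3 = %g\n", dz);
}
```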
during traversal (optional) + __forceinline int* id (size_t offset = 0) { return (int*)&ptr[10*4*N+offset]; }; // id + __forceinline int* flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset]; }; // flags + + /* hit data access functions */ + __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal + __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal + __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal + + __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }; // barycentric u coordinate of hit + __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }; // barycentric v coordinate of hit + + __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; }; // primitive ID + __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }; // geometry ID + __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }; // instance ID + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = org_x(offset)[0]; + ray.org.y = org_y(offset)[0]; + ray.org.z = org_z(offset)[0]; + ray.tnear() = tnear(offset)[0]; + ray.dir.x = dir_x(offset)[0]; + ray.dir.y = dir_y(offset)[0]; + ray.dir.z = dir_z(offset)[0]; + ray.time() = time(offset)[0]; + ray.tfar = tfar(offset)[0]; + ray.mask = mask(offset)[0]; + ray.id = id(offset)[0]; + ray.flags = flags(offset)[0]; + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(org_x(offset)); + ray.org.y = vfloat<K>::loadu(org_y(offset)); + ray.org.z = vfloat<K>::loadu(org_z(offset)); + ray.tnear = vfloat<K>::loadu(tnear(offset)); + ray.dir.x = vfloat<K>::loadu(dir_x(offset)); + ray.dir.y = vfloat<K>::loadu(dir_y(offset)); + ray.dir.z = vfloat<K>::loadu(dir_z(offset)); + ray.time = vfloat<K>::loadu(time(offset)); + ray.tfar = vfloat<K>::loadu(tfar(offset)); + ray.mask = vint<K>::loadu(mask(offset)); + ray.id = vint<K>::loadu(id(offset)); + ray.flags = vint<K>::loadu(flags(offset)); + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(valid, org_x(offset)); + ray.org.y = vfloat<K>::loadu(valid, org_y(offset)); + ray.org.z = vfloat<K>::loadu(valid, org_z(offset)); + ray.tnear() = vfloat<K>::loadu(valid, tnear(offset)); + ray.dir.x = vfloat<K>::loadu(valid, dir_x(offset)); + ray.dir.y = vfloat<K>::loadu(valid, dir_y(offset)); + ray.dir.z = vfloat<K>::loadu(valid, dir_z(offset)); + ray.time() = vfloat<K>::loadu(valid, time(offset)); + ray.tfar = vfloat<K>::loadu(valid, tfar(offset)); + +#if !defined(__AVX__) + /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked loads always access the entire vector */ + if (unlikely(!all(valid))) + { + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + ray.mask[k] = mask(offset)[k]; + ray.id[k] = id(offset)[k]; + ray.flags[k] = flags(offset)[k]; + } + } + } + else +#endif + { + ray.mask = vint<K>::loadu(valid, mask(offset)); + ray.id = vint<K>::loadu(valid, id(offset)); + 
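The `#if !defined(__AVX__)` branch above avoids SSE masked loads for the integer members because those would touch all 16 bytes even for inactive lanes and could fault at the end of a buffer; instead it zero-fills and copies only the active lanes one by one. A plain-C++ sketch of that "full-width load when every lane is active, scalar per-lane copy otherwise" policy, with hypothetical helper names:

```cpp
#include <array>
#include <bitset>
#include <cstdio>
#include <cstring>

constexpr int K = 4;
using Lanes = std::array<int, K>;

// Hypothetical analogue of the SSE fallback in getRayByOffset(): when every
// lane is valid the whole K-wide vector may be read at once; when some lanes
// are invalid only the elements that actually exist are touched.
static Lanes masked_load(const std::bitset<K>& valid, const int* src) {
  Lanes dst{};                                   // inactive lanes stay zero, like `ray.mask = zero;`
  if (valid.all()) {
    std::memcpy(dst.data(), src, sizeof(dst));   // full-width load is safe
  } else {
    for (int k = 0; k < K; k++)
      if (valid[k]) dst[k] = src[k];             // per-lane copy, never past the valid data
  }
  return dst;
}

int main() {
  int stream[K] = { 10, 11, 12, 13 };
  Lanes a = masked_load(std::bitset<K>("1111"), stream); // all lanes active
  Lanes b = masked_load(std::bitset<K>("0011"), stream); // only lanes 0 and 1 active
  std::printf("a[3]=%d b[3]=%d b[0]=%d\n", a[3], b[3], b[0]); // a[3]=13 b[3]=0 b[0]=10
}
```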
ray.flags = vint<K>::loadu(valid, flags(offset)); + } + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray) + { + /* + * valid_i: stores which of the input rays exist (do not access nonexistent rays!) + * valid: stores which of the rays actually hit something. + */ + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat<K>::storeu(valid, tfar(offset), ray.tfar); + vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x); + vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y); + vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z); + vfloat<K>::storeu(valid, u(offset), ray.u); + vfloat<K>::storeu(valid, v(offset), ray.v); + +#if !defined(__AVX__) + /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked stores always access the entire vector */ + if (unlikely(!all(valid_i))) + { + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + primID(offset)[k] = ray.primID[k]; + geomID(offset)[k] = ray.geomID[k]; + + instID(0, offset)[k] = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + instID(l, offset)[k] = ray.instID[l][k]; +#endif + } + } + } + else +#endif + { + vuint<K>::storeu(valid, primID(offset), ray.primID); + vuint<K>::storeu(valid, geomID(offset), ray.geomID); + + vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]); +#endif + } + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat<K>::storeu(valid, tfar(offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = dir_x(offset)[0]; + const float dy = dir_y(offset)[0]; + const float dz = dir_z(offset)[0]; + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 
4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear(offset)[0]; + const float ffar = tfar(offset)[0]; + return nnear <= ffar; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + RayK<K> ray; + +#if defined(__AVX2__) + ray.org.x = vfloat<K>::template gather<1>(valid, org_x(), offset); + ray.org.y = vfloat<K>::template gather<1>(valid, org_y(), offset); + ray.org.z = vfloat<K>::template gather<1>(valid, org_z(), offset); + ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset); + ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x(), offset); + ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y(), offset); + ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z(), offset); + ray.time() = vfloat<K>::template gather<1>(valid, time(), offset); + ray.tfar = vfloat<K>::template gather<1>(valid, tfar(), offset); + ray.mask = vint<K>::template gather<1>(valid, mask(), offset); + ray.id = vint<K>::template gather<1>(valid, id(), offset); + ray.flags = vint<K>::template gather<1>(valid, flags(), offset); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.time() = zero; + ray.tfar = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *org_x(ofs); + ray.org.y[k] = *org_y(ofs); + ray.org.z[k] = *org_z(ofs); + ray.tnear()[k] = *tnear(ofs); + ray.dir.x[k] = *dir_x(ofs); + ray.dir.y[k] = *dir_y(ofs); + ray.dir.z[k] = *dir_z(ofs); + ray.time()[k] = *time(ofs); + ray.tfar[k] = *tfar(ofs); + ray.mask[k] = *mask(ofs); + ray.id[k] = *id(ofs); + ray.flags[k] = *flags(ofs); + } + } +#endif + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); + vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x); + vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y); + vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, u(), offset, ray.u); + vfloat<K>::template scatter<1>(valid, v(), offset, ray.v); + vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID); + vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID); + + vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + + *Ng_x(ofs) = ray.Ng.x[k]; + *Ng_y(ofs) = ray.Ng.y[k]; + *Ng_z(ofs) = ray.Ng.z[k]; + *u(ofs) = ray.u[k]; + *v(ofs) = ray.v[k]; + *primID(ofs) = ray.primID[k]; + *geomID(ofs) = ray.geomID[k]; + + *instID(0, ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *instID(l, ofs) = ray.instID[l][k]; +#endif + } +#endif + } + } 
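`getOctantByOffset()` and `RayK::octant()` both reduce a direction to a 3-bit code: bit 0 is set when dir.x is negative, bit 1 for dir.y, bit 2 for dir.z, which lets the stream traversal group rays that point into the same octant. The same computation as a standalone scalar function:

```cpp
#include <cstdio>

// 3-bit octant code of a direction: bit0 = (dx < 0), bit1 = (dy < 0), bit2 = (dz < 0).
static unsigned octant_id(float dx, float dy, float dz) {
  return (dx < 0.0f ? 1u : 0u) | (dy < 0.0f ? 2u : 0u) | (dz < 0.0f ? 4u : 0u);
}

int main() {
  std::printf("%u\n", octant_id( 1.f,  1.f,  1.f)); // 0: all components non-negative
  std::printf("%u\n", octant_id(-1.f,  1.f,  1.f)); // 1: only x negative
  std::printf("%u\n", octant_id(-1.f, -1.f, -1.f)); // 7: all components negative
}
```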
+ + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + } +#endif + } + } + + char* __restrict__ ptr; + size_t N; + }; + + template<size_t MAX_K> + struct StackRayStreamSOA : public RayStreamSOA + { + __forceinline StackRayStreamSOA(size_t K) + : RayStreamSOA(data, K) { assert(K <= MAX_K); } + + char data[MAX_K / 4 * sizeof(RayHit4)]; + }; + + + struct RayStreamSOP + { + template<class T> + __forceinline void init(T& t) + { + org_x = (float*)&t.org.x; + org_y = (float*)&t.org.y; + org_z = (float*)&t.org.z; + tnear = (float*)&t.tnear; + dir_x = (float*)&t.dir.x; + dir_y = (float*)&t.dir.y; + dir_z = (float*)&t.dir.z; + time = (float*)&t.time; + tfar = (float*)&t.tfar; + mask = (unsigned int*)&t.mask; + id = (unsigned int*)&t.id; + flags = (unsigned int*)&t.flags; + + Ng_x = (float*)&t.Ng.x; + Ng_y = (float*)&t.Ng.y; + Ng_z = (float*)&t.Ng.z; + u = (float*)&t.u; + v = (float*)&t.v; + primID = (unsigned int*)&t.primID; + geomID = (unsigned int*)&t.geomID; + + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = (unsigned int*)&t.instID[l]; + } + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = *(float* __restrict__)((char*)org_x + offset); + ray.org.y = *(float* __restrict__)((char*)org_y + offset); + ray.org.z = *(float* __restrict__)((char*)org_z + offset); + ray.dir.x = *(float* __restrict__)((char*)dir_x + offset); + ray.dir.y = *(float* __restrict__)((char*)dir_y + offset); + ray.dir.z = *(float* __restrict__)((char*)dir_z + offset); + ray.tfar = *(float* __restrict__)((char*)tfar + offset); + ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + ray.time() = time ? *(float* __restrict__)((char*)time + offset) : 0.0f; + ray.mask = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1; + ray.id = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1; + ray.flags = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1; + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset)); + ray.org.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset)); + ray.org.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset)); + ray.dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + ray.dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + ray.dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + ray.tfar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + ray.time() = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f; + ray.mask = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1; + ray.id = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1; + ray.flags = flags ? 
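`RayStreamSOP` (structure of pointers) allows some streams to be absent: when `tnear`, `time`, `mask`, `id` or `flags` is a null pointer the loader substitutes a default (0 for tnear/time, -1 for mask/id/flags). A small sketch of that "optional stream with default" pattern, with hypothetical names:

```cpp
#include <cstddef>
#include <cstdio>

// Read element i of an optional stream; a null pointer means "stream not
// provided", in which case the caller-supplied default is used. This mirrors
// expressions like `ray.tnear() = tnear ? ... : 0.0f` in RayStreamSOP.
template <typename T>
static T load_or_default(const T* stream, size_t i, T def) {
  return stream ? stream[i] : def;
}

int main() {
  float tnear_stream[4] = { 0.1f, 0.2f, 0.3f, 0.4f };

  // tnear stream present, time stream missing, mask stream missing.
  float tnear = load_or_default(tnear_stream, 2, 0.0f);
  float time  = load_or_default<float>(nullptr, 2, 0.0f);
  int   mask  = load_or_default<int>(nullptr, 2, -1);

  std::printf("tnear=%g time=%g mask=%d\n", tnear, time, mask); // 0.3 0 -1
}
```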
vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1; + return ray; + } + + template<int K> + __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset) + { + Vec3vf<K> dir; + dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + return dir; + } + + __forceinline void setHitByOffset(size_t offset, const RayHit& ray) + { + if (ray.geomID != RTC_INVALID_GEOMETRY_ID) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z; + *(float* __restrict__)((char*)u + offset) = ray.u; + *(float* __restrict__)((char*)v + offset) = ray.v; + *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID; + *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l]; +#endif + } + } + } + + __forceinline void setHitByOffset(size_t offset, const Ray& ray) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + + if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x); + if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y); + if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z); + vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u); + vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v); + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID); + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID); + + if (likely(instID[0])) { + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]); +#endif + } + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = *(float* __restrict__)((char*)dir_x + offset); + const float dy = *(float* __restrict__)((char*)dir_y + offset); + const float dz = *(float* __restrict__)((char*)dir_z + offset); + 
const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + const float ffar = *(float* __restrict__)((char*)tfar + offset); + return nnear <= ffar; + } + + template<int K> + __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset) + { + const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + const vfloat<K> ffar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + return nnear <= ffar; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + RayK<K> ray; + +#if defined(__AVX2__) + ray.org.x = vfloat<K>::template gather<1>(valid, org_x, offset); + ray.org.y = vfloat<K>::template gather<1>(valid, org_y, offset); + ray.org.z = vfloat<K>::template gather<1>(valid, org_z, offset); + ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x, offset); + ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y, offset); + ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z, offset); + ray.tfar = vfloat<K>::template gather<1>(valid, tfar, offset); + ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero); + ray.time() = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero); + ray.mask = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1); + ray.id = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1); + ray.flags = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.tfar = zero; + ray.time() = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *(float* __restrict__)((char*)org_x + ofs); + ray.org.y[k] = *(float* __restrict__)((char*)org_y + ofs); + ray.org.z[k] = *(float* __restrict__)((char*)org_z + ofs); + ray.dir.x[k] = *(float* __restrict__)((char*)dir_x + ofs); + ray.dir.y[k] = *(float* __restrict__)((char*)dir_y + ofs); + ray.dir.z[k] = *(float* __restrict__)((char*)dir_z + ofs); + ray.tfar[k] = *(float* __restrict__)((char*)tfar + ofs); + ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f; + ray.time()[k] = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f; + ray.mask[k] = mask ? *(int* __restrict__)((char*)mask + ofs) : -1; + ray.id[k] = id ? *(int* __restrict__)((char*)id + ofs) : -1; + ray.flags[k] = flags ? 
*(int* __restrict__)((char*)flags + ofs) : -1; + } + } +#endif + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar); + + if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x); + if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y); + if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, u, offset, ray.u); + vfloat<K>::template scatter<1>(valid, v, offset, ray.v); + vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID); + vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID); + + if (likely(instID[0])) { + vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]); +#endif + } +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k]; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k]; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k]; + *(float* __restrict__)((char*)u + ofs) = ray.u[k]; + *(float* __restrict__)((char*)v + ofs) = ray.v[k]; + *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k]; + *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k]; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k]; +#endif + } + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + } +#endif + } + } + + /* ray data */ + float* __restrict__ org_x; // x coordinate of ray origin + float* __restrict__ org_y; // y coordinate of ray origin + float* __restrict__ org_z; // z coordinate of ray origin + float* __restrict__ tnear; // start of ray segment (optional) + + float* __restrict__ dir_x; // x coordinate of ray direction + float* __restrict__ dir_y; // y coordinate of ray direction + float* __restrict__ dir_z; // z coordinate of ray direction + float* __restrict__ time; // time of this ray for motion blur (optional) + + float* __restrict__ tfar; // end of ray segment (set to hit distance) + unsigned int* __restrict__ mask; // 
used to mask out objects during traversal (optional) + unsigned int* __restrict__ id; // ray ID + unsigned int* __restrict__ flags; // ray flags + + /* hit data */ + float* __restrict__ Ng_x; // x coordinate of geometry normal (optional) + float* __restrict__ Ng_y; // y coordinate of geometry normal (optional) + float* __restrict__ Ng_z; // z coordinate of geometry normal (optional) + + float* __restrict__ u; // barycentric u coordinate of hit + float* __restrict__ v; // barycentric v coordinate of hit + + unsigned int* __restrict__ primID; // primitive ID + unsigned int* __restrict__ geomID; // geometry ID + unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional) + }; + + + struct RayStreamAOS + { + __forceinline RayStreamAOS(void* rays) + : ptr((Ray*)rays) {} + + __forceinline Ray& getRayByOffset(size_t offset) + { + return *(Ray*)((char*)ptr + offset); + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vint<K>& offset); + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + const vint<K> valid_offset = select(valid, offset, vintx(zero)); + return getRayByOffset<K>(valid_offset); + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v); + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID); + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID); + + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + + instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k); + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + } 
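`RayStreamAOS` keeps whole ray/hit records back to back and addresses them by byte offset from a single base pointer, so for a tightly packed stream the offset of ray `i` is simply `i * sizeof(Record)`. A sketch of that addressing, using a hypothetical `Record` instead of the real `RTCRayHit` layout:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

struct Record { float org[3], tnear, dir[3], time, tfar; unsigned geomID; };

// Analogue of RayStreamAOS::getRayByOffset(): interpret `base + offset` as a record.
static Record& record_by_offset(char* base, size_t offset) {
  return *reinterpret_cast<Record*>(base + offset);
}

int main() {
  std::vector<Record> stream(8);
  char* base = reinterpret_cast<char*>(stream.data());

  // For a tightly packed array-of-structures stream the byte offset of ray i
  // is i * sizeof(Record); strided or scattered layouts just pass other offsets.
  const size_t i = 5;
  record_by_offset(base, i * sizeof(Record)).tfar = 42.0f;

  std::printf("stream[5].tfar = %g\n", stream[5].tfar);
}
```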
+#endif + } + } + + Ray* __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOS::getRayByOffset<4>(const vint4& offset) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir); + const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir); + const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir); + const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOS::getRayByOffset<8>(const vint8& offset) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOS::getRayByOffset<16>(const vint16& offset) 
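`getRayByOffset<4>` gathers four 16-byte chunks (org.x, org.y, org.z, tnear of each ray) and transposes them so that each output vector holds the same component of all four rays. Stripped of intrinsics, that is the familiar 4x4 matrix transpose; a sketch with plain arrays standing in for `vfloat4`:

```cpp
#include <cstdio>

// Transpose four 4-element rows (one per ray: {org.x, org.y, org.z, tnear})
// into four component vectors (all org.x, all org.y, all org.z, all tnear),
// i.e. what transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear())
// does with SSE shuffles.
static void transpose4x4(const float in[4][4], float out[4][4]) {
  for (int r = 0; r < 4; r++)
    for (int c = 0; c < 4; c++)
      out[c][r] = in[r][c];
}

int main() {
  // Four rays in AoS order: { org.x, org.y, org.z, tnear }.
  const float rays[4][4] = {
    { 0.f,  1.f,  2.f, 0.01f },
    { 3.f,  4.f,  5.f, 0.02f },
    { 6.f,  7.f,  8.f, 0.03f },
    { 9.f, 10.f, 11.f, 0.04f },
  };

  float soa[4][4]; // soa[0] = all org.x, soa[1] = all org.y, soa[2] = all org.z, soa[3] = all tnear
  transpose4x4(rays, soa);

  std::printf("org.x lanes: %g %g %g %g\n", soa[0][0], soa[0][1], soa[0][2], soa[0][3]); // 0 3 6 9
}
```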
+ { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org); + const vfloat8 ab8 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org); + const vfloat8 ab9 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org); + const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org); + const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org); + const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org); + const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org); + const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org); + const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar); + const vfloat4 c8 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar); + const vfloat4 c9 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar); + const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar); + const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar); + const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar); + const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar); + const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar); + const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + + + struct RayStreamAOP + { + __forceinline RayStreamAOP(void* rays) + : ptr((Ray**)rays) {} + + __forceinline Ray& getRayByIndex(size_t index) + { + return *ptr[index]; + } + + template<int K> + __forceinline RayK<K> getRayByIndex(const vint<K>& index); + + template<int K> + __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index) + { + const vint<K> valid_index = select(valid, index, vintx(zero)); + return 
getRayByIndex<K>(valid_index); + } + + template<int K> + __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k); + } + } + } + + template<int K> + __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + } + } + } + + Ray** __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOP::getRayByIndex<4>(const vint4& index) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org); + const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org); + const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org); + const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir); + const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir); + const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir); + const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOP::getRayByIndex<8>(const vint8& index) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = 
vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOP::getRayByIndex<16>(const vint16& index) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + const vfloat8 ab8 = vfloat8::loadu(&ptr[index[8]]->org); + const vfloat8 ab9 = vfloat8::loadu(&ptr[index[9]]->org); + const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org); + const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org); + const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org); + const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org); + const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org); + const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + const vfloat4 c8 = vfloat4::loadu(&ptr[index[8]]->tfar); + const vfloat4 c9 = vfloat4::loadu(&ptr[index[9]]->tfar); + const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar); + const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar); + const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar); + const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar); + const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar); + const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif +} diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp new file mode 100644 index 0000000000..94b3819e42 --- /dev/null +++ b/thirdparty/embree/kernels/common/rtcore.cpp @@ -0,0 +1,1766 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include 
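The `rtcore.cpp` added here implements the reference-counted public C API that consumers such as Godot's raycast module call: `rtcNewDevice`, `rtcReleaseDevice`, `rtcGetDeviceError`, `rtcSetDeviceErrorFunction`, and so on, all defined below. A minimal, hedged usage sketch from the consumer side against the public `embree3/rtcore.h` header; the error-callback signature is the standard Embree 3 one and is not spelled out in this file:

```cpp
#include <embree3/rtcore.h>
#include <cstdio>

// Standard Embree 3 error-callback shape: invoked for every error the device records.
static void on_device_error(void* /*userPtr*/, RTCError code, const char* message) {
  std::fprintf(stderr, "embree error %d: %s\n", (int)code, message ? message : "");
}

int main() {
  // rtcNewDevice returns a device holding one reference owned by the caller;
  // a null config string selects the defaults.
  RTCDevice device = rtcNewDevice(nullptr);
  if (device == nullptr) {
    // When creation itself fails, the error is stored per thread and can be
    // queried with a null device handle (see rtcGetDeviceError below).
    std::fprintf(stderr, "rtcNewDevice failed: %d\n", (int)rtcGetDeviceError(nullptr));
    return 1;
  }

  rtcSetDeviceErrorFunction(device, on_device_error, nullptr);

  if (rtcGetDeviceError(device) != RTC_ERROR_NONE)
    std::fprintf(stderr, "device reported an error during setup\n");

  // Drop our reference; the device is destroyed once the count reaches zero.
  rtcReleaseDevice(device);
  return 0;
}
```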
"../../include/embree3/rtcore_ray.h" +using namespace embree; + +RTC_NAMESPACE_BEGIN; + + /* mutex to make API thread safe */ + static MutexSys g_mutex; + + RTC_API RTCDevice rtcNewDevice(const char* config) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewDevice); + Lock<MutexSys> lock(g_mutex); + Device* device = new Device(config); + return (RTCDevice) device->refInc(); + RTC_CATCH_END(nullptr); + return (RTCDevice) nullptr; + } + + RTC_API void rtcRetainDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainDevice); + RTC_VERIFY_HANDLE(hdevice); + Lock<MutexSys> lock(g_mutex); + device->refInc(); + RTC_CATCH_END(nullptr); + } + + RTC_API void rtcReleaseDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseDevice); + RTC_VERIFY_HANDLE(hdevice); + Lock<MutexSys> lock(g_mutex); + device->refDec(); + RTC_CATCH_END(nullptr); + } + + RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceProperty); + RTC_VERIFY_HANDLE(hdevice); + Lock<MutexSys> lock(g_mutex); + return device->getProperty(prop); + RTC_CATCH_END(device); + return 0; + } + + RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceProperty); + const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; + if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings + Lock<MutexSys> lock(g_mutex); + device->setProperty(prop,val); + RTC_CATCH_END(device); + } + + RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceError); + if (device == nullptr) return Device::getThreadErrorCode(); + else return device->getDeviceErrorCode(); + RTC_CATCH_END(device); + return RTC_ERROR_UNKNOWN; + } + + RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceErrorFunction); + RTC_VERIFY_HANDLE(hdevice); + device->setErrorFunction(error, userPtr); + RTC_CATCH_END(device); + } + + RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceMemoryMonitorFunction); + device->setMemoryMonitorFunction(memoryMonitor, userPtr); + RTC_CATCH_END(device); + } + + RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewSharedBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API void* rtcGetBufferData(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetBufferData); + RTC_VERIFY_HANDLE(hbuffer); + return buffer->data(); + RTC_CATCH_END2(buffer); + return nullptr; 
+ } + + RTC_API void rtcRetainBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refInc(); + RTC_CATCH_END2(buffer); + } + + RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refDec(); + RTC_CATCH_END2(buffer); + } + + RTC_API RTCScene rtcNewScene (RTCDevice hdevice) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewScene); + RTC_VERIFY_HANDLE(hdevice); + Scene* scene = new Scene((Device*)hdevice); + return (RTCScene) scene->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneDevice); + RTC_VERIFY_HANDLE(hscene); + return (RTCDevice)scene->device->refInc(); // user will own one additional device reference + RTC_CATCH_END2(scene); + return (RTCDevice)nullptr; + } + + RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneProgressMonitorFunction); + RTC_VERIFY_HANDLE(hscene); + Lock<MutexSys> lock(g_mutex); + scene->setProgressMonitorFunction(progress,ptr); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneBuildQuality); + RTC_VERIFY_HANDLE(hscene); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + scene->setSceneFlags(flags); + RTC_CATCH_END2(scene); + } + + RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + return scene->getSceneFlags(); + RTC_CATCH_END2(scene); + return RTC_SCENE_FLAG_NONE; + } + + RTC_API void rtcCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(false); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcJoinCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcJoinCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(true); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + BBox3fa bounds = scene->bounds.bounds(); + bounds_o->lower_x = bounds.lower.x; + bounds_o->lower_y = bounds.lower.y; + bounds_o->lower_z = bounds.lower.z; + bounds_o->align0 = 0; + bounds_o->upper_x = bounds.upper.x; + bounds_o->upper_y = bounds.upper.y; + bounds_o->upper_z = bounds.upper.z; + bounds_o->align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void 
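`rtcCommitScene` must run before any query: `rtcGetSceneBounds` below explicitly rejects a modified (uncommitted) scene. A short sketch of the create/configure/commit/query sequence built from the entry points defined in this file (geometry creation is elided, so the bounds of the empty scene are degenerate; `rtcReleaseScene` comes from the same public API even though it appears later in the file):

```cpp
#include <embree3/rtcore.h>
#include <cstdio>

int main() {
  RTCDevice device = rtcNewDevice(nullptr);
  RTCScene  scene  = rtcNewScene(device);

  // Build quality and flags must be documented enum values; in this
  // Godot-patched build rtcSetSceneBuildQuality aborts on anything else.
  rtcSetSceneBuildQuality(scene, RTC_BUILD_QUALITY_HIGH);
  rtcSetSceneFlags(scene, RTC_SCENE_FLAG_NONE);

  // ... geometry creation and attachment would go here ...

  rtcCommitScene(scene); // builds the BVH; required before bounds/intersect/point queries

  RTCBounds bounds;
  rtcGetSceneBounds(scene, &bounds);
  std::printf("scene bounds: [%g %g %g] .. [%g %g %g]\n",
              bounds.lower_x, bounds.lower_y, bounds.lower_z,
              bounds.upper_x, bounds.upper_y, bounds.upper_z);

  rtcReleaseScene(scene);
  rtcReleaseDevice(device);
  return 0;
}
```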
rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (bounds_o == nullptr) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer"); + if (scene->isModified()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + + bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x; + bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y; + bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z; + bounds_o->bounds0.align0 = 0; + bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x; + bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y; + bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z; + bounds_o->bounds0.align1 = 0; + bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x; + bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y; + bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z; + bounds_o->bounds1.align0 = 0; + bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x; + bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y; + bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z; + bounds_o->bounds1.align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr) + { + Scene* scene0 = (Scene*) hscene0; + Scene* scene1 = (Scene*) hscene1; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCollide); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene0); + RTC_VERIFY_HANDLE(hscene1); + if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices"); + auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + if (scene0->numPrimitives() != nUserPrims0 && scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep"); +#endif + scene0->intersectors.collide(scene0,scene1,callback,userPtr); + RTC_CATCH_END(scene0->device); + } + + inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + bool changed = false; + if (userContext->instStackSize > 0) + { + const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + + float similarityScale = 0.f; + const bool similtude = similarityTransform(transform, &similarityScale); + assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f)); + + PointQuery query_inst; + query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z)); + query_inst.radius = query->radius * similarityScale; + query_inst.time = query->time; + + PointQueryContext context_inst(scene, (PointQuery*)query, + similtude ? 
POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB, + queryFunc, userContext, similarityScale, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst); + } + else + { + PointQueryContext context(scene, (PointQuery*)query, + POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)query, &context); + } + return changed; + } + + RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(userContext); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); + if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes"); +#endif + + return pointQuery(scene, query, userContext, queryFunc, userPtr); + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery4* query4 = (PointQuery4*)query; + PointQuery query1; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + query4->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query4->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery8* query8 = (PointQuery8*)query; + PointQuery query1; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + query8->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query8->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* 
userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery16* query16 = (PointQuery16*)query; + PointQuery query1; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + PointQuery query1; query16->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query16->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT3(normal.travs,1,1,1); + IntersectContext context(scene,user_context); + scene->intersectors.intersect(*rayhit,&context); +#if defined(DEBUG) + ((RayHit*)rayhit)->verifyHit(); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray4* ray4 = (Ray4*) rayhit; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + RayHit ray1; ray4->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + ray4->set(i,ray1); + } +#else + scene->intersectors.intersect4(valid,*rayhit,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)rayhit) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if 
!defined(EMBREE_RAY_PACKETS) + Ray8* ray8 = (Ray8*) rayhit; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + RayHit ray1; ray8->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + ray8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.intersect8(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)rayhit) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray16* ray16 = (Ray16*) rayhit; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + RayHit ray1; ray16->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + ray16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.intersect16(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) + scene->intersectors.intersect(*rayhit,&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) + scene->intersectors.intersect(*rn[0],&context); + } + + /* codepath for streams */ + else { + 
scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N*M,N*M,N*M); + IntersectContext context(scene,user_context); + + /* code path for single ray streams */ + if (likely(N == 1)) + { + /* fast code path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar)) + scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context); + } + /* normal codepath for single ray streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes"); + if (((size_t)rayhit->ray.time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes"); + if (((size_t)rayhit->ray.mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_z ) & 0x03 ) 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes"); + if (((size_t)rayhit->hit.u ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes"); + if (((size_t)rayhit->hit.v ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes"); + if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1); + STAT3(shadow.travs,1,1,1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + IntersectContext context(scene,user_context); + scene->intersectors.occluded(*ray,&context); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit4* ray4 = (RayHit4*) ray; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + RayHit ray1; ray4->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray4->geomID[i] = ray1.geomID; + } +#else + scene->intersectors.occluded4(valid,*ray,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)ray) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit8* ray8 = (RayHit8*) ray; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + RayHit ray1; 
ray8->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.occluded8(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)ray) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit16* ray16 = (RayHit16*) ray; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + RayHit ray1; ray16->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.occluded16(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray->tnear <= ray->tfar)) + scene->intersectors.occluded (*ray,&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray[0]->tnear <= ray[0]->tfar)) + scene->intersectors.occluded (*ray[0],&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + 
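
To give context for how the exported single-ray entry points above (`rtcIntersect1`, `rtcOccluded1`) are driven by a caller, here is a minimal, self-contained usage sketch: device → triangle geometry → committed scene → one hit query and one occlusion query. It is illustrative only and not part of the vendored sources; it assumes the public Embree 3 API from `include/embree3/rtcore.h` (which this file includes), including `rtcInitIntersectContext`, the `RTC_BUFFER_TYPE_*`/`RTC_FORMAT_*` constants, and the convention that `rtcOccluded1` sets `ray.tfar` to `-inf` when the ray is occluded. Note that in this vendored copy the `RTC_CATCH_*`/`throw_RTCError` macros (see `rtcore.h` below) compile exception handling out, so hard errors abort; recoverable errors are still reported through `rtcGetDeviceError`.

```cpp
// Illustrative usage sketch (not part of the vendored Embree sources).
#include <embree3/rtcore.h>
#include <cstdio>
#include <limits>

int main()
{
    RTCDevice device = rtcNewDevice(nullptr);   // rtcNewDevice as defined above
    RTCScene  scene  = rtcNewScene(device);

    // One triangle, uploaded through rtcSetNewGeometryBuffer (which, as noted
    // later in this file, over-allocates vertex buffers slightly for SSE loads).
    RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_TRIANGLE);
    float* vb = (float*)rtcSetNewGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0,
                                                RTC_FORMAT_FLOAT3, 3 * sizeof(float), 3);
    vb[0] = 0.f; vb[1] = 0.f; vb[2] = 0.f;
    vb[3] = 1.f; vb[4] = 0.f; vb[5] = 0.f;
    vb[6] = 0.f; vb[7] = 1.f; vb[8] = 0.f;
    unsigned* ib = (unsigned*)rtcSetNewGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0,
                                                      RTC_FORMAT_UINT3, 3 * sizeof(unsigned), 1);
    ib[0] = 0; ib[1] = 1; ib[2] = 2;
    rtcCommitGeometry(geom);
    rtcAttachGeometry(scene, geom);
    rtcReleaseGeometry(geom);
    rtcCommitScene(scene);                      // builds the BVH; required before queries

    // Single-ray hit query (the rtcIntersect1 path above).
    RTCIntersectContext context;
    rtcInitIntersectContext(&context);
    RTCRayHit rayhit = {};
    rayhit.ray.org_x = 0.25f; rayhit.ray.org_y = 0.25f; rayhit.ray.org_z = 1.f;
    rayhit.ray.dir_x = 0.f;   rayhit.ray.dir_y = 0.f;   rayhit.ray.dir_z = -1.f;
    rayhit.ray.tnear = 0.f;
    rayhit.ray.tfar  = std::numeric_limits<float>::infinity();
    rayhit.ray.mask  = -1;
    rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
    rtcIntersect1(scene, &context, &rayhit);
    if (rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
        std::printf("hit geometry %u at t=%f\n", rayhit.hit.geomID, rayhit.ray.tfar);

    // Single-ray occlusion query (the rtcOccluded1 path above):
    // tfar is set to -inf when the segment [tnear, tfar] is occluded.
    RTCRay shadow = rayhit.ray;
    shadow.tfar = std::numeric_limits<float>::infinity();
    rtcInitIntersectContext(&context);
    rtcOccluded1(scene, &context, &shadow);
    bool occluded = (shadow.tfar < 0.f);
    std::printf("occluded: %d, device error: %d\n", (int)occluded, (int)rtcGetDeviceError(device));

    rtcReleaseScene(scene);
    rtcReleaseDevice(device);
    return 0;
}
```

The stack-allocated `RTCRayHit`/`RTCRay` structures satisfy the 16-byte alignment that the debug checks in `rtcIntersect1`/`rtcOccluded1` above enforce, because the public header declares them with 16-byte alignment.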
RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small"); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N*M,N*N,N*N); + IntersectContext context(scene,user_context); + + /* codepath for single rays */ + if (likely(N == 1)) + { + /* fast path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar)) + scene->intersectors.occluded (*(RTCRay*)ray,&context); + } + /* codepath for normal ray streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes"); + if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes"); + if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes"); + if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes"); + if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes"); + if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes"); + if (((size_t)ray->time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes"); + if (((size_t)ray->mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainScene); + RTC_VERIFY_HANDLE(hscene); + scene->refInc(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcReleaseScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseScene); + RTC_VERIFY_HANDLE(hscene); + scene->refDec(); + 
RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref<Scene> scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryInstancedScene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hscene); + geometry->setInstancedScene(scene); + RTC_CATCH_END2(geometry); + } + + AffineSpace3fa loadTransform(RTCFormat format, const float* xfm) + { + AffineSpace3fa space = one; + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]), + Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]), + Vec3fa(xfm[ 2], xfm[ 6], xfm[10]), + Vec3fa(xfm[ 3], xfm[ 7], xfm[11])); + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]), + Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]), + Vec3fa(xfm[ 9], xfm[10], xfm[11])); + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]), + Vec3fa(xfm[ 8], xfm[ 9], xfm[10]), + Vec3fa(xfm[12], xfm[13], xfm[14])); + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + return space; + } + + void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm) + { + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vy.x; xfm[ 2] = space.l.vz.x; xfm[ 3] = space.p.x; + xfm[ 4] = space.l.vx.y; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vz.y; xfm[ 7] = space.p.y; + xfm[ 8] = space.l.vx.z; xfm[ 9] = space.l.vy.z; xfm[10] = space.l.vz.z; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; + xfm[ 3] = space.l.vy.x; xfm[ 4] = space.l.vy.y; xfm[ 5] = space.l.vy.z; + xfm[ 6] = space.l.vz.x; xfm[ 7] = space.l.vz.y; xfm[ 8] = space.l.vz.z; + xfm[ 9] = space.p.x; xfm[10] = space.p.y; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; xfm[ 3] = 0.f; + xfm[ 4] = space.l.vy.x; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vy.z; xfm[ 7] = 0.f; + xfm[ 8] = space.l.vz.x; xfm[ 9] = space.l.vz.y; xfm[10] = space.l.vz.z; xfm[11] = 0.f; + xfm[12] = space.p.x; xfm[13] = space.p.y; xfm[14] = space.p.z; xfm[15] = 1.f; + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + } + + RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransform); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(xfm); + const AffineSpace3fa transform = loadTransform(format, (const float*)xfm); + geometry->setTransform(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransformQuaternion); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(qd); + + AffineSpace3fx transform; + transform.l.vx.x = qd->scale_x; + transform.l.vy.y = qd->scale_y; + transform.l.vz.z = qd->scale_z; + transform.l.vy.x = qd->skew_xy; + transform.l.vz.x = qd->skew_xz; + 
transform.l.vz.y = qd->skew_yz; + transform.l.vx.y = qd->translation_x; + transform.l.vx.z = qd->translation_y; + transform.l.vy.z = qd->translation_z; + transform.p.x = qd->shift_x; + transform.p.y = qd->shift_y; + transform.p.z = qd->shift_z; + + // normalize quaternion + Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k); + q = normalize(q); + transform.l.vx.w = q.i; + transform.l.vy.w = q.j; + transform.l.vz.w = q.k; + transform.p.w = q.r; + + geometry->setQuaternionDecomposition(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryTransform); + const AffineSpace3fa transform = geometry->getTransform(time); + storeTransform(transform, format, (float*)xfm); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewGeometry); + RTC_VERIFY_HANDLE(hdevice); + + switch (type) + { + case RTC_GEOMETRY_TYPE_TRIANGLE: + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + createTriangleMeshTy createTriangleMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createTriangleMesh); + Geometry* geom = createTriangleMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_QUAD: + { +#if defined(EMBREE_GEOMETRY_QUAD) + createQuadMeshTy createQuadMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createQuadMesh); + Geometry* geom = createQuadMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + case RTC_GEOMETRY_TYPE_DISC_POINT: + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + { +#if defined(EMBREE_GEOMETRY_POINT) + createPointsTy createPoints = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_builder_cpu_features, createPoints); + + Geometry *geom; + switch(type) { + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + geom = createPoints(device, Geometry::GTY_SPHERE_POINT); + break; + case RTC_GEOMETRY_TYPE_DISC_POINT: + geom = createPoints(device, Geometry::GTY_DISC_POINT); + break; + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT); + break; + default: + geom = nullptr; + break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE: + case 
RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE: + { +#if defined(EMBREE_GEOMETRY_CURVE) + createLineSegmentsTy createLineSegments = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createLineSegments); + createCurvesTy createCurves = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createCurves); + + Geometry* geom; + switch (type) { + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break; + //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break; + default: geom = nullptr; break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SUBDIVISION: + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + createSubdivMeshTy createSubdivMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh); + //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason? 
+ Geometry* geom = createSubdivMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_USER: + { +#if defined(EMBREE_GEOMETRY_USER) + createUserGeometryTy createUserGeometry = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createUserGeometry); + Geometry* geom = createUserGeometry(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_INSTANCE: + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + createInstanceTy createInstance = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createInstance); + Geometry* geom = createInstance(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_GRID: + { +#if defined(EMBREE_GEOMETRY_GRID) + createGridMeshTy createGridMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createGridMesh); + Geometry* geom = createGridMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported"); +#endif + } + + default: + throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type"); + } + + RTC_CATCH_END(device); + return nullptr; + } + + RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserPrimitiveCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); + + geometry->setNumPrimitives(userPrimitiveCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeStepCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (timeStepCount > RTC_MAX_TIME_STEP_COUNT) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range"); + + geometry->setNumTimeSteps(timeStepCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime) + { + Ref<Geometry> geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeRange); + RTC_VERIFY_HANDLE(hgeometry); + + if (startTime > endTime) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime"); + + geometry->setTimeRange(BBox1f(startTime,endTime)); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTopologyCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTopologyCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuildQuality 
(RTCGeometry hgeometry, RTCBuildQuality quality) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuildQuality); + RTC_VERIFY_HANDLE(hgeometry); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMaxRadiusScale); + RTC_VERIFY_HANDLE(hgeometry); +#if RTC_MIN_WIDTH + if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1"); + geometry->setMaxRadiusScale(maxRadiusScale); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled"); +#endif + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMask); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setMask(mask); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometrySubdivisionMode); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setSubdivisionMode(topologyID,mode); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeTopology); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeTopology(vertexAttributeID, topologyID); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref<Buffer> buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hbuffer); + + if (geometry->device != buffer->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSharedGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + Ref<Buffer> buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void* 
rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetNewGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */ + size_t bytes = itemCount*byteStride; + if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + bytes += (16 - (byteStride%16))%16; + + Ref<Buffer> buffer = new Buffer(geometry->device, bytes); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + return buffer->data(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryBufferData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getBuffer(type, slot); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcEnableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->enable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcUpdateGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + geometry->updateBuffer(type, slot); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDisableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->disable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTessellationRate); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTessellationRate(tessellationRate); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setUserData(ptr); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; // no ref counting here! 
+ RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getUserData(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBoundsFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setBoundsFunction(bounds,userPtr); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryDisplacementFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setDisplacementFunction(displacement); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectFunctionN(intersect); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryPointQueryFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setPointQueryFunction(pointQuery); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFirstHalfEdge); + return geometry->getFirstHalfEdge(faceID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFace); + return geometry->getFace(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryNextHalfEdge); + return geometry->getNextHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryPreviousHalfEdge); + return geometry->getPreviousHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryOppositeHalfEdge); + return geometry->getOppositeHalfEdge(topologyID,edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetOccludedFunctionN); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOccludedFunctionN(occluded); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + 
RTC_TRACE(rtcSetGeometryIntersectFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryOccludedFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOcclusionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolate); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolate(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolateN); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolateN(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcCommitGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitGeometry); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->commit(); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry); + RTC_CATCH_END2(scene); + return -1; + } + + RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometryByID); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_GEOMID(geomID); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + scene->bind(geomID,geometry); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDetachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); + scene->detachGeometry(geomID); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refInc(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refDec(); + RTC_CATCH_END2(geometry); + } + + RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometry); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); +#endif + return (RTCGeometry) scene->get(geomID); + RTC_CATCH_END2(scene); + return nullptr; + } + +RTC_NAMESPACE_END diff --git 
a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h new file mode 100644 index 0000000000..373e49a689 --- /dev/null +++ b/thirdparty/embree/kernels/common/rtcore.h @@ -0,0 +1,142 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../include/embree3/rtcore.h" +RTC_NAMESPACE_USE + +namespace embree +{ + /*! decoding of intersection flags */ + __forceinline bool isCoherent (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; } + __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8) +# define USE_TASK_ARENA 1 +#else +# define USE_TASK_ARENA 0 +#endif + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9 +# define TASKING_TBB_USE_TASK_ISOLATION 1 +#else +# define TASKING_TBB_USE_TASK_ISOLATION 0 +#endif + +/*! Macros used in the rtcore API implementation */ +// -- GODOT start -- +// #define RTC_CATCH_BEGIN try { +#define RTC_CATCH_BEGIN + +// #define RTC_CATCH_END(device) \ +// } catch (std::bad_alloc&) { \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END(device) + +// #define RTC_CATCH_END2(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END2(scene) + +// #define RTC_CATCH_END2_FALSE(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// return false; \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// return false; \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// return false; \ +// } catch (...) { \ +// Device* device = scene ? 
scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// return false; \ +// } +#define RTC_CATCH_END2_FALSE(scene) return false; +// -- GODOT end -- + +#define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_GEOMID(id) \ + if (id == RTC_INVALID_GEOMETRY_ID) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_UPPER(id,upper) \ + if (id > upper) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_RANGE(id,lower,upper) \ + if (id < lower || id > upper) \ + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds"); + +#if 0 // enable to debug print all API calls +#define RTC_TRACE(x) std::cout << #x << std::endl; +#else +#define RTC_TRACE(x) +#endif + +// -- GODOT begin -- +// /*! used to throw embree API errors */ +// struct rtcore_error : public std::exception +// { +// __forceinline rtcore_error(RTCError error, const std::string& str) +// : error(error), str(str) {} +// +// ~rtcore_error() throw() {} +// +// const char* what () const throw () { +// return str.c_str(); +// } +// +// RTCError error; +// std::string str; +// }; +// -- GODOT end -- + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ + abort(); + // -- GODOT end -- +#endif + +#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ + (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) +} diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp new file mode 100644 index 0000000000..1f1b6f6ddf --- /dev/null +++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp @@ -0,0 +1,442 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "alloc.h" + +#include "../builders/bvh_builder_sah.h" +#include "../builders/bvh_builder_morton.h" + +namespace embree +{ + namespace isa // FIXME: support more ISAs for builders + { + struct BVH : public RefCount + { + BVH (Device* device) + : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0) + { + device->refInc(); + } + + ~BVH() { + device->refDec(); + } + + public: + Device* device; + FastAllocator allocator; + mvector<BVHBuilderMorton::BuildPrim> morton_src; + mvector<BVHBuilderMorton::BuildPrim> morton_tmp; + }; + + void* rtcBuildBVHMorton(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims_i = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + 
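      /* The Morton builder below proceeds in three steps: (1) compute the
       * centroid bounds of all input primitives in parallel, (2) assign each
       * primitive a Morton code relative to those bounds, and (3) recursively
       * build the tree, delegating node allocation (createNode/setNodeBounds/
       * setNodeChildren) and leaf creation (createLeaf) to the user callbacks
       * unpacked above. As a minimal client-side sketch: fill an
       * RTCBuildArguments with bvh = rtcNewBVH(device), the primitive array
       * and these callbacks, then call rtcBuildBVH(); RTC_BUILD_QUALITY_LOW
       * routes to this Morton path (see rtcBuildBVH further below). */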
RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* initialize temporary arrays for morton builder */ + PrimRef* prims = (PrimRef*) prims_i; + mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src; + mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp; + morton_src.resize(primitiveCount); + morton_tmp.resize(primitiveCount); + + /* compute centroid bounds */ + const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa { + + BBox3fa bounds(empty); + for (size_t i=r.begin(); i<r.end(); i++) + bounds.extend(prims[i].bounds().center2()); + return bounds; + }, BBox3fa::merge); + + /* compute morton codes */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]); + for (size_t i=r.begin(); i<r.end(); i++) { + generator(prims[i].bounds(),(unsigned) i); + } + }); + + /* start morton build */ + std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that allocates BVH nodes */ + [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* { + return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + }, + + /* lambda function that sets bounds */ + [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa> + { + BBox3fa bounds = empty; + void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) { + bounds.extend(children[i].second); + childptrs[i] = children[i].first; + cbounds[i] = (const RTCBounds*)&children[i].second; + } + setNodeBounds(node,cbounds,(unsigned int)N,userPtr); + setNodeChildren(node,childptrs, (unsigned int)N,userPtr); + return std::make_pair(node,bounds); + }, + + /* lambda function that creates BVH leaves */ + [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa> + { + RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; + BBox3fa bounds = empty; + for (size_t i=0;i<current.size();i++) + { + const size_t id = morton_src[current.begin()+i].index; + bounds.extend(prims[id].bounds()); + localBuildPrims[i] = prims_i[id]; + } + void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr); + return std::make_pair(node,bounds); + }, + + /* lambda that calculates the bounds for some primitive */ + [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa { + return prims[morton.index].bounds(); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + morton_src.data(),morton_tmp.data(),primitiveCount, + *arguments); + + bvh->allocator.cleanup(); + return root.first; + } + + void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + 
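      /* Like the Morton path above, this binned-SAH path only drives the
       * generic builder template: the RTCBuildPrimitive array passed in via
       * arguments->primitives is reinterpreted in place as PrimRefs (no copy),
       * and node/leaf memory is allocated by the user callbacks, which receive
       * a thread-local handle into the per-BVH FastAllocator
       * (see rtcThreadLocalAlloc below). */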
RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* calculate priminfo */ + auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t j=r.begin(); j<r.end(); j++) + bounds.extend((BBox3fa&)prims[j]); + return bounds; + }; + const CentGeomBBox3fa bounds = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* build BVH */ + void* root = BVHBuilderBinnedSAH::build<void*>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds; + setNodeBounds(node,cbounds, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that updates BVH nodes */ + [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims,pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + + static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) { + CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first); + unsigned int maxGeomID = max(a.second,b.second); + return std::pair<CentGeomBBox3fa,unsigned int>(centBounds,maxGeomID); + } + + void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> 
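      /* The spatial-SAH path additionally uses the user splitPrimitive
       * callback: primitives that straddle a split plane are clipped into two
       * PrimRefs that keep the original geomID/primID (see the Splitter helper
       * below). Because a few high geomID bits are reserved for this
       * bookkeeping, the code falls back to the plain binned-SAH builder when
       * maxGeomID would not fit (RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS). */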
progress(0); + + /* calculate priminfo */ + + auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int> + { + CentGeomBBox3fa bounds(empty); + unsigned maxGeomID = 0; + for (size_t j=r.begin(); j<r.end(); j++) + { + bounds.extend((BBox3fa&)prims[j]); + maxGeomID = max(maxGeomID,prims[j].geomID); + } + return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID); + }; + + + const std::pair<CentGeomBBox3fa,unsigned int> pair = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair); + + CentGeomBBox3fa bounds = pair.first; + const unsigned int maxGeomID = pair.second; + + if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)))) + { + /* fallback code for max geomID larger than threshold */ + return rtcBuildBVHBinnedSAH(arguments); + } + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* function that splits a build primitive */ + struct Splitter + { + Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr) + : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {} + + __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const + { + prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK; + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + left_o.geomIDref() = geomID; left_o.primIDref() = primID; + right_o.geomIDref() = geomID; right_o.primIDref() = primID; + } + + __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const + { + PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID); + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + } + + RTCSplitPrimitiveFunction splitPrimitive; + unsigned geomID; + unsigned primID; + void* userPtr; + }; + + /* build BVH */ + void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds; + setNodeBounds(node,cbounds, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that updates BVH nodes */ + [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* returns the splitter */ + [&] ( const PrimRef& prim ) -> Splitter { + return 
Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims, + arguments->primitiveArrayCapacity, + pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + } +} + +using namespace embree; +using namespace embree::isa; + +RTC_NAMESPACE_BEGIN + + RTC_API RTCBVH rtcNewBVH(RTCDevice device) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewAllocator); + RTC_VERIFY_HANDLE(device); + BVH* bvh = new BVH((Device*)device); + return (RTCBVH) bvh->refInc(); + RTC_CATCH_END((Device*)device); + return nullptr; + } + + RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcBuildBVH); + RTC_VERIFY_HANDLE(bvh); + RTC_VERIFY_HANDLE(arguments); + RTC_VERIFY_HANDLE(arguments->createNode); + RTC_VERIFY_HANDLE(arguments->setNodeChildren); + RTC_VERIFY_HANDLE(arguments->setNodeBounds); + RTC_VERIFY_HANDLE(arguments->createLeaf); + + if (arguments->primitiveArrayCapacity < arguments->primitiveCount) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount") + + /* initialize the allocator */ + bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); + bvh->allocator.reset(); + + /* switch between differnet builders based on quality level */ + if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW) + return rtcBuildBVHMorton(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) + return rtcBuildBVHBinnedSAH(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) { + if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount) + return rtcBuildBVHBinnedSAH(arguments); + else + return rtcBuildBVHSpatialSAH(arguments); + } + else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality"); + + /* if we are in dynamic mode, then do not clear temporary data */ + if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) + { + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + } + + RTC_CATCH_END(bvh->device); + return nullptr; + } + + RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) + { + FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcThreadLocalAlloc); + return alloc->malloc0(bytes,align); + RTC_CATCH_END(alloc->alloc->getDevice()); + return nullptr; + } + + RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcStaticBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + RTC_CATCH_END(bvh->device); + } + + RTC_API void rtcRetainBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refInc(); + RTC_CATCH_END(device); + } + + RTC_API void rtcReleaseBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? 
bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refDec(); + RTC_CATCH_END(device); + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp new file mode 100644 index 0000000000..408d7eae6f --- /dev/null +++ b/thirdparty/embree/kernels/common/scene.cpp @@ -0,0 +1,955 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + /* error raising rtcIntersect and rtcOccluded functions */ + void missing_rtcCommit() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); } + void invalid_rtcIntersect1() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); } + void invalid_rtcIntersect4() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); } + void invalid_rtcIntersect8() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); } + void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); } + void invalid_rtcIntersectN() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); } + + Scene::Scene (Device* device) + : device(device), + flags_modified(true), enabled_geometry_types(0), + scene_flags(RTC_SCENE_FLAG_NONE), + quality_flags(RTC_BUILD_QUALITY_MEDIUM), + is_build(false), modified(true), + progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0) + { + device->refInc(); + + intersectors = Accel::Intersectors(missing_rtcCommit); + + /* one can overwrite flags through device for debugging */ + if (device->quality_flags != -1) + quality_flags = (RTCBuildQuality) device->quality_flags; + if (device->scene_flags != -1) + scene_flags = (RTCSceneFlags) device->scene_flags; + } + + Scene::~Scene() noexcept + { + device->refDec(); + } + + void Scene::printStatistics() + { + /* calculate maximum number of time segments */ + unsigned max_time_steps = 0; + for (size_t i=0; i<size(); i++) { + if (!get(i)) continue; + max_time_steps = max(max_time_steps,get(i)->numTimeSteps); + } + + /* initialize vectors*/ + std::vector<size_t> statistics[Geometry::GTY_END]; + for (size_t i=0; i<Geometry::GTY_END; i++) + statistics[i].resize(max_time_steps); + + /* gather statistics */ + for (size_t i=0; i<size(); i++) + { + if (!get(i)) continue; + int ty = get(i)->getType(); + assert(ty<Geometry::GTY_END); + int timesegments = get(i)->numTimeSegments(); + assert((unsigned int)timesegments < max_time_steps); + statistics[ty][timesegments] += get(i)->size(); + } + + /* print statistics */ + std::cout << std::setw(23) << "segments" << ": "; + for (size_t t=0; t<max_time_steps; t++) + std::cout << std::setw(10) << t; + std::cout << std::endl; + + std::cout << "-------------------------"; + for (size_t t=0; t<max_time_steps; t++) + std::cout << "----------"; + std::cout << std::endl; + + for (size_t p=0; p<Geometry::GTY_END; p++) + { + if (std::string(Geometry::gtype_names[p]) == "") continue; + std::cout << std::setw(23) << Geometry::gtype_names[p] << ": "; + for (size_t t=0; t<max_time_steps; t++) + std::cout << std::setw(10) << statistics[p][t]; + std::cout << std::endl; + } + } + + void Scene::createTriangleAccel() + { 
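  /* Acceleration structure selection for triangles: the 2-bit 'mode' computed
   * below encodes the scene flags (bit 1 = RTC_SCENE_FLAG_COMPACT, bit 0 =
   * RTC_SCENE_FLAG_ROBUST). Low build quality picks the dynamic builder
   * variants, AVX-capable devices prefer the 8-wide BVH factories, and the
   * device's "tri_accel" string can force a specific structure. The other
   * create*Accel() functions below follow the same pattern for their
   * respective geometry types. */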
+#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + + break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->tri_accel == "bvh4.triangle4") accels_add(device->bvh4_factory->BVH4Triangle4 (this)); + else if (device->tri_accel == "bvh4.triangle4v") accels_add(device->bvh4_factory->BVH4Triangle4v(this)); + else if (device->tri_accel == "bvh4.triangle4i") accels_add(device->bvh4_factory->BVH4Triangle4i(this)); + else if (device->tri_accel == "qbvh4.triangle4i") 
accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel == "bvh8.triangle4") accels_add(device->bvh8_factory->BVH8Triangle4 (this)); + else if (device->tri_accel == "bvh8.triangle4v") accels_add(device->bvh8_factory->BVH8Triangle4v(this)); + else if (device->tri_accel == "bvh8.triangle4i") accels_add(device->bvh8_factory->BVH8Triangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4i") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel); +#endif + } + + void Scene::createTriangleMBAccel() + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb); +#endif + } + + void Scene::createQuadAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + /* static */ + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + 
accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->quad_accel == "bvh4.quad4v") accels_add(device->bvh4_factory->BVH4Quad4v(this)); + else if (device->quad_accel == "bvh4.quad4i") accels_add(device->bvh4_factory->BVH4Quad4i(this)); + else if (device->quad_accel == "qbvh4.quad4i") accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel == "bvh8.quad4v") accels_add(device->bvh8_factory->BVH8Quad4v(this)); + else if (device->quad_accel == "bvh8.quad4i") accels_add(device->bvh8_factory->BVH8Quad4i(this)); + else if (device->quad_accel == "qbvh8.quad4i") accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel); +#endif + } + + void 
Scene::createQuadMBAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb); +#endif + } + + void Scene::createHairAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); + else if 
(device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel); +#endif + } + + void Scene::createHairMBAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower + { + if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + } + } + else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb); +#endif + } + + void Scene::createSubdivAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + } + else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel); +#endif + } + + void Scene::createSubdivMBAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel_mb == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this)); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb); +#endif + } + + void Scene::createUserGeometryAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel == "bvh8.object") 
accels_add(device->bvh8_factory->BVH8UserGeometry(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel); +#endif + } + + void Scene::createUserGeometryMBAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel_mb == "default" ) { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); + else +#endif + accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); + } + else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb); +#endif + } + + void Scene::createInstanceAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, false)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, false)); + } + //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createInstanceExpensiveAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceExpensiveMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, true)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, true)); + } + 
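  /* Instances are handled by two parallel sets of builders: the "cheap"
   * variants above pass 'false' and these "expensive" variants pass 'true' to
   * the BVH*Instance / BVH*InstanceMB factories, so the two classes
   * (MTY_INSTANCE_CHEAP vs. MTY_INSTANCE_EXPENSIVE, see commit_task below)
   * end up in separate acceleration structures. */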
//else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createGridAccel() + { + BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + else +#endif + { + accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + } + else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel); +#endif + + } + + void Scene::createGridMBAccel() + { +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel_mb == "default") + { + accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC)); + } + else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel); +#endif + + } + + void Scene::clear() { + } + + unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry) + { + Lock<SpinLock> lock(geometriesMutex); + if (geomID == RTC_INVALID_GEOMETRY_ID) { + geomID = id_pool.allocate(); + if (geomID == RTC_INVALID_GEOMETRY_ID) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene"); + } + else + { + if (!id_pool.add(geomID)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided"); + } + if (geomID >= geometries.size()) { + geometries.resize(geomID+1); + vertices.resize(geomID+1); + geometryModCounters_.resize(geomID+1); + } + geometries[geomID] = geometry; + geometryModCounters_[geomID] = 0; + if (geometry->isEnabled()) { + setModified (); + } + return geomID; + } + + void Scene::detachGeometry(size_t geomID) + { + Lock<SpinLock> lock(geometriesMutex); + + if (geomID >= geometries.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID"); + + Ref<Geometry>& geometry = geometries[geomID]; + if (geometry == null) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); + + if (geometry->isEnabled()) { + setModified (); + } + accels_deleteGeometry(unsigned(geomID)); + id_pool.deallocate((unsigned)geomID); + geometries[geomID] = null; + vertices[geomID] = nullptr; + geometryModCounters_[geomID] = 0; + } + + void Scene::updateInterface() + { + is_build = true; + } + + void Scene::commit_task () + { + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* print scene statistics */ + if (device->verbosity(2)) + printStatistics(); + + progress_monitor_counter = 0; + + /* gather scene stats and call preCommit function of each geometry */ + this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), + [this](const range<size_t>& r)->GeometryCounts + { + GeometryCounts c; + for (auto i=r.begin(); i<r.end(); ++i) + { + if (geometries[i] && geometries[i]->isEnabled()) + { + geometries[i]->preCommit(); + geometries[i]->addElementsToCount (c); + c.numFilterFunctions += 
(int) geometries[i]->hasFilterFunctions(); + } + } + return c; + }, + std::plus<GeometryCounts>() + ); + + /* select acceleration structures to build */ + unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask(); + if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) + { + accels_init(); + + /* we need to make all geometries modified, otherwise two level builder will + not rebuild currently not modified geometries */ + parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) { + geometryModCounters_[i] = 0; + }); + + if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel(); + if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel(); + if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel(); + if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel(); + if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel(); + if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel(); + if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel(); + if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel(); + + flags_modified = false; + enabled_geometry_types = new_enabled_geometry_types; + } + + /* select fast code path if no filter function is present */ + accels_select(hasFilterFunction()); + + /* build all hierarchies of this scene */ + accels_build(); + + /* make static geometry immutable */ + if (!isDynamicAccel()) { + accels_immutable(); + flags_modified = true; // in non-dynamic mode we have to re-create accels + } + + /* call postCommit function of each geometry */ + parallel_for(geometries.size(), [&] ( const size_t i ) { + if (geometries[i] && geometries[i]->isEnabled()) { + geometries[i]->postCommit(); + vertices[i] = geometries[i]->getCompactVertexArray(); + geometryModCounters_[i] = geometries[i]->getModCounter(); + } + }); + + updateInterface(); + + if (device->verbosity(2)) { + std::cout << "created scene intersector" << std::endl; + accels_print(2); + std::cout << "selected scene intersector" << std::endl; + intersectors.print(2); + } + + setModified(false); + } + + void Scene::setBuildQuality(RTCBuildQuality quality_flags_i) + { + if (quality_flags == quality_flags_i) return; + quality_flags = quality_flags_i; + flags_modified = true; + } + + RTCBuildQuality Scene::getBuildQuality() const { + return quality_flags; + } + + void Scene::setSceneFlags(RTCSceneFlags scene_flags_i) + { + if (scene_flags == scene_flags_i) return; + scene_flags = scene_flags_i; + flags_modified = true; + } + + RTCSceneFlags Scene::getSceneFlags() const { + return scene_flags; + } + +#if defined(TASKING_INTERNAL) + + void Scene::commit (bool join) + { + Lock<MutexSys> buildLock(buildMutex,false); + + /* allocates own taskscheduler for each build */ + Ref<TaskScheduler> scheduler = nullptr; + { + Lock<MutexSys> 
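      /* Commit synchronization (internal tasking): the first thread to commit
       * creates the TaskScheduler lazily and holds the build lock while
       * commit_task() runs; other threads calling with join=true attach to the
       * running build via scheduler->join(). In this Godot build the original
       * try/catch around spawn_root is commented out, matching the
       * exception-free error handling in rtcore.h. */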
lock(schedulerMutex); + scheduler = this->scheduler; + if (scheduler == null) { + buildLock.lock(); + this->scheduler = scheduler = new TaskScheduler; + } + } + + /* worker threads join build */ + if (!buildLock.isLocked()) + { + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); + + scheduler->join(); + return; + } + + /* initiate build */ + // -- GODOT start -- + // try { + scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); + // } + // catch (...) { + // accels_clear(); + // updateInterface(); + // Lock<MutexSys> lock(schedulerMutex); + // this->scheduler = nullptr; + // throw; + // } + // -- GODOT end -- + } + +#endif + +#if defined(TASKING_TBB) + + void Scene::commit (bool join) + { +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version"); +#endif + + /* try to obtain build lock */ + Lock<MutexSys> lock(buildMutex,buildMutex.try_lock()); + + /* join hierarchy build */ + if (!lock.isLocked()) + { +#if !TASKING_TBB_USE_TASK_ISOLATION + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); +#endif + + do { + +#if USE_TASK_ARENA + if (join) { + device->arena->execute([&]{ group.wait(); }); + } + else +#endif + { + group.wait(); + } + + pause_cpu(); + yield(); + } while (!buildMutex.try_lock()); + + buildMutex.unlock(); + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { +#if TBB_INTERFACE_VERSION_MAJOR < 8 + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); +#else + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); +#endif + //ctx.set_priority(tbb::priority_high); + +#if USE_TASK_ARENA + if (join) + { + device->arena->execute([&]{ + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + }); + } + else +#endif + { + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + } + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + } + catch (...) + { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + +#if defined(TASKING_PPL) + + void Scene::commit (bool join) + { +#if defined(TASKING_PPL) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL"); +#endif + + /* try to obtain build lock */ + Lock<MutexSys> lock(buildMutex); + + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { + + group.run([&]{ + concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); }); + }); + group.wait(); + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + } + catch (...) 
+ { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + + void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) + { + progress_monitor_function = func; + progress_monitor_ptr = ptr; + } + + void Scene::progressMonitor(double dn) + { + if (progress_monitor_function) { + size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn)); + if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) { + throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination"); + } + } + } +} diff --git a/thirdparty/embree/kernels/common/scene.h b/thirdparty/embree/kernels/common/scene.h new file mode 100644 index 0000000000..5ed80a63f6 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene.h @@ -0,0 +1,390 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "builder.h" +#include "../../common/algorithms/parallel_any_of.h" +#include "scene_triangle_mesh.h" +#include "scene_quad_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_line_segments.h" +#include "scene_subdiv_mesh.h" +#include "scene_grid_mesh.h" +#include "scene_points.h" +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +namespace embree +{ + /*! Base class all scenes are derived from */ + class Scene : public AccelN + { + ALIGNED_CLASS_(std::alignment_of<Scene>::value); + + public: + template<typename Ty, bool mblur = false> + class Iterator + { + public: + Iterator () {} + + Iterator (Scene* scene, bool all = false) + : scene(scene), all(all) {} + + __forceinline Ty* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!all && !geom->isEnabled()) return nullptr; + const size_t mask = geom->getTypeMask() & Ty::geom_type; + if (!(mask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return nullptr; + return (Ty*) geom; + } + + __forceinline Ty* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + __forceinline size_t numPrimitives() const { + return scene->getNumPrimitives(Ty::geom_type,mblur); + } + + __forceinline size_t maxPrimitivesPerGeometry() + { + size_t ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->size()); + } + return ret; + } + + __forceinline unsigned int maxGeomID() + { + unsigned int ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,(unsigned int)i); + } + return ret; + } + + __forceinline unsigned maxTimeStepsPerGeometry() + { + unsigned ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->numTimeSteps); + } + return ret; + } + + private: + Scene* scene; + bool all; + }; + + class Iterator2 + { + public: + Iterator2 () {} + + Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) + : scene(scene), typemask(typemask), mblur(mblur) {} + + __forceinline Geometry* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!geom->isEnabled()) return nullptr; + if (!(geom->getTypeMask() & typemask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return 
nullptr; + return geom; + } + + __forceinline Geometry* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + private: + Scene* scene; + Geometry::GTypeMask typemask; + bool mblur; + }; + + public: + + /*! Scene construction */ + Scene (Device* device); + + /*! Scene destruction */ + ~Scene () noexcept; + + private: + /*! class is non-copyable */ + Scene (const Scene& other) DELETED; // do not implement + Scene& operator= (const Scene& other) DELETED; // do not implement + + public: + void createTriangleAccel(); + void createTriangleMBAccel(); + void createQuadAccel(); + void createQuadMBAccel(); + void createHairAccel(); + void createHairMBAccel(); + void createSubdivAccel(); + void createSubdivMBAccel(); + void createUserGeometryAccel(); + void createUserGeometryMBAccel(); + void createInstanceAccel(); + void createInstanceMBAccel(); + void createInstanceExpensiveAccel(); + void createInstanceExpensiveMBAccel(); + void createGridAccel(); + void createGridMBAccel(); + + /*! prints statistics about the scene */ + void printStatistics(); + + /*! clears the scene */ + void clear(); + + /*! detaches some geometry */ + void detachGeometry(size_t geomID); + + void setBuildQuality(RTCBuildQuality quality_flags); + RTCBuildQuality getBuildQuality() const; + + void setSceneFlags(RTCSceneFlags scene_flags); + RTCSceneFlags getSceneFlags() const; + + void commit (bool join); + void commit_task (); + void build () {} + + void updateInterface(); + + /* return number of geometries */ + __forceinline size_t size() const { return geometries.size(); } + + /* bind geometry to the scene */ + unsigned int bind (unsigned geomID, Ref<Geometry> geometry); + + /* determines if scene is modified */ + __forceinline bool isModified() const { return modified; } + + /* sets modified flag */ + __forceinline void setModified(bool f = true) { + modified = f; + } + + __forceinline bool isGeometryModified(size_t geomID) + { + Ref<Geometry>& g = geometries[geomID]; + if (!g) return false; + return g->getModCounter() > geometryModCounters_[geomID]; + } + + protected: + + __forceinline void checkIfModifiedAndSet () + { + if (isModified ()) return; + + auto geometryIsModified = [this](size_t geomID)->bool { + return isGeometryModified(geomID); + }; + + if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) { + setModified (); + } + } + + public: + + /* get mesh by ID */ + __forceinline Geometry* get(size_t i) { assert(i < geometries.size()); return geometries[i].ptr; } + __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; } + + template<typename Mesh> + __forceinline Mesh* get(size_t i) { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + template<typename Mesh> + __forceinline const Mesh* get(size_t i) const { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + + template<typename Mesh> + __forceinline Mesh* getSafe(size_t i) { + assert(i < geometries.size()); + if (geometries[i] == null) return nullptr; + if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr; + else return (Mesh*) geometries[i].ptr; + } + + __forceinline Ref<Geometry> get_locked(size_t i) { + Lock<SpinLock> lock(geometriesMutex); + assert(i < geometries.size()); + return geometries[i]; + } + + /* flag decoding */ + __forceinline bool 
isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); } + __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; } + __forceinline bool isRobustAccel() const { return scene_flags & RTC_SCENE_FLAG_ROBUST; } + __forceinline bool isStaticAccel() const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); } + __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; } + + __forceinline bool hasContextFilterFunction() const { + return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION; + } + + __forceinline bool hasGeometryFilterFunction() { + return world.numFilterFunctions != 0; + } + + __forceinline bool hasFilterFunction() { + return hasContextFilterFunction() || hasGeometryFilterFunction(); + } + + /* test if scene got already build */ + __forceinline bool isBuild() const { return is_build; } + + public: + IDPool<unsigned,0xFFFFFFFE> id_pool; + vector<Ref<Geometry>> geometries; //!< list of all user geometries + vector<unsigned int> geometryModCounters_; + vector<float*> vertices; + + public: + Device* device; + + /* these are to detect if we need to recreate the acceleration structures */ + bool flags_modified; + unsigned int enabled_geometry_types; + + RTCSceneFlags scene_flags; + RTCBuildQuality quality_flags; + MutexSys buildMutex; + SpinLock geometriesMutex; + bool is_build; + private: + bool modified; //!< true if scene got modified + + public: + + /*! global lock step task scheduler */ +#if defined(TASKING_INTERNAL) + MutexSys schedulerMutex; + Ref<TaskScheduler> scheduler; +#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + tbb::isolated_task_group group; +#elif defined(TASKING_TBB) + tbb::task_group group; +#elif defined(TASKING_PPL) + concurrency::task_group group; +#endif + + public: + struct BuildProgressMonitorInterface : public BuildProgressMonitor { + BuildProgressMonitorInterface(Scene* scene) + : scene(scene) {} + void operator() (size_t dn) const { scene->progressMonitor(double(dn)); } + private: + Scene* scene; + }; + BuildProgressMonitorInterface progressInterface; + RTCProgressMonitorFunction progress_monitor_function; + void* progress_monitor_ptr; + std::atomic<size_t> progress_monitor_counter; + void progressMonitor(double nprims); + void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr); + + private: + GeometryCounts world; //!< counts for geometry + + public: + + __forceinline size_t numPrimitives() const { + return world.size(); + } + + __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const + { + size_t count = 0; + + if (mask & Geometry::MTY_TRIANGLE_MESH) + count += mblur ? world.numMBTriangles : world.numTriangles; + + if (mask & Geometry::MTY_QUAD_MESH) + count += mblur ? world.numMBQuads : world.numQuads; + + if (mask & Geometry::MTY_CURVE2) + count += mblur ? world.numMBLineSegments : world.numLineSegments; + + if (mask & Geometry::MTY_CURVE4) + count += mblur ? world.numMBBezierCurves : world.numBezierCurves; + + if (mask & Geometry::MTY_POINTS) + count += mblur ? world.numMBPoints : world.numPoints; + + if (mask & Geometry::MTY_SUBDIV_MESH) + count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches; + + if (mask & Geometry::MTY_USER_GEOMETRY) + count += mblur ? world.numMBUserGeometries : world.numUserGeometries; + + if (mask & Geometry::MTY_INSTANCE_CHEAP) + count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap; + + if (mask & Geometry::MTY_INSTANCE_EXPENSIVE) + count += mblur ? 
world.numMBInstancesExpensive : world.numInstancesExpensive; + + if (mask & Geometry::MTY_GRID_MESH) + count += mblur ? world.numMBGrids : world.numGrids; + + return count; + } + + template<typename Mesh, bool mblur> + __forceinline unsigned getNumTimeSteps() + { + if (!mblur) + return 1; + + Scene::Iterator<Mesh,mblur> iter(this); + return iter.maxTimeStepsPerGeometry(); + } + + template<typename Mesh, bool mblur> + __forceinline unsigned int getMaxGeomID() + { + Scene::Iterator<Mesh,mblur> iter(this); + return iter.maxGeomID(); + } + }; +} diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h new file mode 100644 index 0000000000..a5a39e42d4 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_curves.h @@ -0,0 +1,688 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +#include "../subdiv/bezier_curve.h" +#include "../subdiv/hermite_curve.h" +#include "../subdiv/bspline_curve.h" +#include "../subdiv/catmullrom_curve.h" +#include "../subdiv/linear_bezier_patch.h" + +namespace embree +{ + /*! represents an array of bicubic bezier curves */ + struct CurveGeometry : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4; + + public: + + /*! bezier curve construction */ + CurveGeometry (Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th curve */ + __forceinline const unsigned int& curve(size_t i) const { + return curves[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th tangent of the first time step */ + __forceinline Vec3ff tangent(size_t i) const { + return tangents0[i]; + } + + /*! returns i'th normal derivative of the first time step */ + __forceinline Vec3fa dnormal(size_t i) const { + return dnormals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th tangent of itime'th timestep */ + __forceinline Vec3ff tangent(size_t i, size_t itime) const { + return tangents[itime][i]; + } + + /*! 
returns i'th normal derivative of itime'th timestep */ + __forceinline Vec3fa dnormal(size_t i, size_t itime) const { + return dnormals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + n0 = normal(i+0); + n1 = normal(i+1); + n2 = normal(i+2); + n3 = normal(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + n2 = normal(i+2,itime); + n3 = normal(i+3,itime); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL1_vertices(size_t i) const + { + prefetchL1(vertices0.getPtr(i)+0); + prefetchL1(vertices0.getPtr(i)+64); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL2_vertices(size_t i) const + { + prefetchL2(vertices0.getPtr(i)+0); + prefetchL2(vertices0.getPtr(i)+64); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; + gather(a0,a1,a2,a3,i,itime); + Vec3ff b0,b1,b2,b3; + gather(b0,b1,b2,b3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + } + + /*! 
loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3; + gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime); + Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3; + gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + n0 = madd(Vec3ff(t0),an0,t1*bn0); + n1 = madd(Vec3ff(t0),an1,t1*bn1); + n2 = madd(Vec3ff(t0),an2,t1*bn2); + n3 = madd(Vec3ff(t0),an3,t1*bn3); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3; + unsigned int vertexID = curve(primID); + gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime); + SourceCurve3ff ccurve(v0,v1,v2,v3); + SourceCurve3fa ncurve(n0,n1,n2,n3); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; + gather_hermite(ap0,at0,ap1,at1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; + gather_hermite(bp0,bt0,bp1,bt1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + } + + /*! 
gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + n0 = normal(i+0); + n1 = normal(i+1); + dn0 = dnormal(i+0); + dn1 = dnormal(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + dn0 = dnormal(i+0,itime); + dn1 = dnormal(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1; + gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1; + gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + n0 = madd(Vec3ff(f0),an0,f1*bn0); + dn0= madd(Vec3ff(f0),adn0,f1*bdn0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + n1 = madd(Vec3ff(f0),an1,f1*bn1); + dn1= madd(Vec3ff(f0),adn1,f1*bdn1); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1; + unsigned int vertexID = curve(primID); + gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime); + + SourceCurve3ff ccurve(v0,t0,v1,t1); + SourceCurve3fa ncurve(n0,dn0,n1,dn1); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + private: + void resizeBuffers(unsigned int numSteps); + + public: + BufferView<unsigned int> curves; //!< array of curve indices + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + BufferView<Vec3ff> tangents0; //!< fast access to first tangent buffer + BufferView<Vec3fa> dnormals0; //!< fast access to first normal 
derivative buffer + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<Vec3ff>> tangents; //!< tangent array for each timestep + vector<BufferView<Vec3fa>> dnormals; //!< normal derivative array for each timestep + BufferView<char> flags; //!< start, end flag per segment + vector<BufferView<char>> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for flat curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + + template<template<typename Ty> class Curve> + struct CurveGeometryInterface : public CurveGeometry + { + typedef Curve<Vec3ff> Curve3ff; + typedef Curve<Vec3fa> Curve3fa; + + CurveGeometryInterface (Device* device, Geometry::GType gtype) + : CurveGeometry(device,gtype) {} + + __forceinline const Curve3ff getCurveScaledRadius(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + Vec3ff v0 = vertex(index+0,itime); + Vec3ff v1 = vertex(index+1,itime); + Vec3ff v2 = vertex(index+2,itime); + Vec3ff v3 = vertex(index+3,itime); + v0.w *= maxRadiusScale; + v1.w *= maxRadiusScale; + v2.w *= maxRadiusScale; + v3.w *= maxRadiusScale; + return Curve3ff (v0,v1,v2,v3); + } + + __forceinline const Curve3ff getCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff v2 = vertex(index+2,itime); + const Vec3ff v3 = vertex(index+3,itime); + const Vec3ff w0(xfmPoint(space,(Vec3fa)v0), maxRadiusScale*v0.w); + const Vec3ff w1(xfmPoint(space,(Vec3fa)v1), maxRadiusScale*v1.w); + const Vec3ff w2(xfmPoint(space,(Vec3fa)v2), maxRadiusScale*v2.w); + const Vec3ff w3(xfmPoint(space,(Vec3fa)v3), maxRadiusScale*v3.w); + return Curve3ff(w0,w1,w2,w3); + } + + __forceinline const Curve3ff getCurveScaledRadius(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const float r_scale = r_scale0*scale; + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff v2 = vertex(index+2,itime); + const Vec3ff v3 = vertex(index+3,itime); + const Vec3ff w0(xfmPoint(space,((Vec3fa)v0-ofs)*Vec3fa(scale)), maxRadiusScale*v0.w*r_scale); + const Vec3ff w1(xfmPoint(space,((Vec3fa)v1-ofs)*Vec3fa(scale)), maxRadiusScale*v1.w*r_scale); + const Vec3ff w2(xfmPoint(space,((Vec3fa)v2-ofs)*Vec3fa(scale)), maxRadiusScale*v2.w*r_scale); + const Vec3ff w3(xfmPoint(space,((Vec3fa)v3-ofs)*Vec3fa(scale)), maxRadiusScale*v3.w*r_scale); + return Curve3ff(w0,w1,w2,w3); + } + + __forceinline const Curve3fa getNormalCurve(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + const Vec3fa n2 = normal(index+2,itime); + const Vec3fa n3 = normal(index+3,itime); + return Curve3fa (n0,n1,n2,n3); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(size_t i, size_t itime = 0) const + { + const Curve3ff center = getCurveScaledRadius(i,itime); + const Curve3fa normal = getNormalCurve(i,itime); + const TensorLinearCubicBezierSurface3fa ocurve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,normal); + return ocurve; + } + + __forceinline const 
TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const Vec3fa& ofs, const float scale, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space,ofs,scale); + } + + /*! check if the i'th primitive is valid at the itime'th time step */ + __forceinline bool valid(Geometry::GType ctype, size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = curve(i); + if (index+3 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const float r0 = radius(index+0,itime); + const float r1 = radius(index+1,itime); + const float r2 = radius(index+2,itime); + const float r3 = radius(index+3,itime); + if (!isvalid(r0) || !isvalid(r1) || !isvalid(r2) || !isvalid(r3)) + return false; + + const Vec3fa v0 = vertex(index+0,itime); + const Vec3fa v1 = vertex(index+1,itime); + const Vec3fa v2 = vertex(index+2,itime); + const Vec3fa v3 = vertex(index+3,itime); + if (!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3)) + return false; + + if (ctype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) + { + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + if (!isvalid(n0) || !isvalid(n1)) + return false; + } + } + + return true; + } + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* ddPdudu = args->ddPdudu; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + size_t ofs = i*sizeof(float); + const size_t index = curves[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+0)*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+1)*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+2)*stride+ofs]); + const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+3)*stride+ofs]); + + const Curve<vfloat<N>> curve(p0,p1,p2,p3); + if (P ) mem<vfloat<N>>::storeu(valid,P+i, curve.eval(u)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i, curve.eval_du(u)); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,curve.eval_dudu(u)); + } + } + + void interpolate(const RTCInterpolateArguments* const args) { + interpolate_impl<4>(args); + } + }; + + template<template<typename Ty> class Curve> + struct HermiteCurveGeometryInterface : public CurveGeometry + { + typedef Curve<Vec3ff> HermiteCurve3ff; + typedef Curve<Vec3fa> HermiteCurve3fa; + + 
HermiteCurveGeometryInterface (Device* device, Geometry::GType gtype) + : CurveGeometry(device,gtype) {} + + __forceinline const HermiteCurve3ff getCurveScaledRadius(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + Vec3ff v0 = vertex(index+0,itime); + Vec3ff v1 = vertex(index+1,itime); + Vec3ff t0 = tangent(index+0,itime); + Vec3ff t1 = tangent(index+1,itime); + v0.w *= maxRadiusScale; + v1.w *= maxRadiusScale; + t0.w *= maxRadiusScale; + t1.w *= maxRadiusScale; + return HermiteCurve3ff (v0,t0,v1,t1); + } + + __forceinline const HermiteCurve3ff getCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff t0 = tangent(index+0,itime); + const Vec3ff t1 = tangent(index+1,itime); + const Vec3ff V0(xfmPoint(space,(Vec3fa)v0),maxRadiusScale*v0.w); + const Vec3ff V1(xfmPoint(space,(Vec3fa)v1),maxRadiusScale*v1.w); + const Vec3ff T0(xfmVector(space,(Vec3fa)t0),maxRadiusScale*t0.w); + const Vec3ff T1(xfmVector(space,(Vec3fa)t1),maxRadiusScale*t1.w); + return HermiteCurve3ff(V0,T0,V1,T1); + } + + __forceinline const HermiteCurve3ff getCurveScaledRadius(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const float r_scale = r_scale0*scale; + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff t0 = tangent(index+0,itime); + const Vec3ff t1 = tangent(index+1,itime); + const Vec3ff V0(xfmPoint(space,(v0-ofs)*Vec3fa(scale)), maxRadiusScale*v0.w*r_scale); + const Vec3ff V1(xfmPoint(space,(v1-ofs)*Vec3fa(scale)), maxRadiusScale*v1.w*r_scale); + const Vec3ff T0(xfmVector(space,t0*Vec3fa(scale)), maxRadiusScale*t0.w*r_scale); + const Vec3ff T1(xfmVector(space,t1*Vec3fa(scale)), maxRadiusScale*t1.w*r_scale); + return HermiteCurve3ff(V0,T0,V1,T1); + } + + __forceinline const HermiteCurve3fa getNormalCurve(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + const Vec3fa dn0 = dnormal(index+0,itime); + const Vec3fa dn1 = dnormal(index+1,itime); + return HermiteCurve3fa (n0,dn0,n1,dn1); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(size_t i, size_t itime = 0) const + { + const HermiteCurve3ff center = getCurveScaledRadius(i,itime); + const HermiteCurve3fa normal = getNormalCurve(i,itime); + const TensorLinearCubicBezierSurface3fa ocurve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,normal); + return ocurve; + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const Vec3fa& ofs, const float scale, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space,ofs,scale); + } + + /*! 
check if the i'th primitive is valid at the itime'th time step */ + __forceinline bool valid(Geometry::GType ctype, size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = curve(i); + if (index+1 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + if (!isvalid4(v0) || !isvalid4(v1)) + return false; + + const Vec3ff t0 = tangent(index+0,itime); + const Vec3ff t1 = tangent(index+1,itime); + if (!isvalid4(t0) || !isvalid4(t1)) + return false; + + if (ctype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) + { + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + if (!isvalid(n0) || !isvalid(n1)) + return false; + + const Vec3fa dn0 = dnormal(index+0,itime); + const Vec3fa dn1 = dnormal(index+1,itime); + if (!isvalid(dn0) || !isvalid(dn1)) + return false; + } + } + + return true; + } + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* ddPdudu = args->ddPdudu; + unsigned int valueCount = args->valueCount; + + /* we interpolate vertex attributes linearly for hermite basis */ + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + assert(bufferSlot <= vertexAttribs.size()); + const char* vsrc = vertexAttribs[bufferSlot].getPtr(); + const size_t vstride = vertexAttribs[bufferSlot].getStride(); + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const size_t index = curves[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+0)*vstride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+1)*vstride+ofs]); + + if (P ) mem<vfloat<N>>::storeu(valid,P+i, madd(1.0f-u,p0,u*p1)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i, p1-p0); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + } + } + + /* interpolation for vertex buffers */ + else + { + assert(bufferSlot < numTimeSteps); + const char* vsrc = vertices[bufferSlot].getPtr(); + const char* tsrc = tangents[bufferSlot].getPtr(); + const size_t vstride = vertices[bufferSlot].getStride(); + const size_t tstride = vertices[bufferSlot].getStride(); + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const size_t index = curves[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+0)*vstride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+1)*vstride+ofs]); + const vfloat<N> t0 = mem<vfloat<N>>::loadu(valid,(float*)&tsrc[(index+0)*tstride+ofs]); + const vfloat<N> t1 = mem<vfloat<N>>::loadu(valid,(float*)&tsrc[(index+1)*tstride+ofs]); + + const HermiteCurveT<vfloat<N>> curve(p0,t0,p1,t1); + if (P ) mem<vfloat<N>>::storeu(valid,P+i, curve.eval(u)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i, curve.eval_du(u)); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,curve.eval_dudu(u)); + } + } + } + + void interpolate(const RTCInterpolateArguments* const args) { + interpolate_impl<4>(args); + } + }; + } + + 
DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree/kernels/common/scene_grid_mesh.h b/thirdparty/embree/kernels/common/scene_grid_mesh.h new file mode 100644 index 0000000000..fb6fed445b --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_grid_mesh.h @@ -0,0 +1,294 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Grid Mesh */ + struct GridMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH; + + /*! grid */ + struct Grid + { + unsigned int startVtxID; + unsigned int lineVtxOffset; + unsigned short resX,resY; + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsX(const unsigned int x) const + { + return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0; + } + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsY(const unsigned int y) const + { + return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0; + } + + /*! outputs grid structure */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) { + return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }"; + } + }; + + public: + + /*! grid mesh construction */ + GridMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float U = args->u; + float V = args->v; + + /* clamp input u,v to [0;1] range */ + U = max(min(U,1.0f),0.0f); + V = max(min(V,1.0f),0.0f); + + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + const Grid& grid = grids[primID]; + const int grid_width = grid.resX-1; + const int grid_height = grid.resY-1; + const float rcp_grid_width = rcp(float(grid_width)); + const float rcp_grid_height = rcp(float(grid_height)); + const int iu = min((int)floor(U*grid_width ),grid_width); + const int iv = min((int)floor(V*grid_height),grid_height); + const float u = U*grid_width-float(iu); + const float v 
= V*grid_height-float(iv); + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const unsigned int idx0 = grid.startVtxID + (iv+0)*grid.lineVtxOffset + iu; + const unsigned int idx1 = grid.startVtxID + (iv+1)*grid.lineVtxOffset + iu; + + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+0)*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+1)*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+1)*stride+ofs]); + const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+0)*stride+ofs]); + const vbool<N> left = u+v <= 1.0f; + const vfloat<N> Q0 = select(left,p0,p2); + const vfloat<N> Q1 = select(left,p1,p3); + const vfloat<N> Q2 = select(left,p3,p1); + const vfloat<N> U = select(left,u,vfloat<N>(1.0f)-u); + const vfloat<N> V = select(left,v,vfloat<N>(1.0f)-v); + const vfloat<N> W = 1.0f-U-V; + + if (P) { + mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2))); + } + if (dPdu) { + assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)*rcp_grid_width); + assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)*rcp_grid_height); + } + if (ddPdudu) { + assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero)); + assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero)); + } + } + } + + void addElementsToCount (GeometryCounts & counts) const; + + __forceinline unsigned int getNumSubGrids(const size_t gridID) + { + const Grid &g = grid(gridID); + return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1)); + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th grid*/ + __forceinline const Grid& grid(size_t i) const { + return grids[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th vertex of the first timestep */ + __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const { + assert(x < (size_t)g.resX); + assert(y < (size_t)g.resY); + return g.startVtxID + x + y * g.lineVtxOffset; + } + + /*! returns i'th vertex of the first timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index); + } + + /*! returns i'th vertex of the itime'th timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index,itime); + } + + /*! 
calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const + { + BBox3fa b(empty); + for (size_t t=0; t<numTimeSteps; t++) + { + for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++) + for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++) + { + const Vec3fa v = grid_vertex(g,x,y,t); + if (unlikely(!isvalid(v))) return false; + b.extend(v); + } + } + + bbox = b; + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const + { + assert(itime < numTimeSteps); + BBox3fa b0(empty); + for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++) + for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++) + { + const Vec3fa v = grid_vertex(g,x,y,itime); + if (unlikely(!isvalid(v))) return false; + b0.extend(v); + } + + /* use bounds of first time step in builder */ + bbox = b0; + return true; + } + + __forceinline bool valid(size_t gridID, size_t itime=0) const { + return valid(gridID, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const + { + if (unlikely(gridID >= grids.size())) return false; + const Grid &g = grid(gridID); + if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false; + if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false; + + for (size_t y=0;y<g.resY;y++) + for (size_t x=0;x<g.resX;x++) + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(grid_vertex(g,x,y,itime))) return false; + return true; + } + + + __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const + { + BBox3fa box(empty); + buildBounds(g,sx,sy,itime,box); + return box; + } + + __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const { + BBox3fa bounds0, bounds1; + buildBounds(g,sx,sy,itime+0,bounds0); + buildBounds(g,sx,sy,itime+1,bounds1); + return LBBox3fa(bounds0,bounds1); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments); + } + + public: + BufferView<Grid> grids; //!< array of triangles + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<RawBufferView> vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct GridMeshISA : public GridMesh + { + GridMeshISA (Device* device) + : GridMesh(device) {} + }; + } + + DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_instance.h b/thirdparty/embree/kernels/common/scene_instance.h new file mode 100644 index 0000000000..773f2b6fec --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_instance.h @@ -0,0 +1,272 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "accel.h" + +namespace embree +{ + struct MotionDerivativeCoefficients; + + /*! 
Instanced acceleration structure */ + struct Instance : public Geometry + { + ALIGNED_STRUCT_(16); + static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE; + + public: + Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1); + ~Instance(); + + private: + Instance (const Instance& other) DELETED; // do not implement + Instance& operator= (const Instance& other) DELETED; // do not implement + + private: + LBBox3fa nonlinearBounds(const BBox1f& time_range_in, + const BBox1f& geom_time_range, + float geom_time_segments) const; + + BBox3fa boundSegment(size_t itime, + BBox3fa const& obbox0, BBox3fa const& obbox1, + BBox3fa const& bbox0, BBox3fa const& bbox1, + float t_min, float t_max) const; + + /* calculates the (correct) interpolated bounds */ + __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(slerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + return xfmBounds(lerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + } + + public: + virtual void setNumTimeSteps (unsigned int numTimeSteps) override; + virtual void setInstancedScene(const Ref<Scene>& scene) override; + virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override; + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override; + virtual AffineSpace3fa getTransform(float time) override; + virtual void setMask (unsigned mask) override; + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const override; + virtual void commit() override; + + public: + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds()); + return xfmBounds(local2world[0],object->bounds.bounds()); + } + + /*! gets the bounds of the instanced scene */ + __forceinline BBox3fa getObjectBounds(size_t itime) const { + return object->getBounds(timeStep(itime)); + } + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime)); + return xfmBounds(local2world[itime],getObjectBounds(itime)); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const { + assert(i == 0); + LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments); + return lbbox; + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + assert(i==0); + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid(b); + } + + /*! 
calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + assert(i==0); + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds (); + return isvalid(bounds); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + assert(i == 0); + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(bounds(i,itime))) return false; + + return true; + } + + __forceinline AffineSpace3fa getLocal2World() const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return quaternionDecompositionToAffineSpace(local2world[0]); + return local2world[0]; + } + + __forceinline AffineSpace3fa getLocal2World(float t) const + { + float ftime; const unsigned int itime = timeSegment(t, ftime); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return slerp(local2world[itime+0],local2world[itime+1],ftime); + return lerp(local2world[itime+0],local2world[itime+1],ftime); + } + + __forceinline AffineSpace3fa getWorld2Local() const { + return world2local0; + } + + __forceinline AffineSpace3fa getWorld2Local(float t) const { + return rcp(getLocal2World(t)); + } + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return getWorld2LocalSlerp<K>(valid, t); + return getWorld2LocalLerp<K>(valid, t); + } + + private: + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const + { + vfloat<K> ftime; + const vint<K> itime_k = timeSegment<K>(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint<K>(itime)))) { + return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]), + AffineSpace3vff<K>(local2world[itime+1]), + ftime)); + } + else { + AffineSpace3vff<K> space0,space1; + vbool<K> valid1 = valid; + while (any(valid1)) { + vbool<K> valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0); + space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1); + } + return rcp(slerp(space0, space1, ftime)); + } + } + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const + { + vfloat<K> ftime; + const vint<K> itime_k = timeSegment<K>(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint<K>(itime)))) { + return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), + AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), + ftime)); + } else { + AffineSpace3vf<K> space0,space1; + vbool<K> valid1 = valid; + while (any(valid1)) { + vbool<K> valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0); + space1 = select(valid2, 
AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1); + } + return rcp(lerp(space0, space1, ftime)); + } + } + + public: + Accel* object; //!< pointer to instanced acceleration structure + AffineSpace3ff* local2world; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition) + AffineSpace3fa world2local0; //!< transformation from world space to local space for timestep 0 + }; + + namespace isa + { + struct InstanceISA : public Instance + { + InstanceISA (Device* device) + : Instance(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // const BBox3fa b = bounds(0); + // if (!isvalid(b)) return pinfo; + + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // if (!valid(0,range<size_t>(itime))) return pinfo; + // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0)); + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfoMB pinfo(empty); + if (!valid(0, timeSegmentRange(t0t1))) return pinfo; + const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0)); + pinfo.add_primref(prim); + prims[k++] = prim; + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_line_segments.h b/thirdparty/embree/kernels/common/scene_line_segments.h new file mode 100644 index 0000000000..3c9fdb39db --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_line_segments.h @@ -0,0 +1,345 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! represents an array of line segments */ + struct LineSegments : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2; + + public: + + /*! 
line segments construction */ + LineSegments (Device* device, Geometry::GType gtype); + + public: + void setMask (unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify (); + void interpolate(const RTCInterpolateArguments* const args); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* ddPdudu = args->ddPdudu; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const size_t segment = segments[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(segment+0)*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(segment+1)*stride+ofs]); + if (P ) mem<vfloat<N>>::storeu(valid,P+i,lerp(p0,p1,u)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i,p1-p0); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,dPdu+i,vfloat<N>(zero)); + } + } + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th segment */ + __forceinline const unsigned int& segment(size_t i) const { + return segments[i]; + } + + /*! returns the segment to the left of the i'th segment */ + __forceinline bool segmentLeftExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0; + } + + /*! returns the segment to the right of the i'th segment */ + __forceinline bool segmentRightExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! 
returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const + { + const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1)); + return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w))); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = segment(i); + if (index+1 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false; + const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false; + if (min(v0.w,v1.w) < 0.0f) return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i,0)) return false; + *bbox = bounds(i); + return true; + } + + /*! 
calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i,itime+0) || !valid(i,itime+1)) return false; + bbox = bounds(i,itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView<unsigned int> segments; //!< array of line segment indices + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + BufferView<char> flags; //!< start, end flag per segment + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<char>> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for bezier curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct LineSegmentsISA : public LineSegments + { + LineSegmentsISA (Device* device, Geometry::GType gtype) + : LineSegments(device,gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0); + const Vec3fa v1 = vertex(vtxID+1); + return v1-v0; + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0,time); + const Vec3fa v1 = vertex(vtxID+1,time); + return v1-v0; + } + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB 
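On the public API side, this class backs the linear curve geometry type. A hedged construction example against the documented Embree 3 API (`RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE` takes float4 vertices `(x, y, z, radius)` and one uint index per segment pointing at its first control point); error handling omitted:

```
#include <embree3/rtcore.h>

RTCGeometry makeLinearCurve(RTCDevice device)
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE);

  // 3 control points, 2 segments.
  float* verts = (float*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT4, 4 * sizeof(float), 3);
  const float v[3][4] = { {0,0,0,0.1f}, {1,0,0,0.1f}, {2,1,0,0.1f} };
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 4; j++)
      verts[4*i+j] = v[i][j];

  unsigned* index = (unsigned*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT, sizeof(unsigned), 2);
  index[0] = 0;  // segment 0 uses vertices 0..1
  index[1] = 1;  // segment 1 uses vertices 1..2

  rtcCommitGeometry(geom);
  return geom;
}
```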
pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const { + return bounds(space,i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + return linearBounds(primID,time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + return linearBounds(space,primID,time_range); + } + }; + } + + DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree/kernels/common/scene_points.h b/thirdparty/embree/kernels/common/scene_points.h new file mode 100644 index 0000000000..017e098a51 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_points.h @@ -0,0 +1,282 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "buffer.h" +#include "default.h" +#include "geometry.h" + +namespace embree +{ + /*! represents an array of points */ + struct Points : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS; + + public: + /*! line segments construction */ + Points(Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps(unsigned int numTimeSteps); + void setVertexAttributeCount(unsigned int N); + void setBuffer(RTCBufferType type, + unsigned int slot, + RTCFormat format, + const Ref<Buffer>& buffer, + size_t offset, + size_t stride, + unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! 
calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0) const { + return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w)); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const Vec3ff v0 = vertex(i); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const Vec3ff v0 = vertex(i); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = (unsigned int)i; + if (index >= numVertices()) + return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) { + const Vec3ff v0 = vertex(index + 0, itime); + if (unlikely(!isvalid4(v0))) + return false; + if (v0.w < 0.0f) + return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i, 0)) + return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i, itime + 0) || !valid(i, itime + 1)) + return false; + bbox = bounds(i, itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! 
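Point primitives are exposed through the `RTC_GEOMETRY_TYPE_*_POINT` geometry types, with float4 vertices `(x, y, z, radius)`; oriented discs additionally take a normal buffer. A hedged example using the documented Embree 3 API; error handling omitted:

```
#include <embree3/rtcore.h>

RTCGeometry makePoints(RTCDevice device)
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_SPHERE_POINT);

  float* verts = (float*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT4, 4 * sizeof(float), 2);
  // x, y, z, radius; a negative radius makes the primitive invalid (see valid() above).
  verts[0] = 0.f; verts[1] = 0.f; verts[2] = 0.f; verts[3] = 0.25f;
  verts[4] = 1.f; verts[5] = 0.f; verts[6] = 0.f; verts[7] = 0.50f;

  rtcCommitGeometry(geom);
  return geom;
}
```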
get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<char>> vertexAttribs; //!< user buffers + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct PointsISA : public Points + { + PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + return Vec3fa(1, 0, 0); + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + return Vec3fa(1, 0, 0); + } + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, &bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, itime, bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, + const BBox1f& t0t1, + const range<size_t>& r, + size_t k, + unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + if (!valid(j, timeSegmentRange(t0t1))) + continue; + const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const + { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const + { + return bounds(space, i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const + { + return linearBounds(primID, time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const + { + return linearBounds(space, primID, time_range); + } + }; + } // namespace isa + + DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType); +} // namespace embree diff --git a/thirdparty/embree/kernels/common/scene_quad_mesh.h b/thirdparty/embree/kernels/common/scene_quad_mesh.h new file mode 100644 index 0000000000..bd8eeaaeb7 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_quad_mesh.h @@ -0,0 +1,337 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Quad Mesh */ + struct QuadMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH; + + /*! triangle indices */ + struct Quad + { + uint32_t v[4]; + + /*! 
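`createPrimRefArray` above follows the pattern shared by all geometry classes in this directory: skip primitives whose build bounds are invalid, emit one primitive reference (AABB plus geomID/primID) per surviving primitive, and let the builder accumulate bounds from them. A simplified stand-alone version of that loop (stand-in types, not the Embree internals):

```
#include <cstddef>
#include <cstdint>
#include <vector>

struct AABB    { float lo[3], hi[3]; };
struct PrimRef { AABB bounds; uint32_t geomID, primID; };

// 'buildBounds' returns false for primitives that must be skipped
// (e.g. NaN vertices, negative radii, or out-of-range indices).
template <class BuildBoundsFn>
size_t buildPrimRefs(size_t numPrims, uint32_t geomID, BuildBoundsFn buildBounds,
                     std::vector<PrimRef>& out)
{
  size_t emitted = 0;
  for (size_t primID = 0; primID < numPrims; ++primID) {
    AABB b;
    if (!buildBounds(primID, b)) continue;            // invalid primitive: skip
    out.push_back({b, geomID, (uint32_t)primID});     // reference consumed by the BVH builder
    ++emitted;
  }
  return emitted;
}
```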
outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) { + return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }"; + } + }; + + public: + + /*! quad mesh construction */ + QuadMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + float v = args->v; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const size_t ofs = i*sizeof(float); + const Quad& tri = quad(primID); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]); + const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[3]*stride+ofs]); + const vbool<N> left = u+v <= 1.0f; + const vfloat<N> Q0 = select(left,p0,p2); + const vfloat<N> Q1 = select(left,p1,p3); + const vfloat<N> Q2 = select(left,p3,p1); + const vfloat<N> U = select(left,u,vfloat<N>(1.0f)-u); + const vfloat<N> V = select(left,v,vfloat<N>(1.0f)-v); + const vfloat<N> W = 1.0f-U-V; + if (P) { + mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2))); + } + if (dPdu) { + assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)); + assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)); + } + if (ddPdudu) { + assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero)); + assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero)); + } + } + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th quad */ + __forceinline const Quad& quad(size_t i) const { + return quads[i]; + } + + /*! 
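The quad `interpolate_impl` above splits each quad `(v0, v1, v2, v3)` into two triangles along the `u + v = 1` diagonal and interpolates barycentrically inside the selected half. A scalar sketch of the same selection logic (illustrative helper; p0..p3 each point at `valueCount` tightly packed floats):

```
inline void interpolate_quad_scalar(const float* p0, const float* p1,
                                    const float* p2, const float* p3,
                                    float u, float v, unsigned valueCount,
                                    float* P, float* dPdu, float* dPdv)
{
  const bool left = (u + v) <= 1.0f;          // which triangle of the split quad
  const float U = left ? u : 1.0f - u;
  const float V = left ? v : 1.0f - v;
  const float W = 1.0f - U - V;
  for (unsigned i = 0; i < valueCount; ++i) {
    const float Q0 = left ? p0[i] : p2[i];
    const float Q1 = left ? p1[i] : p3[i];
    const float Q2 = left ? p3[i] : p1[i];
    if (P)    P[i]    = W * Q0 + U * Q1 + V * Q2;   // barycentric blend
    if (dPdu) dPdu[i] = left ? Q1 - Q0 : Q0 - Q1;   // sign flips on the far triangle
    if (dPdv) dPdv[i] = left ? Q2 - Q0 : Q0 - Q2;
  }
}
```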
returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th quad */ + __forceinline BBox3fa bounds(size_t i) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! calculates the bounds of the i'th quad at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0],itime); + const Vec3fa v1 = vertex(q.v[1],itime); + const Vec3fa v2 = vertex(q.v[2],itime); + const Vec3fa v3 = vertex(q.v[3],itime); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(q.v[0],itime))) return false; + if (!isvalid(vertex(q.v[1],itime))) return false; + if (!isvalid(vertex(q.v[2],itime))) return false; + if (!isvalid(vertex(q.v[3],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th quad at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Quad& q = quad(i); + if (q.v[0] >= numVertices()) return false; + if (q.v[1] >= numVertices()) return false; + if (q.v[2] >= numVertices()) return false; + if (q.v[3] >= numVertices()) return false; + + for (unsigned int t=0; t<numTimeSteps; t++) + { + const Vec3fa v0 = vertex(q.v[0],t); + const Vec3fa v1 = vertex(q.v[1],t); + const Vec3fa v2 = vertex(q.v[2],t); + const Vec3fa v3 = vertex(q.v[3],t); + + if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3))) + return false; + } + + if (bbox) + *bbox = bounds(i); + + return true; + } + + /*! 
calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false; + const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! 
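For motion blur, the per-time-step boxes above are combined into linear bounds, that is, a box moving linearly over the time range. For a single time segment, restricting such bounds to a sub-interval `[t0, t1]` of `[0, 1]` just means evaluating the interpolated box at `t0` and `t1`; the real `LBBox3fa` constructor additionally merges multiple time segments conservatively. A stand-alone sketch under that single-segment assumption (stand-in types):

```
struct V3    { float x, y, z; };
struct Box   { V3 lo, hi; };
struct LBBox { Box bounds0, bounds1; };   // box varying linearly in time

static inline float lerpf(float a, float b, float t) { return (1.0f - t) * a + t * b; }
static inline V3 lerp3(const V3& a, const V3& b, float t) {
  return { lerpf(a.x, b.x, t), lerpf(a.y, b.y, t), lerpf(a.z, b.z, t) };
}
static inline Box lerpBox(const Box& a, const Box& b, float t) {
  return { lerp3(a.lo, b.lo, t), lerp3(a.hi, b.hi, t) };
}

// Clip linearly moving bounds (b0 at time 0, b1 at time 1) to [t0, t1].
inline LBBox clipLinearBounds(const Box& b0, const Box& b1, float t0, float t1) {
  return { lerpBox(b0, b1, t0), lerpBox(b0, b1, t1) };
}
```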
get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return quads.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return quads.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return areaProjectedTriangle(v0,v1,v3) + + areaProjectedTriangle(v1,v2,v3); + } + + public: + BufferView<Quad> quads; //!< array of quads + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers + }; + + namespace isa + { + struct QuadMeshISA : public QuadMesh + { + QuadMeshISA (Device* device) + : QuadMesh(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h new file mode 100644 index 0000000000..1db170196d --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h @@ -0,0 +1,326 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" +#include "../subdiv/half_edge.h" +#include "../subdiv/tessellation_cache.h" +#include "../subdiv/catmullclark_coefficients.h" +#include "../subdiv/patch.h" +#include "../../common/algorithms/parallel_map.h" +#include "../../common/algorithms/parallel_set.h" + +namespace embree +{ + class SubdivMesh : public Geometry + { + ALIGNED_CLASS_(16); + public: + + typedef HalfEdge::Edge Edge; + + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH; + + /*! 
structure used to sort half edges using radix sort by their key */ + struct KeyHalfEdge + { + KeyHalfEdge() {} + + KeyHalfEdge (uint64_t key, HalfEdge* edge) + : key(key), edge(edge) {} + + __forceinline operator uint64_t() const { + return key; + } + + friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) { + return e0.key < e1.key; + } + + public: + uint64_t key; + HalfEdge* edge; + }; + + public: + + /*! subdiv mesh construction */ + SubdivMesh(Device* device); + + public: + void setMask (unsigned mask); + void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode); + void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setTopologyCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void setTessellationRate(float N); + bool verify(); + void commit(); + void addElementsToCount (GeometryCounts & counts) const; + void setDisplacementFunction (RTCDisplacementFunctionN func); + unsigned int getFirstHalfEdge(unsigned int faceID); + unsigned int getFace(unsigned int edgeID); + unsigned int getNextHalfEdge(unsigned int edgeID); + unsigned int getPreviousHalfEdge(unsigned int edgeID); + unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID); + + public: + + /*! return the number of faces */ + size_t numFaces() const { + return faceVertices.size(); + } + + /*! return the number of edges */ + size_t numEdges() const { + return topology[0].vertexIndices.size(); + } + + /*! return the number of vertices */ + size_t numVertices() const { + return vertices[0].size(); + } + + /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t j = 0) const { + return topology[0].getHalfEdge(i)->bounds(vertices[j]); + } + + /*! check if the i'th primitive is valid */ + __forceinline bool valid(size_t i) const { + return topology[0].valid(i) && !invalidFace(i); + } + + /*! check if the i'th primitive is valid for the j'th time range */ + __forceinline bool valid(size_t i, size_t j) const { + return topology[0].valid(i) && !invalidFace(i,j); + } + + /*! prints some statistics */ + void printStatistics(); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + public: + + /*! returns the vertex buffer for some time step */ + __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const { + return vertices[t]; + } + + /* returns tessellation level of edge */ + __forceinline float getEdgeLevel(const size_t i) const + { + if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level? + else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level? + } + + public: + RTCDisplacementFunctionN displFunc; //!< displacement function + + /*! all buffers in this section are provided by the application */ + public: + + /*! the topology contains all data that may differ when + * interpolating different user data buffers */ + struct Topology + { + public: + + /*! Default topology construction */ + Topology () : halfEdges(nullptr,0) {} + + /*! 
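Half edges are linked into a connectivity structure by sorting them with a key that is identical for the two half edges of the same undirected edge, so opposite half edges end up adjacent after sorting. The exact key construction lives in `Topology::calculateHalfEdges`; a common formulation consistent with the `KeyHalfEdge` comment above would be:

```
#include <algorithm>
#include <cstdint>

// 64-bit key shared by both half edges of the undirected edge (v0, v1).
inline uint64_t edgeKey(uint32_t v0, uint32_t v1)
{
  const uint32_t lo = std::min(v0, v1);
  const uint32_t hi = std::max(v0, v1);
  return (uint64_t(lo) << 32) | uint64_t(hi);
}
```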
Topology initialization */ + Topology (SubdivMesh* mesh); + + /*! make the class movable */ + public: + Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + : mesh(std::move(other.mesh)), + vertexIndices(std::move(other.vertexIndices)), + subdiv_mode(std::move(other.subdiv_mode)), + halfEdges(std::move(other.halfEdges)), + halfEdges0(std::move(other.halfEdges0)), + halfEdges1(std::move(other.halfEdges1)) {} + + Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + { + mesh = std::move(other.mesh); + vertexIndices = std::move(other.vertexIndices); + subdiv_mode = std::move(other.subdiv_mode); + halfEdges = std::move(other.halfEdges); + halfEdges0 = std::move(other.halfEdges0); + halfEdges1 = std::move(other.halfEdges1); + return *this; + } + + public: + /*! check if the i'th primitive is valid in this topology */ + __forceinline bool valid(size_t i) const + { + if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) { + if (getHalfEdge(i)->faceHasBorder()) return false; + } + return true; + } + + /*! updates the interpolation mode for the topology */ + void setSubdivisionMode (RTCSubdivisionMode mode); + + /*! marks all buffers as modified */ + void update (); + + /*! verifies index array */ + bool verify (size_t numVertices); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + private: + + /*! recalculates the half edges */ + void calculateHalfEdges(); + + /*! updates half edges when recalculation is not necessary */ + void updateHalfEdges(); + + /*! user input data */ + public: + + SubdivMesh* mesh; + + /*! indices of the vertices composing each face */ + BufferView<unsigned int> vertexIndices; + + /*! subdiv interpolation mode */ + RTCSubdivisionMode subdiv_mode; + + /*! generated data */ + public: + + /*! returns the start half edge for face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { + return &halfEdges[mesh->faceStartEdge[f]]; + } + + /*! Half edge structure, generated by initHalfEdgeStructures */ + mvector<HalfEdge> halfEdges; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! two arrays used to sort the half edges */ + std::vector<KeyHalfEdge> halfEdges0; + std::vector<KeyHalfEdge> halfEdges1; + }; + + /*! returns the start half edge for topology t and face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { + return topology[t].getHalfEdge(f); + } + + /*! buffer containing the number of vertices for each face */ + BufferView<unsigned int> faceVertices; + + /*! array of topologies */ + vector<Topology> topology; + + /*! vertex buffer (one buffer for each time step) */ + vector<BufferView<Vec3fa>> vertices; + + /*! user data buffers */ + vector<RawBufferView> vertexAttribs; + + /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */ + BufferView<Edge> edge_creases; + + /*! edge crease weights for each edge of the edge_creases buffer */ + BufferView<float> edge_crease_weights; + + /*! vertex crease buffer containing all vertices that carry vertex crease weights */ + BufferView<unsigned int> vertex_creases; + + /*! vertex crease weights for each vertex of the vertex_creases buffer */ + BufferView<float> vertex_crease_weights; + + /*! 
subdivision level for each half edge of the vertexIndices buffer */ + BufferView<float> levels; + float tessellationRate; // constant rate that is used when levels is not set + + /*! buffer that marks specific faces as holes */ + BufferView<unsigned> holes; + + /*! all data in this section is generated by initializeHalfEdgeStructures function */ + private: + + /*! number of half edges used by faces */ + size_t numHalfEdges; + + /*! fast lookup table to find the first half edge for some face */ + mvector<uint32_t> faceStartEdge; + + /*! fast lookup table to find the face for some half edge */ + mvector<uint32_t> halfEdgeFace; + + /*! set with all holes */ + parallel_set<uint32_t> holeSet; + + /*! fast lookup table to detect invalid faces */ + mvector<char> invalid_face; + + /*! test if face i is invalid in timestep j */ + __forceinline char& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } + __forceinline const char& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } + + /*! interpolation cache */ + public: + static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; } + static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; } + static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) { + const size_t slots = numInterpolationSlots4(stride); + assert(slot < slots); + return slots*prim+slot; + } + std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags; + std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags; + std::vector<Patch3fa::Ref> patch_eval_trees; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! map with all vertex creases */ + parallel_map<uint32_t,float> vertexCreaseMap; + + /*! map with all edge creases */ + parallel_map<uint64_t,float> edgeCreaseMap; + + protected: + + /*! 
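The application-provided buffers above (face vertex counts, indices, vertices, crease and hole data, per-edge levels) map directly onto the public subdivision API. A hedged setup example for a single quad face with per-edge tessellation levels, based on the documented Embree 3 buffer types and formats; crease buffers and error handling omitted:

```
#include <embree3/rtcore.h>

RTCGeometry makeSubdivQuad(RTCDevice device)
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_SUBDIVISION);

  float* verts = (float*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, 3 * sizeof(float), 4);
  const float v[4][3] = { {0,0,0}, {1,0,0}, {1,1,0}, {0,1,0} };
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 3; j++)
      verts[3*i+j] = v[i][j];

  unsigned* indices = (unsigned*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT, sizeof(unsigned), 4);
  for (unsigned i = 0; i < 4; i++) indices[i] = i;

  unsigned* faces = (unsigned*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_FACE, 0, RTC_FORMAT_UINT, sizeof(unsigned), 1);
  faces[0] = 4;   // the single face has 4 vertices

  float* levels = (float*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_LEVEL, 0, RTC_FORMAT_FLOAT, sizeof(float), 4);
  for (int i = 0; i < 4; i++) levels[i] = 8.0f;   // tessellation rate per half edge

  rtcCommitGeometry(geom);
  return geom;
}
```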
counts number of geometry commits */ + size_t commitCounter; + }; + + namespace isa + { + struct SubdivMeshISA : public SubdivMesh + { + SubdivMeshISA (Device* device) + : SubdivMesh(device) {} + + void interpolate(const RTCInterpolateArguments* const args); + void interpolateN(const RTCInterpolateNArguments* const args); + }; + } + + DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*); +}; diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp new file mode 100644 index 0000000000..3bbd7e51ae --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp @@ -0,0 +1,194 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene_triangle_mesh.h" +#include "scene.h" + +namespace embree +{ +#if defined(EMBREE_LOWEST_ISA) + + TriangleMesh::TriangleMesh (Device* device) + : Geometry(device,GTY_TRIANGLE_MESH,0,1) + { + vertices.resize(numTimeSteps); + } + + void TriangleMesh::setMask (unsigned mask) + { + this->mask = mask; + Geometry::update(); + } + + void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps) + { + vertices.resize(numTimeSteps); + Geometry::setNumTimeSteps(numTimeSteps); + } + + void TriangleMesh::setVertexAttributeCount (unsigned int N) + { + vertexAttribs.resize(N); + Geometry::update(); + } + + void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) + { + /* verify that all accesses are 4 bytes aligned */ + if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned"); + + if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (format != RTC_FORMAT_FLOAT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format"); + + /* if buffer is larger than 16GB the premultiplied index optimization does not work */ + if (stride*num > 16ll*1024ll*1024ll*1024ll) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large"); + + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot"); + + vertices[slot].set(buffer, offset, stride, num, format); + vertices[slot].checkPadding16(); + vertices0 = vertices[0]; + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format"); + + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot"); + + vertexAttribs[slot].set(buffer, offset, stride, num, format); + vertexAttribs[slot].checkPadding16(); + } + else if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + if (format != RTC_FORMAT_UINT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format"); + + triangles.set(buffer, offset, stride, num, format); + setNumPrimitives(num); + } + else + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return triangles.getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertices[slot].getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertexAttribs[slot].getPtr(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + return nullptr; + } + } + + void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + triangles.setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertices[slot].setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertexAttribs[slot].setModified(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + Geometry::update(); + } + + void TriangleMesh::commit() + { + /* verify that stride of all time steps are identical */ + for (unsigned int t=0; t<numTimeSteps; t++) + if (vertices[t].getStride() != vertices[0].getStride()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step"); + + Geometry::commit(); + } + + void TriangleMesh::addElementsToCount (GeometryCounts & counts) const + { + if (numTimeSteps == 1) counts.numTriangles += numPrimitives; + else counts.numMBTriangles += numPrimitives; + } + + bool TriangleMesh::verify() + { + /*! verify size of vertex arrays */ + if (vertices.size() == 0) return false; + for (const auto& buffer : vertices) + if (buffer.size() != numVertices()) + return false; + + /*! verify size of user vertex arrays */ + for (const auto& buffer : vertexAttribs) + if (buffer.size() != numVertices()) + return false; + + /*! verify triangle indices */ + for (size_t i=0; i<size(); i++) { + if (triangles[i].v[0] >= numVertices()) return false; + if (triangles[i].v[1] >= numVertices()) return false; + if (triangles[i].v[2] >= numVertices()) return false; + } + + /*! verify vertices */ + for (const auto& buffer : vertices) + for (size_t i=0; i<buffer.size(); i++) + if (!isvalid(buffer[i])) + return false; + + return true; + } + + void TriangleMesh::interpolate(const RTCInterpolateArguments* const args) { + interpolate_impl<4>(args); + } + +#endif + + namespace isa + { + TriangleMesh* createTriangleMesh(Device* device) { + return new TriangleMeshISA(device); + } + } +} diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.h b/thirdparty/embree/kernels/common/scene_triangle_mesh.h new file mode 100644 index 0000000000..ad3f602fde --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.h @@ -0,0 +1,318 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Triangle Mesh */ + struct TriangleMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH; + + /*! triangle indices */ + struct Triangle + { + uint32_t v[3]; + + /*! 
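`TriangleMesh::setBuffer` above enforces 4-byte alignment of `pointer + offset` and of the stride, `RTC_FORMAT_FLOAT3` vertices, `RTC_FORMAT_UINT3` indices, and a 16 GB vertex buffer limit. A construction example against the public API (`rtcSetNewGeometryBuffer` allocates storage that already satisfies the alignment check); error handling omitted:

```
#include <embree3/rtcore.h>

RTCGeometry makeTriangle(RTCDevice device)
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_TRIANGLE);

  float* verts = (float*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, 3 * sizeof(float), 3);
  verts[0] = 0; verts[1] = 0; verts[2] = 0;   // v0
  verts[3] = 1; verts[4] = 0; verts[5] = 0;   // v1
  verts[6] = 0; verts[7] = 1; verts[8] = 0;   // v2

  unsigned* idx = (unsigned*) rtcSetNewGeometryBuffer(
      geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, 3 * sizeof(unsigned), 1);
  idx[0] = 0; idx[1] = 1; idx[2] = 2;

  rtcCommitGeometry(geom);
  return geom;
}
```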
outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) { + return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }"; + } + }; + + public: + + /*! triangle mesh construction */ + TriangleMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + float v = args->v; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + size_t ofs = i*sizeof(float); + const float w = 1.0f-u-v; + const Triangle& tri = triangle(primID); + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]); + + if (P) { + mem<vfloat<N>>::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2))); + } + if (dPdu) { + assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,p1-p0); + assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,p2-p0); + } + if (ddPdudu) { + assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero)); + assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero)); + } + } + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th triangle*/ + __forceinline const Triangle& triangle(size_t i) const { + return triangles[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! 
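The template above is the backend of the public interpolation entry points: given a hit's `primID` and barycentric `(u, v)`, it evaluates a vertex or vertex-attribute buffer. A hedged usage sketch with the documented `rtcInterpolate1` helper (assumes the geometry has a 3-component attribute bound in slot 0 and has been committed; names are illustrative):

```
#include <embree3/rtcore.h>

// Evaluate a vertex color stored in RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE slot 0
// at a hit location; dcolor_du/dv receive the first derivatives.
void evalVertexColor(RTCGeometry geom, unsigned primID, float u, float v, float color[3])
{
  float dcolor_du[3], dcolor_dv[3];
  rtcInterpolate1(geom, primID, u, v,
                  RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0 /*bufferSlot*/,
                  color, dcolor_du, dcolor_dv, 3 /*valueCount*/);
  // color now holds w*c0 + u*c1 + v*c2 with w = 1 - u - v (see interpolate_impl above).
}
```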
returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th triangle */ + __forceinline BBox3fa bounds(size_t i) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! calculates the bounds of the i'th triangle at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0],itime); + const Vec3fa v1 = vertex(tri.v[1],itime); + const Vec3fa v2 = vertex(tri.v[2],itime); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(tri.v[0],itime))) return false; + if (!isvalid(vertex(tri.v[1],itime))) return false; + if (!isvalid(vertex(tri.v[2],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t t=0; t<numTimeSteps; t++) + { + const Vec3fa v0 = vertex(tri.v[0],t); + const Vec3fa v1 = vertex(tri.v[1],t); + const Vec3fa v2 = vertex(tri.v[2],t); + if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2))) + return false; + } + + if (likely(bbox)) + *bbox = bounds(i); + + return true; + } + + /*! 
calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return triangles.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return triangles.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return areaProjectedTriangle(v0,v1,v2); + } + + public: + BufferView<Triangle> triangles; //!< array of triangles + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<RawBufferView> vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct TriangleMeshISA : public TriangleMesh + { + TriangleMeshISA (Device* device) + : TriangleMesh(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = 
prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_user_geometry.h b/thirdparty/embree/kernels/common/scene_user_geometry.h new file mode 100644 index 0000000000..2867b18b79 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_user_geometry.h @@ -0,0 +1,77 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accelset.h" + +namespace embree +{ + /*! User geometry with user defined intersection functions */ + struct UserGeometry : public AccelSet + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY; + + public: + UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); + virtual void setMask (unsigned mask); + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect); + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded); + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const; + }; + + namespace isa + { + struct UserGeometryISA : public UserGeometry + { + UserGeometryISA (Device* device) + : UserGeometry(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*); +} diff --git a/thirdparty/embree/kernels/common/stack_item.h b/thirdparty/embree/kernels/common/stack_item.h new file mode 100644 index 0000000000..c31c64e862 --- /dev/null +++ b/thirdparty/embree/kernels/common/stack_item.h @@ -0,0 +1,125 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! 
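`UserGeometry` wraps application-defined primitives: the application supplies the primitive count, a bounds callback used by the BVH builder, and intersect/occluded callbacks invoked during traversal. A hedged sketch using the documented callback argument structs (the sphere payload and helper names are illustrative; the intersect/occluded callbacks are omitted):

```
#include <embree3/rtcore.h>

struct MySphere { float x, y, z, r; };

// Bounds callback: the builder asks for the AABB of primitive args->primID.
static void sphereBoundsFunc(const struct RTCBoundsFunctionArguments* args)
{
  const MySphere* spheres = (const MySphere*) args->geometryUserPtr;
  const MySphere& s = spheres[args->primID];
  args->bounds_o->lower_x = s.x - s.r;  args->bounds_o->upper_x = s.x + s.r;
  args->bounds_o->lower_y = s.y - s.r;  args->bounds_o->upper_y = s.y + s.r;
  args->bounds_o->lower_z = s.z - s.r;  args->bounds_o->upper_z = s.z + s.r;
}

RTCGeometry makeUserSpheres(RTCDevice device, MySphere* spheres, unsigned count)
{
  RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_USER);
  rtcSetGeometryUserPrimitiveCount(geom, count);
  rtcSetGeometryUserData(geom, spheres);             // delivered as args->geometryUserPtr
  rtcSetGeometryBoundsFunction(geom, sphereBoundsFunc, nullptr);
  // rtcSetGeometryIntersectFunction / rtcSetGeometryOccludedFunction would be set here.
  rtcCommitGeometry(geom);
  return geom;
}
```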
An item on the stack holds the node ID and distance of that node. */ + template<typename T> + struct __aligned(16) StackItemT + { + /*! assert that the xchg function works */ + static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed"); + + __forceinline StackItemT() {} + + __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} + + /*! use SSE instructions to swap stack items */ + __forceinline static void xchg(StackItemT& a, StackItemT& b) + { + const vfloat4 sse_a = vfloat4::load((float*)&a); + const vfloat4 sse_b = vfloat4::load((float*)&b); + vfloat4::store(&a,sse_b); + vfloat4::store(&b,sse_a); + } + + /*! Sort 2 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2) { + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 3 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s3.dist < s2.dist) xchg(s3,s2); + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s4.dist < s3.dist) xchg(s4,s3); + if (s3.dist < s1.dist) xchg(s3,s1); + if (s4.dist < s2.dist) xchg(s4,s2); + if (s3.dist < s2.dist) xchg(s3,s2); + } + + /*! use SSE instructions to swap stack items */ + __forceinline static void cmp_xchg(vint4& a, vint4& b) + { +#if defined(__AVX512VL__) + const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); +#else + const vboolf4 mask0(b < a); + const vboolf4 mask(shuffle<2,2,2,2>(mask0)); +#endif + const vint4 c = select(mask,b,a); + const vint4 d = select(mask,a,b); + a = c; + b = d; + } + + /*! Sort 3 stack items. */ + __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3) + { + cmp_xchg(s2,s1); + cmp_xchg(s3,s2); + cmp_xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4) + { + cmp_xchg(s2,s1); + cmp_xchg(s4,s3); + cmp_xchg(s3,s1); + cmp_xchg(s4,s2); + cmp_xchg(s3,s2); + } + + + /*! Sort N stack items. */ + __forceinline friend void sort(StackItemT* begin, StackItemT* end) + { + for (StackItemT* i = begin+1; i != end; ++i) + { + const vfloat4 item = vfloat4::load((float*)i); + const unsigned dist = i->dist; + StackItemT* j = i; + + while ((j != begin) && ((j-1)->dist < dist)) + { + vfloat4::store(j, vfloat4::load((float*)(j-1))); + --j; + } + + vfloat4::store(j, item); + } + } + + public: + T ptr; + unsigned dist; + }; + + /*! An item on the stack holds the node ID and active ray mask. 
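`sort(s1, s2, s3, s4)` above is a five-comparator sorting network: after the exchanges the four items are ordered by ascending `dist` from the first argument to the last, which lets the traversal pop nodes in order of increasing distance. A scalar rendering of the same network with `std::swap` in place of the SSE exchange (stand-in item type):

```
#include <utility>   // std::swap

struct Item { void* ptr; unsigned dist; };

// Put the smaller-distance item into 'lo'.
inline void orderByDist(Item& lo, Item& hi) { if (hi.dist < lo.dist) std::swap(lo, hi); }

// Same five compare-exchanges as StackItemT::sort(s1, s2, s3, s4).
inline void sortFour(Item& s1, Item& s2, Item& s3, Item& s4)
{
  orderByDist(s1, s2);
  orderByDist(s3, s4);
  orderByDist(s1, s3);   // s1 now holds the overall minimum
  orderByDist(s2, s4);   // s4 now holds the overall maximum
  orderByDist(s2, s3);
}
```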
*/ + template<typename T> + struct __aligned(8) StackItemMaskT + { + T ptr; + size_t mask; + }; + + struct __aligned(8) StackItemMaskCoherent + { + size_t mask; + size_t parent; + size_t child; + }; +} diff --git a/thirdparty/embree/kernels/common/stat.cpp b/thirdparty/embree/kernels/common/stat.cpp new file mode 100644 index 0000000000..ebb77cd534 --- /dev/null +++ b/thirdparty/embree/kernels/common/stat.cpp @@ -0,0 +1,128 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stat.h" + +namespace embree +{ + Stat Stat::instance; + + Stat::Stat () { + } + + Stat::~Stat () + { +#ifdef EMBREE_STAT_COUNTERS + Stat::print(std::cout); +#endif + } + + void Stat::print(std::ostream& cout) + { + Counters& cntrs = instance.cntrs; + Counters::Data& data = instance.cntrs.code; + //Counters::Data& data = instance.cntrs.active; + + /* print absolute numbers */ + cout << "--------- ABSOLUTE ---------" << std::endl; + cout << " #normal_travs = " << float(data.normal.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.normal.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.normal.trav_xfm_nodes )*1E-6 << "M" << std::endl; + cout << " #leaves = " << float(data.normal.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.normal.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.normal.trav_prim_hits )*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t normal_box_hits = 0; + size_t weighted_box_hits = 0; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) { + normal_box_hits += data.normal.trav_hit_boxes[i]; + weighted_box_hits += data.normal.trav_hit_boxes[i]*i; + } + cout << " #hit_boxes = " << normal_box_hits << " (total) distribution: "; + float average = 0.0f; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) + { + float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits; + cout << "[" << i << "] " << value << " "; + average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits; + } + cout << " average = " << average << std::endl; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " "; + cout << std::endl; + + if (data.shadow.travs) { + cout << " #shadow_travs = " << float(data.shadow.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.shadow.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl; + cout << " #leaves = " << float(data.shadow.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.shadow.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t shadow_box_hits = 0; + size_t weighted_shadow_box_hits = 0; + + for (size_t i=0;i<SIZE_HISTOGRAM;i++) { + shadow_box_hits += data.shadow.trav_hit_boxes[i]; + weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i; + } + cout << " #hit_boxes = "; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " "; 
+ cout << std::endl; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " "; + cout << std::endl; + } + cout << std::endl; + + /* print per traversal numbers */ + cout << "--------- PER TRAVERSAL ---------" << std::endl; + float active_normal_travs = float(cntrs.active.normal.travs )/float(cntrs.all.normal.travs ); + float active_normal_trav_nodes = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes ); + float active_normal_trav_xfm_nodes = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes ); + float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves); + float active_normal_trav_prims = float(cntrs.active.normal.trav_prims )/float(cntrs.all.normal.trav_prims ); + float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits )/float(cntrs.all.normal.trav_prim_hits ); + float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop )/float(cntrs.all.normal.trav_stack_pop ); + + cout << " #normal_travs = " << float(cntrs.code.normal.travs )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs << "% active" << std::endl; + cout << " #nodes = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes << "% active" << std::endl; + cout << " #node_xfm = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes << "% active" << std::endl; + cout << " #leaves = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl; + cout << " #prims = " << float(cntrs.code.normal.trav_prims )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims << "% active" << std::endl; + cout << " #prim_hits = " << float(cntrs.code.normal.trav_prim_hits )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits << "% active" << std::endl; + cout << " #stack_pop = " << float(cntrs.code.normal.trav_stack_pop )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop << "% active" << std::endl; + + if (cntrs.all.shadow.travs) { + float active_shadow_travs = float(cntrs.active.shadow.travs )/float(cntrs.all.shadow.travs ); + float active_shadow_trav_nodes = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes ); + float active_shadow_trav_xfm_nodes = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes ); + float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves); + float active_shadow_trav_prims = float(cntrs.active.shadow.trav_prims )/float(cntrs.all.shadow.trav_prims ); + float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits )/float(cntrs.all.shadow.trav_prim_hits ); + + cout << " #shadow_travs = " << float(cntrs.code.shadow.travs )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs << "% active" << std::endl; + cout << " #nodes = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes << "% active" << std::endl; + cout << " #nodes_xfm = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes << "% active" << std::endl; + cout << " #leaves = " << 
float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl; + cout << " #prims = " << float(cntrs.code.shadow.trav_prims )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims << "% active" << std::endl; + cout << " #prim_hits = " << float(cntrs.code.shadow.trav_prim_hits )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits << "% active" << std::endl; + + } + cout << std::endl; + + /* print user counters for performance tuning */ + cout << "--------- USER ---------" << std::endl; + for (size_t i=0; i<10; i++) + cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl; + + cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl; + cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl; + cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl; + cout << std::endl; + } +} diff --git a/thirdparty/embree/kernels/common/stat.h b/thirdparty/embree/kernels/common/stat.h new file mode 100644 index 0000000000..02fc07e67f --- /dev/null +++ b/thirdparty/embree/kernels/common/stat.h @@ -0,0 +1,116 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +/* Macros to gather statistics */ +#ifdef EMBREE_STAT_COUNTERS +# define STAT(x) x +# define STAT3(s,x,y,z) \ + STAT(Stat::get().code .s+=x); \ + STAT(Stat::get().active.s+=y); \ + STAT(Stat::get().all .s+=z); +# define STAT_USER(i,x) Stat::get().user[i]+=x; +#else +# define STAT(x) +# define STAT3(s,x,y,z) +# define STAT_USER(i,x) +#endif + +namespace embree +{ + /*! Gathers ray tracing statistics. We count 1) how often a code + * location is reached, 2) how many SIMD lanes are active, 3) how + * many SIMD lanes reach the code location */ + class Stat + { + public: + + static const size_t SIZE_HISTOGRAM = 64+1; + + /*! constructs stat counter class */ + Stat (); + + /*! 
destructs stat counter class */ + ~Stat (); + + class Counters + { + public: + Counters () { + clear(); + } + + void clear() + { + all.clear(); + active.clear(); + code.clear(); + for (auto& u : user) u.store(0); + } + + public: + + /* per packet and per ray stastics */ + struct Data + { + void clear () { + normal.clear(); + shadow.clear(); + point_query.clear(); + } + + /* normal and shadow ray statistics */ + struct + { + void clear() + { + travs.store(0); + trav_nodes.store(0); + trav_leaves.store(0); + trav_prims.store(0); + trav_prim_hits.store(0); + for (auto& v : trav_hit_boxes) v.store(0); + trav_stack_pop.store(0); + trav_stack_nodes.store(0); + trav_xfm_nodes.store(0); + } + + public: + std::atomic<size_t> travs; + std::atomic<size_t> trav_nodes; + std::atomic<size_t> trav_leaves; + std::atomic<size_t> trav_prims; + std::atomic<size_t> trav_prim_hits; + std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1]; + std::atomic<size_t> trav_stack_pop; + std::atomic<size_t> trav_stack_nodes; + std::atomic<size_t> trav_xfm_nodes; + + } normal, shadow, point_query; + } all, active, code; + + std::atomic<size_t> user[10]; + }; + + public: + + static __forceinline Counters& get() { + return instance.cntrs; + } + + static void clear() { + instance.cntrs.clear(); + } + + static void print(embree_ostream cout); + + private: + Counters cntrs; + + private: + static Stat instance; + }; +} diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp new file mode 100644 index 0000000000..01c862da0c --- /dev/null +++ b/thirdparty/embree/kernels/common/state.cpp @@ -0,0 +1,519 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "state.h" +#include "../../common/lexers/streamfilters.h" + +namespace embree +{ + MutexSys g_printMutex; + + State::ErrorHandler State::g_errorHandler; + + State::ErrorHandler::ErrorHandler() + : thread_error(createTls()) {} + + State::ErrorHandler::~ErrorHandler() + { + Lock<MutexSys> lock(errors_mutex); + for (size_t i=0; i<thread_errors.size(); i++) + delete thread_errors[i]; + destroyTls(thread_error); + thread_errors.clear(); + } + + RTCError* State::ErrorHandler::error() + { + RTCError* stored_error = (RTCError*) getTls(thread_error); + if (stored_error) return stored_error; + + Lock<MutexSys> lock(errors_mutex); + stored_error = new RTCError(RTC_ERROR_NONE); + thread_errors.push_back(stored_error); + setTls(thread_error,stored_error); + return stored_error; + } + + State::State () + : enabled_cpu_features(getCPUFeatures()), + enabled_builder_cpu_features(enabled_cpu_features), + frequency_level(FREQUENCY_SIMD256) + { + tri_accel = "default"; + tri_builder = "default"; + tri_traverser = "default"; + + tri_accel_mb = "default"; + tri_builder_mb = "default"; + tri_traverser_mb = "default"; + + quad_accel = "default"; + quad_builder = "default"; + quad_traverser = "default"; + + quad_accel_mb = "default"; + quad_builder_mb = "default"; + quad_traverser_mb = "default"; + + line_accel = "default"; + line_builder = "default"; + line_traverser = "default"; + + line_accel_mb = "default"; + line_builder_mb = "default"; + line_traverser_mb = "default"; + + hair_accel = "default"; + hair_builder = "default"; + hair_traverser = "default"; + + hair_accel_mb = "default"; + hair_builder_mb = "default"; + hair_traverser_mb = "default"; + + object_accel = "default"; + object_builder = "default"; + object_accel_min_leaf_size = 1; + object_accel_max_leaf_size = 1; + + object_accel_mb = "default"; + 
object_builder_mb = "default"; + object_accel_mb_min_leaf_size = 1; + object_accel_mb_max_leaf_size = 1; + + max_spatial_split_replications = 1.2f; + useSpatialPreSplits = false; + + tessellation_cache_size = 128*1024*1024; + + subdiv_accel = "default"; + subdiv_accel_mb = "default"; + + grid_accel = "default"; + grid_builder = "default"; + grid_accel_mb = "default"; + grid_builder_mb = "default"; + + instancing_open_min = 0; + instancing_block_size = 0; + instancing_open_factor = 8.0f; + instancing_open_max_depth = 32; + instancing_open_max = 50000000; + + float_exceptions = false; + quality_flags = -1; + scene_flags = -1; + verbose = 0; + benchmark = 0; + + numThreads = 0; + numUserThreads = 0; + +#if TASKING_INTERNAL + set_affinity = true; +#else + set_affinity = false; +#endif + + start_threads = false; + enable_selockmemoryprivilege = false; +#if defined(__LINUX__) + hugepages = true; +#else + hugepages = false; +#endif + hugepages_success = true; + + alloc_main_block_size = 0; + alloc_num_main_slots = 0; + alloc_thread_block_size = 0; + alloc_single_thread_alloc = -1; + + error_function = nullptr; + error_function_userptr = nullptr; + + memory_monitor_function = nullptr; + memory_monitor_userptr = nullptr; + } + + State::~State() { + } + + bool State::hasISA(const int isa) { + return (enabled_cpu_features & isa) == isa; + } + + bool State::checkISASupport() { + return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; + } + + void State::verify() + { + /* verify that calculations stay in range */ + assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX); + + /* here we verify that CPP files compiled for a specific ISA only + * call that same or lower ISA version of non-inlined class member + * functions */ +#if defined(DEBUG) +#if defined(EMBREE_TARGET_SSE2) + assert(sse2::getISA() <= SSE2); +#endif +#if defined(EMBREE_TARGET_SSE42) + assert(sse42::getISA() <= SSE42); +#endif +#if defined(EMBREE_TARGET_AVX) + assert(avx::getISA() <= AVX); +#endif +#if defined(EMBREE_TARGET_AVX2) + assert(avx2::getISA() <= AVX2); +#endif +#if defined (EMBREE_TARGET_AVX512) + assert(avx512::getISA() <= AVX512); +#endif +#endif + } + + const char* symbols[3] = { "=", ",", "|" }; + + bool State::parseFile(const FileName& fileName) + { + FILE* f = fopen(fileName.c_str(),"r"); + if (!f) return false; + Ref<Stream<int> > file = new FileStream(f,fileName); + + std::vector<std::string> syms; + for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) + syms.push_back(symbols[i]); + + Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(file,"#"), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + return true; + } + + void State::parseString(const char* cfg) + { + if (cfg == nullptr) return; + + std::vector<std::string> syms; + for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) + syms.push_back(symbols[i]); + + Ref<TokenStream> cin = new TokenStream(new StrStream(cfg), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + } + + int string_to_cpufeatures(const std::string& isa) + { + if (isa == "sse" ) return SSE; + else if (isa == "sse2") return SSE2; + else if (isa == "sse3") return SSE3; + else if (isa == "ssse3") return SSSE3; + else if (isa == "sse41") return SSE41; + else if (isa == "sse4.1") return SSE41; + else if (isa == "sse42") return SSE42; + else if (isa == "sse4.2") return SSE42; + else if (isa == "avx") return AVX; + else if (isa == "avxi") 
return AVXI; + else if (isa == "avx2") return AVX2; + else if (isa == "avx512") return AVX512; + else return SSE2; + } + + void State::parse(Ref<TokenStream> cin) + { + /* parse until end of stream */ + while (cin->peek() != Token::Eof()) + { + const Token tok = cin->get(); + + if (tok == Token::Id("threads") && cin->trySymbol("=")) + numThreads = cin->get().Int(); + + else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) + numUserThreads = cin->get().Int(); + + else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) + start_threads = cin->get().Int(); + + else if (tok == Token::Id("isa") && cin->trySymbol("=")) { + std::string isa_str = toLowerCase(cin->get().Identifier()); + enabled_cpu_features = string_to_cpufeatures(isa_str); + enabled_builder_cpu_features = enabled_cpu_features; + } + + else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { + std::string isa_str = toLowerCase(cin->get().Identifier()); + enabled_cpu_features &= string_to_cpufeatures(isa_str); + enabled_builder_cpu_features &= enabled_cpu_features; + } + + else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { + std::string isa_str = toLowerCase(cin->get().Identifier()); + enabled_builder_cpu_features &= string_to_cpufeatures(isa_str); + } + + else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) { + std::string freq = cin->get().Identifier(); + if (freq == "simd128") frequency_level = FREQUENCY_SIMD128; + else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256; + else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512; + } + + else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) { + enable_selockmemoryprivilege = cin->get().Int(); + } + else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) { + hugepages = cin->get().Int(); + } + + else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) + float_exceptions = cin->get().Int(); + + else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) + tri_accel = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("=")) + tri_builder = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("=")) + tri_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("=")) + tri_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("=")) + tri_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("=")) + tri_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("=")) + quad_accel = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("=")) + quad_builder = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("=")) + quad_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("=")) + quad_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder_mb")) && 
cin->trySymbol("=")) + quad_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("=")) + quad_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel")) && cin->trySymbol("=")) + line_accel = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder")) && cin->trySymbol("=")) + line_builder = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("=")) + line_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("=")) + line_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("=")) + line_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("=")) + line_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel") && cin->trySymbol("=")) + hair_accel = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder") && cin->trySymbol("=")) + hair_builder = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser") && cin->trySymbol("=")) + hair_traverser = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("=")) + hair_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("=")) + hair_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("=")) + hair_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("object_accel") && cin->trySymbol("=")) + object_accel = cin->get().Identifier(); + else if (tok == Token::Id("object_builder") && cin->trySymbol("=")) + object_builder = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("=")) + object_accel_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("=")) + object_accel_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("=")) + object_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("=")) + object_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("=")) + object_accel_mb_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("=")) + object_accel_mb_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("=")) + instancing_open_min = cin->get().Int(); + else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { + instancing_block_size = cin->get().Int(); + instancing_open_factor = 0.0f; + } + else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("=")) + instancing_open_max_depth = cin->get().Int(); + else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { + instancing_block_size = 0; + instancing_open_factor = cin->get().Float(); + } + else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("=")) + instancing_open_max = cin->get().Int(); + + else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("=")) + subdiv_accel = cin->get().Identifier(); + else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("=")) + subdiv_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("grid_accel") && cin->trySymbol("=")) + 
grid_accel = cin->get().Identifier(); + else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("=")) + grid_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("verbose") && cin->trySymbol("=")) + verbose = cin->get().Int(); + else if (tok == Token::Id("benchmark") && cin->trySymbol("=")) + benchmark = cin->get().Int(); + + else if (tok == Token::Id("quality")) { + if (cin->trySymbol("=")) { + Token flag = cin->get(); + if (flag == Token::Id("low")) quality_flags = RTC_BUILD_QUALITY_LOW; + else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM; + else if (flag == Token::Id("high")) quality_flags = RTC_BUILD_QUALITY_HIGH; + } + } + + else if (tok == Token::Id("scene_flags")) { + scene_flags = 0; + if (cin->trySymbol("=")) { + do { + Token flag = cin->get(); + if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC; + else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT; + else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST; + } while (cin->trySymbol("|")); + } + } + + else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("=")) + max_spatial_split_replications = cin->get().Float(); + + else if (tok == Token::Id("presplits") && cin->trySymbol("=")) + useSpatialPreSplits = cin->get().Int() != 0 ? true : false; + + else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + + else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("=")) + alloc_main_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("=")) + alloc_num_main_slots = cin->get().Int(); + else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("=")) + alloc_thread_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("=")) + alloc_single_thread_alloc = cin->get().Int(); + + cin->trySymbol(","); // optional , separator + } + } + + bool State::verbosity(size_t N) { + return N <= verbose; + } + + void State::print() + { + std::cout << "general:" << std::endl; + std::cout << " build threads = " << numThreads << std::endl; + std::cout << " build user threads = " << numUserThreads << std::endl; + std::cout << " start_threads = " << start_threads << std::endl; + std::cout << " affinity = " << set_affinity << std::endl; + std::cout << " frequency_level = "; + switch (frequency_level) { + case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break; + case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break; + case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break; + default: std::cout << "error" << std::endl; break; + } + + std::cout << " hugepages = "; + if (!hugepages) std::cout << "disabled" << std::endl; + else if (hugepages_success) std::cout << "enabled" << std::endl; + else std::cout << "failed" << std::endl; + + std::cout << " verbosity = " << verbose << std::endl; + std::cout << " cache_size = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl; + std::cout << " max_spatial_split_replications = " << max_spatial_split_replications << std::endl; + + std::cout << "triangles:" << std::endl; + std::cout << " accel = " << tri_accel << std::endl; + std::cout << " builder = " << tri_builder << std::endl; + std::cout << " 
traverser = " << tri_traverser << std::endl; + + std::cout << "motion blur triangles:" << std::endl; + std::cout << " accel = " << tri_accel_mb << std::endl; + std::cout << " builder = " << tri_builder_mb << std::endl; + std::cout << " traverser = " << tri_traverser_mb << std::endl; + + std::cout << "quads:" << std::endl; + std::cout << " accel = " << quad_accel << std::endl; + std::cout << " builder = " << quad_builder << std::endl; + std::cout << " traverser = " << quad_traverser << std::endl; + + std::cout << "motion blur quads:" << std::endl; + std::cout << " accel = " << quad_accel_mb << std::endl; + std::cout << " builder = " << quad_builder_mb << std::endl; + std::cout << " traverser = " << quad_traverser_mb << std::endl; + + std::cout << "line segments:" << std::endl; + std::cout << " accel = " << line_accel << std::endl; + std::cout << " builder = " << line_builder << std::endl; + std::cout << " traverser = " << line_traverser << std::endl; + + std::cout << "motion blur line segments:" << std::endl; + std::cout << " accel = " << line_accel_mb << std::endl; + std::cout << " builder = " << line_builder_mb << std::endl; + std::cout << " traverser = " << line_traverser_mb << std::endl; + + std::cout << "hair:" << std::endl; + std::cout << " accel = " << hair_accel << std::endl; + std::cout << " builder = " << hair_builder << std::endl; + std::cout << " traverser = " << hair_traverser << std::endl; + + std::cout << "motion blur hair:" << std::endl; + std::cout << " accel = " << hair_accel_mb << std::endl; + std::cout << " builder = " << hair_builder_mb << std::endl; + std::cout << " traverser = " << hair_traverser_mb << std::endl; + + std::cout << "subdivision surfaces:" << std::endl; + std::cout << " accel = " << subdiv_accel << std::endl; + + std::cout << "grids:" << std::endl; + std::cout << " accel = " << grid_accel << std::endl; + std::cout << " builder = " << grid_builder << std::endl; + + std::cout << "motion blur grids:" << std::endl; + std::cout << " accel = " << grid_accel_mb << std::endl; + std::cout << " builder = " << grid_builder_mb << std::endl; + + std::cout << "object_accel:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_max_leaf_size << std::endl; + + std::cout << "object_accel_mb:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_mb_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_mb_max_leaf_size << std::endl; + } +} diff --git a/thirdparty/embree/kernels/common/state.h b/thirdparty/embree/kernels/common/state.h new file mode 100644 index 0000000000..33bcc843b2 --- /dev/null +++ b/thirdparty/embree/kernels/common/state.h @@ -0,0 +1,196 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* mutex to make printing to cout thread safe */ + extern MutexSys g_printMutex; + + struct State : public RefCount + { + public: + /*! state construction */ + State (); + + /*! state destruction */ + ~State(); + + /*! verifies that state is correct */ + void verify(); + + /*! parses state from a configuration file */ + bool parseFile(const FileName& fileName); + + /*! parses the state from a string */ + void parseString(const char* cfg); + + /*! parses the state from a stream */ + void parse(Ref<TokenStream> cin); + + /*! prints the state */ + void print(); + + /*! checks if verbosity level is at least N */ + bool verbosity(size_t N); + + /*! 
checks if some particular ISA is enabled */ + bool hasISA(const int isa); + + /*! check whether selected ISA is supported by the HW */ + bool checkISASupport(); + + public: + std::string tri_accel; //!< acceleration structure to use for triangles + std::string tri_builder; //!< builder to use for triangles + std::string tri_traverser; //!< traverser to use for triangles + + public: + std::string tri_accel_mb; //!< acceleration structure to use for motion blur triangles + std::string tri_builder_mb; //!< builder to use for motion blur triangles + std::string tri_traverser_mb; //!< traverser to use for triangles + + public: + std::string quad_accel; //!< acceleration structure to use for quads + std::string quad_builder; //!< builder to use for quads + std::string quad_traverser; //!< traverser to use for quads + + public: + std::string quad_accel_mb; //!< acceleration structure to use for motion blur quads + std::string quad_builder_mb; //!< builder to use for motion blur quads + std::string quad_traverser_mb; //!< traverser to use for motion blur quads + + public: + std::string line_accel; //!< acceleration structure to use for line segments + std::string line_builder; //!< builder to use for line segments + std::string line_traverser; //!< traverser to use for line segments + + public: + std::string line_accel_mb; //!< acceleration structure to use for motion blur line segments + std::string line_builder_mb; //!< builder to use for motion blur line segments + std::string line_traverser_mb; //!< traverser to use for motion blur line segments + + public: + std::string hair_accel; //!< hair acceleration structure to use + std::string hair_builder; //!< builder to use for hair + std::string hair_traverser; //!< traverser to use for hair + + public: + std::string hair_accel_mb; //!< acceleration structure to use for motion blur hair + std::string hair_builder_mb; //!< builder to use for motion blur hair + std::string hair_traverser_mb; //!< traverser to use for motion blur hair + + public: + std::string object_accel; //!< acceleration structure for user geometries + std::string object_builder; //!< builder for user geometries + int object_accel_min_leaf_size; //!< minimum leaf size for object acceleration structure + int object_accel_max_leaf_size; //!< maximum leaf size for object acceleration structure + + public: + std::string object_accel_mb; //!< acceleration structure for user geometries + std::string object_builder_mb; //!< builder for user geometries + int object_accel_mb_min_leaf_size; //!< minimum leaf size for mblur object acceleration structure + int object_accel_mb_max_leaf_size; //!< maximum leaf size for mblur object acceleration structure + + public: + std::string subdiv_accel; //!< acceleration structure to use for subdivision surfaces + std::string subdiv_accel_mb; //!< acceleration structure to use for subdivision surfaces + + public: + std::string grid_accel; //!< acceleration structure to use for grids + std::string grid_builder; //!< builder for grids + std::string grid_accel_mb; //!< acceleration structure to use for motion blur grids + std::string grid_builder_mb; //!< builder for motion blur grids + + public: + float max_spatial_split_replications; //!< maximally replications*N many primitives in accel for spatial splits + bool useSpatialPreSplits; //!< use spatial pre-splits instead of the full spatial split builder + size_t tessellation_cache_size; //!< size of the shared tessellation cache + + public: + size_t instancing_open_min; //!< instancing opens tree to 
minimally that number of subtrees + size_t instancing_block_size; //!< instancing opens tree up to average block size of primitives + float instancing_open_factor; //!< instancing opens tree up to x times the number of instances + size_t instancing_open_max_depth; //!< maximum open depth for geometries + size_t instancing_open_max; //!< instancing opens tree to maximally that number of subtrees + + public: + bool float_exceptions; //!< enable floating point exceptions + int quality_flags; + int scene_flags; + size_t verbose; //!< verbosity of output + size_t benchmark; //!< true + + public: + size_t numThreads; //!< number of threads to use in builders + size_t numUserThreads; //!< number of user provided threads to use in builders + bool set_affinity; //!< sets affinity for worker threads + bool start_threads; //!< true when threads should be started at device creation time + int enabled_cpu_features; //!< CPU ISA features to use + int enabled_builder_cpu_features; //!< CPU ISA features to use for builders only + enum FREQUENCY_LEVEL { + FREQUENCY_SIMD128, + FREQUENCY_SIMD256, + FREQUENCY_SIMD512 + } frequency_level; //!< frequency level the app wants to run on (default is SIMD256) + bool enable_selockmemoryprivilege; //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages + bool hugepages; //!< true if huge pages should get used + bool hugepages_success; //!< status for enabling huge pages + + public: + size_t alloc_main_block_size; //!< main allocation block size (shared between threads) + int alloc_num_main_slots; //!< number of such shared blocks to be used to allocate + size_t alloc_thread_block_size; //!< size of thread local allocator block size + int alloc_single_thread_alloc; //!< in single mode nodes and leaves use same thread local allocator + + public: + + /*! checks if we can use AVX */ + bool canUseAVX() { + return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128; + } + + /*! checks if we can use AVX2 */ + bool canUseAVX2() { + return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128; + } + + struct ErrorHandler + { + public: + ErrorHandler(); + ~ErrorHandler(); + RTCError* error(); + + public: + tls_t thread_error; + std::vector<RTCError*> thread_errors; + MutexSys errors_mutex; + }; + ErrorHandler errorHandler; + static ErrorHandler g_errorHandler; + + public: + void setErrorFunction(RTCErrorFunction fptr, void* uptr) + { + error_function = fptr; + error_function_userptr = uptr; + } + + RTCErrorFunction error_function; + void* error_function_userptr; + + public: + void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) + { + memory_monitor_function = fptr; + memory_monitor_userptr = uptr; + } + + RTCMemoryMonitorFunction memory_monitor_function; + void* memory_monitor_userptr; + }; +} diff --git a/thirdparty/embree/kernels/common/vector.h b/thirdparty/embree/kernels/common/vector.h new file mode 100644 index 0000000000..4b08275f3b --- /dev/null +++ b/thirdparty/embree/kernels/common/vector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "default.h" + +namespace embree +{ + /*! invokes the memory monitor callback */ + struct MemoryMonitorInterface { + virtual void memoryMonitor(ssize_t bytes, bool post) = 0; + }; + + /*! 
allocator that performs aligned monitored allocations */ + template<typename T, size_t alignment = 64> + struct aligned_monitored_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) + : device(device), hugepages(false) {} + + __forceinline pointer allocate( size_type n ) + { + if (n) { + assert(device); + device->memoryMonitor(n*sizeof(T),false); + } + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + { + pointer p = (pointer) os_malloc(n*sizeof(value_type),hugepages); + assert(p); + return p; + } + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) + { + if (p) + { + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + os_free(p,n*sizeof(value_type),hugepages); + else + alignedFree(p); + } + else assert(n == 0); + + if (n) { + assert(device); + device->memoryMonitor(-ssize_t(n)*sizeof(T),true); + } + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + private: + MemoryMonitorInterface* device; + bool hugepages; + }; + + /*! monitored vector */ + template<typename T> + using mvector = vector_t<T,aligned_monitored_allocator<T,std::alignment_of<T>::value> >; +} diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h new file mode 100644 index 0000000000..80a8ab2a56 --- /dev/null +++ b/thirdparty/embree/kernels/config.h @@ -0,0 +1,76 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +/* #undef EMBREE_RAY_MASK */ +/* #undef EMBREE_STAT_COUNTERS */ +/* #undef EMBREE_BACKFACE_CULLING */ +/* #undef EMBREE_BACKFACE_CULLING_CURVES */ +#define EMBREE_FILTER_FUNCTION +/* #undef EMBREE_IGNORE_INVALID_RAYS */ +#define EMBREE_GEOMETRY_TRIANGLE +/* #undef EMBREE_GEOMETRY_QUAD */ +/* #undef EMBREE_GEOMETRY_CURVE */ +/* #undef EMBREE_GEOMETRY_SUBDIVISION */ +/* #undef EMBREE_GEOMETRY_USER */ +/* #undef EMBREE_GEOMETRY_INSTANCE */ +/* #undef EMBREE_GEOMETRY_GRID */ +/* #undef EMBREE_GEOMETRY_POINT */ +/* #undef EMBREE_RAY_PACKETS */ +/* #undef EMBREE_COMPACT_POLYS */ + +#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + #define IF_ENABLED_TRIS(x) x +#else + #define IF_ENABLED_TRIS(x) +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + #define IF_ENABLED_QUADS(x) x +#else + #define IF_ENABLED_QUADS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_CURVES_OR_POINTS(x) x +#else + #define IF_ENABLED_CURVES_OR_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + #define IF_ENABLED_CURVES(x) x +#else + #define IF_ENABLED_CURVES(x) +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_POINTS(x) x +#else + #define IF_ENABLED_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + #define IF_ENABLED_SUBDIV(x) x +#else + #define IF_ENABLED_SUBDIV(x) +#endif + +#if defined(EMBREE_GEOMETRY_USER) + #define IF_ENABLED_USER(x) x +#else + #define IF_ENABLED_USER(x) +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + #define IF_ENABLED_INSTANCE(x) x +#else + #define IF_ENABLED_INSTANCE(x) +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + #define IF_ENABLED_GRIDS(x) x +#else + #define IF_ENABLED_GRIDS(x) +#endif 
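The `IF_ENABLED_*` guards defined in `config.h` above compile their argument only when the matching `EMBREE_GEOMETRY_*` toggle is defined; in this build only `EMBREE_GEOMETRY_TRIANGLE` is set, so everything wrapped in the other macros expands to nothing. A minimal hypothetical sketch of how such guards are typically used to keep disabled geometry code out of the build (the `IntersectorTable`, `registerIntersectors` and `intersect*` names are illustrative only, not Embree's):

```cpp
#include "config.h"   // assumed to be the feature toggles and IF_ENABLED_* macros shown above

// Hypothetical stand-ins; real Embree wires up full per-ISA intersector tables.
inline void intersectTriangle() { /* triangle path */ }
inline void intersectQuad()     { /* quad path */ }
inline void intersectUser()     { /* user-geometry path */ }

struct IntersectorTable {
  void (*triangle)() = nullptr;
  void (*quad)()     = nullptr;
  void (*user)()     = nullptr;
};

// With only EMBREE_GEOMETRY_TRIANGLE defined (as in the config.h above), the quad
// and user lines expand to empty statements, so those slots stay null and the
// corresponding code paths are never referenced.
inline IntersectorTable registerIntersectors()
{
  IntersectorTable table;
  IF_ENABLED_TRIS (table.triangle = intersectTriangle);
  IF_ENABLED_QUADS(table.quad     = intersectQuad);
  IF_ENABLED_USER (table.user     = intersectUser);
  return table;
}
```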
diff --git a/thirdparty/embree/kernels/geometry/cone.h b/thirdparty/embree/kernels/geometry/cone.h new file mode 100644 index 0000000000..17429bab32 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/cone.h @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct Cone + { + const Vec3fa p0; //!< start position of cone + const Vec3fa p1; //!< end position of cone + const float r0; //!< start radius of cone + const float r1; //!< end radius of cone + + __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) + : p0(p0), p1(p1), r0(r0), r1(r1) {} + + __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, + BBox1f& t_o, + float& u0_o, Vec3fa& Ng0_o, + float& u1_o, Vec3fa& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const Vec3fa v0 = p0-org; + const Vec3fa v1 = p1-org; + + const float rl = rcp_length(v1-v0); + const Vec3fa P0 = v0, dP = (v1-v0)*rl; + const float dr = (r1-r0)*rl; + const Vec3fa O = -P0, dO = dir; + + const float dOdO = dot(dO,dO); + const float OdO = dot(dO,O); + const float OO = dot(O,O); + const float dOz = dot(dP,dO); + const float Oz = dot(dP,O); + + const float R = r0 + Oz*dr; + const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr)); + const float B = 2.0f * (OdO - dOz*(Oz + R*dr)); + const float C = OO - (sqr(Oz) + sqr(R)); + + /* we miss the cone if determinant is smaller than zero */ + const float D = B*B - 4.0f*A*C; + if (D < 0.0f) return false; + + /* special case for rays that are "parallel" to the cone */ + const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + if (unlikely(abs(A) < eps)) + { + /* cylinder case */ + if (abs(dr) < 16.0f*float(ulp)) { + if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } + else { t_o = BBox1f(pos_inf,neg_inf); return false; } + } + + /* cone case */ + else + { + /* if we hit the negative cone there cannot be a hit */ + const float t = -C/B; + const float z0 = Oz+t*dOz; + const float z0r = r0+z0*dr; + if (z0r < 0.0f) return false; + + /* test if we start inside or outside the cone */ + if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf); + else t_o = BBox1f(neg_inf,t); + } + } + + /* standard case for "non-parallel" rays */ + else + { + const float Q = sqrt(D); + const float rcp_2A = rcp(2.0f*A); + t_o.lower = (-B-Q)*rcp_2A; + t_o.upper = (-B+Q)*rcp_2A; + + /* standard case where both hits are on same cone */ + if (likely(A > 0.0f)) { + const float z0 = Oz+t_o.lower*dOz; + const float z0r = r0+z0*dr; + if (z0r < 0.0f) return false; + } + + /* special case where the hits are on the positive and negative cone */ + else + { + /* depending on the ray direction and the open direction + * of the cone we have a hit from inside or outside the + * cone */ + if (dOz*dr > 0) t_o.upper = pos_inf; + else t_o.lower = neg_inf; + } + } + + /* calculates u and Ng for near hit */ + { + u0_o = (Oz+t_o.lower*dOz)*rl; + const Vec3fa Pr = t_o.lower*dir; + const Vec3fa Pl = v0 + u0_o*(v1-v0); + const Vec3fa R = normalize(Pr-Pl); + const Vec3fa U = (p1-p0)+(r1-r0)*R; + const Vec3fa V = cross(p1-p0,R); + Ng0_o = cross(V,U); + } + + /* calculates u and Ng for far hit */ + { + u1_o = (Oz+t_o.upper*dOz)*rl; + const Vec3fa Pr = t_o.upper*dir; + const Vec3fa Pl = v0 + u1_o*(v1-v0); + const Vec3fa R = normalize(Pr-Pl); + const Vec3fa U = (p1-p0)+(r1-r0)*R; + const Vec3fa V = cross(p1-p0,R); + Ng1_o = cross(V,U); + } + return true; + } + + 
__forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const + { + float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o; + return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + + static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1) + { + float eps = 0.001f; + BBox1f t; bool hit; + hit = cone.intersect(ray.org,ray.dir,t); + + bool failed = hit != shouldhit; + if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps; + if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps; + if (!failed) return true; + embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; + return false; + } + + /* verify cone class */ + static bool verify() + { + bool passed = true; + const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf); + passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); + passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f); + passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf); + passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); + passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f); + passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6); + passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f); + const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f); + passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f); + passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f); + const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + return passed; + } + + /*! 
output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) { + return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}"; + } + }; + + template<int N> + struct ConeN + { + typedef Vec3<vfloat<N>> Vec3vfN; + + const Vec3vfN p0; //!< start position of cone + const Vec3vfN p1; //!< end position of cone + const vfloat<N> r0; //!< start radius of cone + const vfloat<N> r1; //!< end radius of cone + + __forceinline ConeN(const Vec3vfN& p0, const vfloat<N>& r0, const Vec3vfN& p1, const vfloat<N>& r1) + : p0(p0), p1(p1), r0(r0), r1(r1) {} + + __forceinline Cone operator[] (const size_t i) const + { + assert(i<N); + return Cone(Vec3fa(p0.x[i],p0.y[i],p0.z[i]),r0[i],Vec3fa(p1.x[i],p1.y[i],p1.z[i]),r1[i]); + } + + __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, + BBox<vfloat<N>>& t_o, + vfloat<N>& u0_o, Vec3vfN& Ng0_o, + vfloat<N>& u1_o, Vec3vfN& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const Vec3vfN v0 = p0-Vec3vfN(org); + const Vec3vfN v1 = p1-Vec3vfN(org); + + const vfloat<N> rl = rcp_length(v1-v0); + const Vec3vfN P0 = v0, dP = (v1-v0)*rl; + const vfloat<N> dr = (r1-r0)*rl; + const Vec3vfN O = -P0, dO = dir; + + const vfloat<N> dOdO = dot(dO,dO); + const vfloat<N> OdO = dot(dO,O); + const vfloat<N> OO = dot(O,O); + const vfloat<N> dOz = dot(dP,dO); + const vfloat<N> Oz = dot(dP,O); + + const vfloat<N> R = r0 + Oz*dr; + const vfloat<N> A = dOdO - sqr(dOz) * (vfloat<N>(1.0f)+sqr(dr)); + const vfloat<N> B = 2.0f * (OdO - dOz*(Oz + R*dr)); + const vfloat<N> C = OO - (sqr(Oz) + sqr(R)); + + /* we miss the cone if determinant is smaller than zero */ + const vfloat<N> D = B*B - 4.0f*A*C; + vbool<N> valid = D >= 0.0f; + if (none(valid)) return valid; + + /* special case for rays that are "parallel" to the cone */ + const vfloat<N> eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + const vbool<N> validt = valid & (abs(A) < eps); + const vbool<N> validf = valid & !(abs(A) < eps); + if (unlikely(any(validt))) + { + const vboolx validtt = validt & (abs(dr) < 16.0f*float(ulp)); + const vboolx validtf = validt & (abs(dr) >= 16.0f*float(ulp)); + + /* cylinder case */ + if (unlikely(any(validtt))) + { + t_o.lower = select(validtt, select(C <= 0.0f, vfloat<N>(neg_inf), vfloat<N>(pos_inf)), t_o.lower); + t_o.upper = select(validtt, select(C <= 0.0f, vfloat<N>(pos_inf), vfloat<N>(neg_inf)), t_o.upper); + valid &= !validtt | C <= 0.0f; + } + + /* cone case */ + if (any(validtf)) + { + /* if we hit the negative cone there cannot be a hit */ + const vfloat<N> t = -C/B; + const vfloat<N> z0 = Oz+t*dOz; + const vfloat<N> z0r = r0+z0*dr; + valid &= !validtf | z0r >= 0.0f; + + /* test if we start inside or outside the cone */ + t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat<N>(neg_inf)), t_o.lower); + t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t), t_o.upper); + } + } + + /* standard case for "non-parallel" rays */ + if (likely(any(validf))) + { + const vfloat<N> Q = sqrt(D); + const vfloat<N> rcp_2A = 0.5f*rcp(A); + t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower); + t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper); + + /* standard case where both hits are on same cone */ + const vbool<N> validft = validf & A>0.0f; + const vbool<N> validff = validf & !(A>0.0f); + if (any(validft)) { + const vfloat<N> z0 = Oz+t_o.lower*dOz; + const vfloat<N> z0r = r0+z0*dr; + valid &= !validft | z0r >= 0.0f; + } + + /* special case where the 
hits are on the positive and negative cone */ + if (any(validff)) { + /* depending on the ray direction and the open direction + * of the cone we have a hit from inside or outside the + * cone */ + t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower); + t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper); + } + } + + /* calculates u and Ng for near hit */ + { + u0_o = (Oz+t_o.lower*dOz)*rl; + const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); + const Vec3vfN Pl = v0 + u0_o*(v1-v0); + const Vec3vfN R = normalize(Pr-Pl); + const Vec3vfN U = (p1-p0)+(r1-r0)*R; + const Vec3vfN V = cross(p1-p0,R); + Ng0_o = cross(V,U); + } + + /* calculates u and Ng for far hit */ + { + u1_o = (Oz+t_o.upper*dOz)*rl; + const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); + const Vec3vfN Pl = v0 + u1_o*(v1-v0); + const Vec3vfN R = normalize(Pr-Pl); + const Vec3vfN U = (p1-p0)+(r1-r0)*R; + const Vec3vfN V = cross(p1-p0,R); + Ng1_o = cross(V,U); + } + return valid; + } + + __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const + { + vfloat<N> u0_o; Vec3vfN Ng0_o; vfloat<N> u1_o; Vec3vfN Ng1_o; + return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + }; + } +} + diff --git a/thirdparty/embree/kernels/geometry/coneline_intersector.h b/thirdparty/embree/kernels/geometry/coneline_intersector.h new file mode 100644 index 0000000000..90f3792eff --- /dev/null +++ b/thirdparty/embree/kernels/geometry/coneline_intersector.h @@ -0,0 +1,209 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + namespace __coneline_internal + { + template<int M, typename Epilog, typename ray_tfar_func> + static __forceinline bool intersectCone(const vbool<M>& valid_i, + const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, + const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar, + const Vec4vf<M>& v0, const Vec4vf<M>& v1, + const vbool<M>& cL, const vbool<M>& cR, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + /* move ray origin closer to make calculations numerically stable */ + const vfloat<M> dOdO = sqr(ray_dir); + const vfloat<M> rcp_dOdO = rcp(dOdO); + const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz()); + const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; + const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir; + + const Vec3vf<M> dP = v1.xyz() - v0.xyz(); + const Vec3vf<M> p0 = ray_org - v0.xyz(); + const Vec3vf<M> p1 = ray_org - v1.xyz(); + + const vfloat<M> dPdP = sqr(dP); + const vfloat<M> dP0 = dot(p0,dP); + const vfloat<M> dP1 = dot(p1,dP); + const vfloat<M> dOdP = dot(ray_dir,dP); + + // intersect cone body + const vfloat<M> dr = v0.w - v1.w; + const vfloat<M> hy = dPdP + sqr(dr); + const vfloat<M> dO0 = dot(ray_dir,p0); + const vfloat<M> OO = sqr(p0); + const vfloat<M> dPdP2 = sqr(dPdP); + const vfloat<M> dPdPr0 = dPdP*v0.w; + + const vfloat<M> A = dPdP2 - sqr(dOdP)*hy; + const vfloat<M> B = dPdP2*dO0 - dP0*dOdP*hy + dPdPr0*(dr*dOdP); + const vfloat<M> C = dPdP2*OO - sqr(dP0)*hy + dPdPr0*(2.0f*dr*dP0 - dPdPr0); + + const vfloat<M> D = B*B - A*C; + valid &= D >= 0.0f; + if (unlikely(none(valid))) { + return false; + } + + /* standard case for "non-parallel" rays */ + const vfloat<M> Q = sqrt(D); + const vfloat<M> rcp_A = rcp(A); + /* special case for rays that are "parallel" to the cone - assume miss */ + const vbool<M> isParallel 
= abs(A) <= min_rcp_input; + + vfloat<M> t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A); + vfloat<M> t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A); + const vfloat<M> y_lower = dP0 + t_cone_lower*dOdP; + const vfloat<M> y_upper = dP0 + t_cone_upper*dOdP; + t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf); + t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf); + + const vbool<M> hitDisk0 = valid & cL; + const vbool<M> hitDisk1 = valid & cR; + const vfloat<M> rcp_dOdP = rcp(dOdP); + const vfloat<M> t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf); + const vfloat<M> t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf); + const vfloat<M> t_disk_lower = min(t_disk0, t_disk1); + const vfloat<M> t_disk_upper = max(t_disk0, t_disk1); + + const vfloat<M> t_lower = min(t_cone_lower, t_disk_lower); + const vfloat<M> t_upper = max(t_cone_upper, select(t_lower==t_disk_lower, + select(t_disk_upper==vfloat<M>(pos_inf),neg_inf,t_disk_upper), + select(t_disk_lower==vfloat<M>(pos_inf),neg_inf,t_disk_lower))); + + const vbool<M> valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat<M>(pos_inf); + const vbool<M> valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat<M>(neg_inf); + + const vbool<M> valid_first = valid_lower | valid_upper; + if (unlikely(none(valid_first))) + return false; + + const vfloat<M> t_first = select(valid_lower, t_lower, t_upper); + const vfloat<M> y_first = select(valid_lower, y_lower, y_upper); + + const vfloat<M> rcp_dPdP = rcp(dPdP); + const Vec3vf<M> dP2drr0dP = dPdP*dr*v0.w*dP; + const Vec3vf<M> dPhy = dP*hy; + const vbool<M> cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper); + const vbool<M> disk0_hit_first = valid & (t_first == t_disk0); + const Vec3vf<M> Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP)); + const vfloat<M> u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first); + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_upper; + const vfloat<M> y_second = y_upper; + const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar()); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; + const vbool<M> disk0_hit_second = t_second == t_disk0; + const Vec3vf<M> Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP)); + const vfloat<M> u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + } + + template<int M> + struct ConeLineIntersectorHitM + { + __forceinline ConeLineIntersectorHitM() {} + + __forceinline ConeLineIntersectorHitM(const 
vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct ConeCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + struct ray_tfar { + Ray& ray; + __forceinline ray_tfar(Ray& ray) : ray(ray) {} + __forceinline vfloat<M> operator() () const { return ray.tfar; }; + }; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const vbool<M>& cL, const vbool<M>& cR, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat<M> ray_tnear(ray.tnear()); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + return __coneline_internal::intersectCone<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,cL,cR,epilog); + } + }; + + template<int M, int K> + struct ConeCurveIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + struct ray_tfar { + RayK<K>& ray; + size_t k; + __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {} + __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }; + }; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const vbool<M>& cL, const vbool<M>& cR, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> ray_tnear = ray.tnear()[k]; + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + return __coneline_internal::intersectCone<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,cL,cR,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/conelinei_intersector.h b/thirdparty/embree/kernels/geometry/conelinei_intersector.h new file mode 100644 index 0000000000..6a985ebcad --- /dev/null +++ b/thirdparty/embree/kernels/geometry/conelinei_intersector.h @@ -0,0 +1,141 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "coneline_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct ConeCurveMiIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const 
vbool<M> valid = line.valid(); + ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + return ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, bool filter> + struct ConeCurveMiMBIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()); + const vbool<M> valid = line.valid(); + ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()); + const vbool<M> valid = line.valid(); + return ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, int K, bool filter> + struct ConeCurveMiIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + return ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + + 
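+      /* Motion-blur counterpart of ConeCurveMiIntersectorK below: segment vertices and cap flags are gathered at ray.time()[k] before running the same cone intersection. */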
template<int M, int K, bool filter> + struct ConeCurveMiMBIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + return ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curveNi.h b/thirdparty/embree/kernels/geometry/curveNi.h new file mode 100644 index 0000000000..6366a6fb9c --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi.h @@ -0,0 +1,222 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + template<int M> + struct CurveNi + { + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue"); + return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNi () {} + + /*! 
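+        encode up to M curves of one geometry as quantized 8-bit frames with 16-bit bounds;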
fill curve from curve list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) + { + size_t end = min(begin+M,_end); + N = (unsigned char)(end-begin); + const unsigned int geomID0 = prims[begin].geomID(); + this->geomID(N) = geomID0; + ty = (unsigned char) scene->get(geomID0)->getType(); + + /* encode all primitives */ + BBox3fa bounds = empty; + for (size_t i=0; i<N; i++) + { + const PrimRef& prim = prims[begin+i]; + const unsigned int geomID = prim.geomID(); assert(geomID == geomID0); + const unsigned int primID = prim.primID(); + bounds.extend(scene->get(geomID)->vbounds(primID)); + } + + /* calculate offset and scale */ + Vec3fa loffset = bounds.lower; + float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); + if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; + *this->offset(N) = loffset; + *this->scale(N) = lscale; + + /* encode all primitives */ + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpace(primID); + + const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); + const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID); + + bounds_vx_x(N)[i] = (char) space3.vx.x; + bounds_vx_y(N)[i] = (char) space3.vx.y; + bounds_vx_z(N)[i] = (char) space3.vx.z; + bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f); + bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f); + + bounds_vy_x(N)[i] = (char) space3.vy.x; + bounds_vy_y(N)[i] = (char) space3.vy.y; + bounds_vy_z(N)[i] = (char) space3.vy.z; + bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f); + bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f); + + bounds_vz_x(N)[i] = (char) space3.vz.x; + bounds_vz_y(N)[i] = (char) space3.vz.y; + bounds_vz_z(N)[i] = (char) space3.vz.z; + bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f); + bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f); + + this->primID(N)[i] = primID; + } + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = CurveNi::blocks(set.size()); + size_t numbytes = CurveNi::bytes(set.size()); + CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + public: + + // 27.6 - 46 bytes per primitive + unsigned char ty; + unsigned char N; + unsigned char data[4+25*M+16]; + + /* + struct Layout + { + unsigned int 
geomID; + unsigned int primID[N]; + + char bounds_vx_x[N]; + char bounds_vx_y[N]; + char bounds_vx_z[N]; + short bounds_vx_lower[N]; + short bounds_vx_upper[N]; + + char bounds_vy_x[N]; + char bounds_vy_y[N]; + char bounds_vy_z[N]; + short bounds_vy_lower[N]; + short bounds_vy_upper[N]; + + char bounds_vz_x[N]; + char bounds_vz_y[N]; + char bounds_vz_z[N]; + short bounds_vz_lower[N]; + short bounds_vz_upper[N]; + + Vec3f offset; + float scale; + }; + */ + + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } + + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } + __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } + + __forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } + __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } + + __forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } + __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } + + __forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } + __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } + + __forceinline short* bounds_vx_lower(size_t N) { return (short*)((char*)this+6+7*N); } + __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((char*)this+6+7*N); } + + __forceinline short* bounds_vx_upper(size_t N) { return (short*)((char*)this+6+9*N); } + __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((char*)this+6+9*N); } + + __forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+11*N); } + __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+11*N); } + + __forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+12*N); } + __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+12*N); } + + __forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+13*N); } + __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+13*N); } + + __forceinline short* bounds_vy_lower(size_t N) { return (short*)((char*)this+6+14*N); } + __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((char*)this+6+14*N); } + + __forceinline short* bounds_vy_upper(size_t N) { return (short*)((char*)this+6+16*N); } + __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((char*)this+6+16*N); } + + __forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+18*N); } + __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+18*N); } + + __forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+19*N); } + __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+19*N); } + + __forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+20*N); } + __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+20*N); } + + __forceinline short* bounds_vz_lower(size_t N) { return (short*)((char*)this+6+21*N); } + __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((char*)this+6+21*N); } + + __forceinline short* bounds_vz_upper(size_t N) { return (short*)((char*)this+6+23*N); } + __forceinline 
const short* bounds_vz_upper(size_t N) const { return (short*)((char*)this+6+23*N); } + + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+25*N); } + __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+25*N); } + + __forceinline float* scale(size_t N) { return (float*)((char*)this+6+25*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+25*N+12); } + + __forceinline char* end(size_t N) { return (char*)this+6+25*N+16; } + __forceinline const char* end(size_t N) const { return (char*)this+6+25*N+16; } + }; + + template<int M> + typename CurveNi<M>::Type CurveNi<M>::type; + + typedef CurveNi<4> Curve4i; + typedef CurveNi<8> Curve8i; +} diff --git a/thirdparty/embree/kernels/geometry/curveNi_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_intersector.h new file mode 100644 index 0000000000..c0b66515c1 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi_intersector.h @@ -0,0 +1,569 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct CurveNiIntersector1 + { + typedef CurveNi<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa org1 = (ray.org-offset)*scale; + const Vec3fa dir1 = ray.dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear())); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar)); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(const 
Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* 
context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)); + mask &= 
movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + }; + + template<int M, int K> + struct CurveNiIntersectorK + { + typedef CurveNi<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + const Vec3fa org1 = (ray_org-offset)*scale; + const Vec3fa dir1 = ray_dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k])); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k])); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, 
IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, 
const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + 
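+          /* pass the gathered Hermite control points and normals to the oriented-curve intersector; the epilog applies intersection filters and records the hit */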
Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb.h b/thirdparty/embree/kernels/geometry/curveNi_mb.h new file mode 100644 index 0000000000..5d972b43a0 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi_mb.h @@ -0,0 +1,278 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + template<int M> + struct CurveNiMB + { + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue"); + return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNiMB () {} + + /*! 
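+        encode up to M motion-blurred curves with per-time-step 16-bit bounds and return their linear bounds;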
fill curve from curve list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range) + { + size_t end = min(begin+M,_end); + N = (unsigned char)(end-begin); + const unsigned int geomID0 = prims[begin].geomID(); + this->geomID(N) = geomID0; + ty = (unsigned char) scene->get(geomID0)->getType(); + + /* encode all primitives */ + LBBox3fa lbounds = empty; + for (size_t i=0; i<N; i++) + { + const PrimRefMB& prim = prims[begin+i]; + const unsigned int geomID = prim.geomID(); assert(geomID == geomID0); + const unsigned int primID = prim.primID(); + lbounds.extend(scene->get(geomID)->vlinearBounds(primID,time_range)); + } + BBox3fa bounds = lbounds.bounds(); + + /* calculate offset and scale */ + Vec3fa loffset = bounds.lower; + float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); + if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; + *this->offset(N) = loffset; + *this->scale(N) = lscale; + this->time_offset(N) = time_range.lower; + this->time_scale(N) = 1.0f/time_range.size(); + + /* encode all primitives */ + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRefMB& prim = prims[begin]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpaceMB(primID,time_range); + + const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); + const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); + + // NOTE: this weird (char) (short) cast works around VS2015 Win32 compiler bug + bounds_vx_x(N)[i] = (char) (short) space3.vx.x; + bounds_vx_y(N)[i] = (char) (short) space3.vx.y; + bounds_vx_z(N)[i] = (char) (short) space3.vx.z; + bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); + bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f); + bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f); + bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f); + + bounds_vy_x(N)[i] = (char) (short) space3.vy.x; + bounds_vy_y(N)[i] = (char) (short) space3.vy.y; + bounds_vy_z(N)[i] = (char) (short) space3.vy.z; + bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f); + bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f); + bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f); + bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) 
<= 32767.0f); + + bounds_vz_x(N)[i] = (char) (short) space3.vz.x; + bounds_vz_y(N)[i] = (char) (short) space3.vz.y; + bounds_vz_z(N)[i] = (char) (short) space3.vz.z; + bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f); + bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f); + bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f); + bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f); + + this->primID(N)[i] = primID; + } + + return lbounds; + } + + template<typename BVH, typename SetMB, typename Allocator> + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.begin(); + size_t end = prims.end(); + size_t items = CurveNiMB::blocks(prims.size()); + size_t numbytes = CurveNiMB::bytes(prims.size()); + CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range)); + + return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); + }; + + + public: + + // 27.6 - 46 bytes per primitive + unsigned char ty; + unsigned char N; + unsigned char data[4+37*M+24]; + + /* + struct Layout + { + unsigned int geomID; + unsigned int primID[N]; + + char bounds_vx_x[N]; + char bounds_vx_y[N]; + char bounds_vx_z[N]; + short bounds_vx_lower0[N]; + short bounds_vx_upper0[N]; + short bounds_vx_lower1[N]; + short bounds_vx_upper1[N]; + + char bounds_vy_x[N]; + char bounds_vy_y[N]; + char bounds_vy_z[N]; + short bounds_vy_lower0[N]; + short bounds_vy_upper0[N]; + short bounds_vy_lower1[N]; + short bounds_vy_upper1[N]; + + char bounds_vz_x[N]; + char bounds_vz_y[N]; + char bounds_vz_z[N]; + short bounds_vz_lower0[N]; + short bounds_vz_upper0[N]; + short bounds_vz_lower1[N]; + short bounds_vz_upper1[N]; + + Vec3f offset; + float scale; + + float time_offset; + float time_scale; + }; + */ + + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } + + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } + __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } + + __forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } + __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } + + __forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } + __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } + + __forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } + __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } + + __forceinline short* bounds_vx_lower0(size_t N) { 
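+        /* short arrays take 2*N bytes, so the offsets below advance in steps of 2*N (7*N, 9*N, 11*N, 13*N for the four vx bounds) */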
return (short*)((char*)this+6+7*N); } + __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((char*)this+6+7*N); } + + __forceinline short* bounds_vx_upper0(size_t N) { return (short*)((char*)this+6+9*N); } + __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((char*)this+6+9*N); } + + __forceinline short* bounds_vx_lower1(size_t N) { return (short*)((char*)this+6+11*N); } + __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((char*)this+6+11*N); } + + __forceinline short* bounds_vx_upper1(size_t N) { return (short*)((char*)this+6+13*N); } + __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((char*)this+6+13*N); } + + __forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+15*N); } + __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+15*N); } + + __forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+16*N); } + __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+16*N); } + + __forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+17*N); } + __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+17*N); } + + __forceinline short* bounds_vy_lower0(size_t N) { return (short*)((char*)this+6+18*N); } + __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((char*)this+6+18*N); } + + __forceinline short* bounds_vy_upper0(size_t N) { return (short*)((char*)this+6+20*N); } + __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((char*)this+6+20*N); } + + __forceinline short* bounds_vy_lower1(size_t N) { return (short*)((char*)this+6+22*N); } + __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((char*)this+6+22*N); } + + __forceinline short* bounds_vy_upper1(size_t N) { return (short*)((char*)this+6+24*N); } + __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((char*)this+6+24*N); } + + __forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+26*N); } + __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+26*N); } + + __forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+27*N); } + __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+27*N); } + + __forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+28*N); } + __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+28*N); } + + __forceinline short* bounds_vz_lower0(size_t N) { return (short*)((char*)this+6+29*N); } + __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((char*)this+6+29*N); } + + __forceinline short* bounds_vz_upper0(size_t N) { return (short*)((char*)this+6+31*N); } + __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((char*)this+6+31*N); } + + __forceinline short* bounds_vz_lower1(size_t N) { return (short*)((char*)this+6+33*N); } + __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((char*)this+6+33*N); } + + __forceinline short* bounds_vz_upper1(size_t N) { return (short*)((char*)this+6+35*N); } + __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((char*)this+6+35*N); } + + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+37*N); } + __forceinline const Vec3f* offset(size_t N) const { return 
(Vec3f*)((char*)this+6+37*N); } + + __forceinline float* scale(size_t N) { return (float*)((char*)this+6+37*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+37*N+12); } + + __forceinline float& time_offset(size_t N) { return *(float*)((char*)this+6+37*N+16); } + __forceinline const float& time_offset(size_t N) const { return *(float*)((char*)this+6+37*N+16); } + + __forceinline float& time_scale(size_t N) { return *(float*)((char*)this+6+37*N+20); } + __forceinline const float& time_scale(size_t N) const { return *(float*)((char*)this+6+37*N+20); } + + __forceinline char* end(size_t N) { return (char*)this+6+37*N+24; } + __forceinline const char* end(size_t N) const { return (char*)this+6+37*N+24; } + }; + + template<int M> + typename CurveNiMB<M>::Type CurveNiMB<M>::type; + + typedef CurveNiMB<4> Curve4iMB; + typedef CurveNiMB<8> Curve8iMB; +} diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h new file mode 100644 index 0000000000..bab796b33b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h @@ -0,0 +1,516 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi_mb.h" +#include "../subdiv/linear_bezier_patch.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct CurveNiMBIntersector1 + { + typedef CurveNiMB<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa org1 = (ray.org-offset)*scale; + const Vec3fa dir1 = ray.dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N); + const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N)); + const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N)); + const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); + const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N)); + const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N)); + const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); + + const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N)); + const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N)); + const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); + const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N)); + const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N)); + const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); + + const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N)); 
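+        /* the z bounds are blended at ltime exactly like x and y above */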
+ const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N)); + const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); + const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N)); + const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N)); + const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); + + const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear())); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar)); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + 
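+          /* build the normal-oriented curve surface at ray.time() and intersect it; the epilog handles filtering and the hit update */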
const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + + if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = 
context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + }; + + template<int M, int K> + struct CurveNiMBIntersectorK + { + typedef CurveNiMB<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + const Vec3fa org1 = (ray_org-offset)*scale; + const Vec3fa dir1 = ray_dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N); + const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N)); + const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N)); + const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); + const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N)); + const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N)); + const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); + + const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N)); + const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N)); + const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); + const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N)); + const vfloat<M> 
vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N)); + const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); + + const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N)); + const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N)); + const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); + const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N)); + const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N)); + const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); + + const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k])); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k])); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t 
mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + + if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename 
Intersector, typename Epilog> + static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curveNv.h b/thirdparty/embree/kernels/geometry/curveNv.h new file mode 100644 index 0000000000..e41a381706 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNv.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi.h" + +namespace embree +{ + template<int M> + struct CurveNv : public CurveNi<M> + { + using CurveNi<M>::N; + + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue"); + return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNv () {} + + /*! 
fill curve from curve list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) + { + size_t end = min(begin+M,_end); + size_t N = end-begin; + + /* encode all primitives */ + for (size_t i=0; i<N; i++) + { + const PrimRef& prim = prims[begin+i]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + CurveGeometry* mesh = (CurveGeometry*) scene->get(geomID); + const unsigned vtxID = mesh->curve(primID); + Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0)); + Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1)); + Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2)); + Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3)); + } + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) + { + if (set.size() == 0) + return BVH::emptyNode; + + /* fall back to CurveNi for oriented curves */ + unsigned int geomID = prims[set.begin()].geomID(); + if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) { + return CurveNi<M>::createLeaf(bvh,prims,set,alloc); + } + if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) { + return CurveNi<M>::createLeaf(bvh,prims,set,alloc); + } + + size_t start = set.begin(); + size_t items = CurveNv::blocks(set.size()); + size_t numbytes = CurveNv::bytes(set.size()); + CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment); + for (size_t i=0; i<items; i++) { + accel[i].CurveNv<M>::fill(prims,start,set.end(),bvh->scene); + accel[i].CurveNi<M>::fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + public: + unsigned char data[4*16*M]; + __forceinline Vec3fa* vertices(size_t i, size_t N) { return (Vec3fa*)CurveNi<M>::end(N)+4*i; } + __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi<M>::end(N)+4*i; } + }; + + template<int M> + typename CurveNv<M>::Type CurveNv<M>::type; + + typedef CurveNv<4> Curve4v; + typedef CurveNv<8> Curve8v; +} diff --git a/thirdparty/embree/kernels/geometry/curveNv_intersector.h b/thirdparty/embree/kernels/geometry/curveNv_intersector.h new file mode 100644 index 0000000000..2742725aec --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNv_intersector.h @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNv.h" +#include "curveNi_intersector.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct CurveNvIntersector1 : public CurveNiIntersector1<M> + { + typedef CurveNv<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = 
Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + }; + + template<int M, int K> + struct CurveNvIntersectorK : public CurveNiIntersectorK<M,K> + { + typedef CurveNv<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + 
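Two idioms recur in the `CurveNv` intersector loops above. First, active lanes are walked with `movemask`/`bscf` (pop the lowest set bit, intersect that segment, then drop lanes whose entry distance now exceeds the possibly shortened `ray.tfar`). Second, because the control points are stored verbatim in the leaf, the loop already prefetches the vertices of the next one or two candidate lanes while the current one is being intersected. A hedged sketch of that prefetch-ahead pattern, using GCC/Clang builtins as stand-ins for Embree's `bscf`/`prefetchL1`/`prefetchL2` helpers (an assumption, not the library API):

```
#include <cstdint>

// Stand-ins for Embree's helpers (assumes GCC/Clang builtins are available).
static inline void prefetch_l1(const void* p) { __builtin_prefetch(p, 0 /*read*/, 3); }
static inline void prefetch_l2(const void* p) { __builtin_prefetch(p, 0 /*read*/, 2); }

// bscf() analogue: return the index of the lowest set bit and clear it.
static inline int pop_lowest(uint32_t& mask) {
  const int i = __builtin_ctz(mask);
  mask &= mask - 1;
  return i;
}

struct Candidate { float data[64]; };   // invented per-curve payload

template<typename IntersectFn>
void intersect_candidates(uint32_t mask, const Candidate* c, IntersectFn&& intersect)
{
  while (mask) {
    const int i = pop_lowest(mask);

    // Peek ahead without consuming: warm the caches for the next candidates.
    uint32_t peek = mask;
    if (peek) {
      prefetch_l1(&c[pop_lowest(peek)]);
      if (peek) prefetch_l2(&c[__builtin_ctz(peek)]);
    }

    intersect(c[i]);   // work on the current candidate while the prefetches land
  }
}
```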
vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector.h b/thirdparty/embree/kernels/geometry/curve_intersector.h new file mode 100644 index 0000000000..1e8ac26125 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../subdiv/bezier_curve.h" +#include "../common/primref.h" +#include "bezier_hair_intersector.h" +#include "bezier_ribbon_intersector.h" +#include "bezier_curve_intersector.h" +#include "oriented_curve_intersector.h" +#include "../bvh/node_intersector1.h" + +// FIXME: this file seems replicate of curve_intersector_virtual.h + +namespace embree +{ + namespace isa + { + struct VirtualCurveIntersector1 + { + typedef unsigned char Primitive; + typedef CurvePrecalculations1 Precalculations; + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<1>(&pre,&ray,context,prim); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<1>(&pre,&ray,context,prim); + } + }; + + template<int K> + struct VirtualCurveIntersectorK + { + typedef unsigned char Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + 
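`curve_intersector.h` above (which, as its own FIXME notes, largely duplicates `curve_intersector_virtual.h`) dispatches on a type tag stored in the leaf itself: the first byte of the primitive is reinterpreted as an `RTCGeometryType` and used to index a per-type table of intersect/occluded callbacks, avoiding C++ virtual calls in the traversal hot path. A small self-contained sketch of that tagged-leaf dispatch with invented names (the real table lives in `VirtualCurvePrimitive::Intersectors::vtbl`):

```
#include <array>
#include <cstdint>

// Invented illustration types; only the dispatch shape mirrors the code above.
struct Ray;
struct Context;

using IntersectFn = void (*)(Ray&, Context*, const uint8_t* prim);
using OccludedFn  = bool (*)(Ray&, Context*, const uint8_t* prim);

struct LeafVTable { IntersectFn intersect; OccludedFn occluded; };

// One entry per geometry-type tag; populated when the acceleration
// structure is built in the real code (table size here is arbitrary).
static std::array<LeafVTable, 64> g_vtbl;

inline void intersect_leaf(Ray& ray, Context* ctx, const uint8_t* prim)
{
  const uint8_t type = *prim;              // first byte of the leaf is the type tag
  g_vtbl[type].intersect(ray, ctx, prim);  // table lookup instead of a virtual call
}

inline bool occluded_leaf(Ray& ray, Context* ctx, const uint8_t* prim)
{
  const uint8_t type = *prim;
  return g_vtbl[type].occluded(ray, ctx, prim);
}
```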
assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + size_t mask = movemask(valid_i); + while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim); + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + vbool<K> valid_o = false; + size_t mask = movemask(valid_i); + while (mask) { + size_t k = bscf(mask); + if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim)) + set(valid_o, k); + } + return valid_o; + } + + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<K>(&pre,&ray,k,context,prim); + } + + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<K>(&pre,&ray,k,context,prim); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h new file mode 100644 index 0000000000..748a9511a5 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h @@ -0,0 +1,129 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<typename NativeCurve3fa, int M> + struct DistanceCurveHit + { + __forceinline DistanceCurveHit() {} + + __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N, + const NativeCurve3fa& curve3D) + : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} + + __forceinline void finalize() + { + vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N)); + vv = V; + vt = T; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { + return curve3D.eval_du(vu[i]); + } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + int i, N; + NativeCurve3fa curve3D; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + }; + + template<typename NativeCurve3fa> + struct DistanceCurve1Intersector1 + { + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, 
const unsigned int primID, + const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + + /* transform control points into ray space */ + const NativeCurve3fa curve3Di(v0,v1,v2,v3); + const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di); + const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org); + + /* evaluate the bezier curve */ + vboolx valid = vfloatx(step) < vfloatx(float(N)); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N); + + /* approximative intersection with cone */ + const Vec4vfx v = p1-p0; + const Vec4vfx w = -p0; + const vfloatx d0 = madd(w.x,v.x,w.y*v.y); + const vfloatx d1 = madd(v.x,v.x,v.y*v.y); + const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); + const Vec4vfx p = madd(u,v,p0); + const vfloatx t = p.z*pre.depth_scale; + const vfloatx d2 = madd(p.x,p.x,p.y*p.y); + const vfloatx r = p.w; + const vfloatx r2 = r*r; + valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections + + /* update hit information */ + bool ishit = false; + if (unlikely(any(valid))) { + DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D); + ishit = ishit | epilog(valid,hit); + } + + if (unlikely(VSIZEX < N)) + { + /* process SIMD-size many segments per iteration */ + for (int i=VSIZEX; i<N; i+=VSIZEX) + { + /* evaluate the bezier curve */ + vboolx valid = vintx(i)+vintx(step) < vintx(N); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N); + + /* approximative intersection with cone */ + const Vec4vfx v = p1-p0; + const Vec4vfx w = -p0; + const vfloatx d0 = madd(w.x,v.x,w.y*v.y); + const vfloatx d1 = madd(v.x,v.x,v.y*v.y); + const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); + const Vec4vfx p = madd(u,v,p0); + const vfloatx t = p.z*pre.depth_scale; + const vfloatx d2 = madd(p.x,p.x,p.y*p.y); + const vfloatx r = p.w; + const vfloatx r2 = r*r; + valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections + + /* update hit information */ + if (unlikely(any(valid))) { + DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D); + ishit = ishit | epilog(valid,hit); + } + } + } + return ishit; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h new file mode 100644 index 0000000000..3d8900c2aa --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h @@ -0,0 +1,417 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" +#include "curve_intersector_sweep.h" +#include "../subdiv/linear_bezier_patch.h" + +#define DBG(x) + +namespace embree +{ + namespace isa + { + template<typename Ray, typename Epilog> + struct TensorLinearCubicBezierSurfaceIntersector + { + const LinearSpace3fa& ray_space; + Ray& ray; + TensorLinearCubicBezierSurface3fa curve3d; + 
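`DistanceCurve1Intersector1` in `curve_intersector_distance.h` above avoids a true swept-sphere intersection: with the ray moved to the 2D origin of ray space (direction along +z), each of the N tessellated segments is treated as a capsule, the closest point on the segment to the origin is found in closed form, and the hit is accepted if that point lies within the interpolated radius and the depth lies inside [tnear, tfar]. A scalar sketch of the per-segment test, with an invented `Vec4` holding (x, y, depth, radius):

```
#include <algorithm>

// (x,y) = position in ray space, z = depth along the ray, w = curve radius.
struct Vec4 { float x, y, z, w; };

struct SegmentHit { bool hit; float t; float u; };

// p0/p1: segment endpoints already transformed into ray space. depth_scale
// compensates for a non-normalized ray direction, as in CurvePrecalculations.
SegmentHit intersect_capsule_segment(const Vec4& p0, const Vec4& p1,
                                     float tnear, float tfar, float depth_scale)
{
  const float vx = p1.x - p0.x, vy = p1.y - p0.y;
  const float wx = -p0.x,       wy = -p0.y;          // origin minus p0

  const float d0 = wx*vx + wy*vy;                    // dot(w, v)
  const float d1 = vx*vx + vy*vy;                    // dot(v, v)
  const float u  = std::clamp(d1 > 0.0f ? d0/d1 : 0.0f, 0.0f, 1.0f);

  const float px = p0.x + u*(p1.x - p0.x);           // closest point on the segment
  const float py = p0.y + u*(p1.y - p0.y);
  const float r  = p0.w + u*(p1.w - p0.w);           // interpolated radius
  const float t  = (p0.z + u*(p1.z - p0.z)) * depth_scale;

  const bool hit = (px*px + py*py <= r*r) && (tnear <= t) && (t <= tfar);
  return { hit, t, u };
}
```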
TensorLinearCubicBezierSurface2fa curve2d; + float eps; + const Epilog& epilog; + bool isHit; + + __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog) + : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false) + { + const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org); + curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R)); + const BBox2fa b2 = curve2d.bounds(); + eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper))); + } + + __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1) + { + if (p1 == p0) { + if (p0 == 0.0f) return Interval1f(u0,u1); + else return Interval1f(empty); + } + const float t = -p0/(p1-p0); + const float tt = lerp(u0,u1,t); + return Interval1f(tt); + } + + __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u) + { + if (sign(p0.lower) != sign(p0.upper)) u.extend(u0); + if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower)); + if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper)); + if (sign(p1.lower) != sign(p1.upper)) u.extend(u1); + } + + __forceinline Interval1f bezier_clipping(const CubicBezierCurve<Interval1f>& curve) + { + Interval1f u = empty; + solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u); + solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u); + solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u); + solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u); + solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u); + solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u); + return intersect(u,Interval1f(0.0f,1.0f)); + } + + __forceinline Interval1f bezier_clipping(const LinearBezierCurve<Interval1f>& curve) + { + Interval1f v = empty; + solve_linear(0.0f,1.0f,curve.v0,curve.v1,v); + return intersect(v,Interval1f(0.0f,1.0f)); + } + + __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2) + { + BBox2fa bounds = curve2.bounds(); + if (bounds.upper.x < 0.0f) return; + if (bounds.upper.y < 0.0f) return; + if (bounds.lower.x > 0.0f) return; + if (bounds.lower.y > 0.0f) return; + + if (max(cu.size(),cv.size()) < 1E-4f) + { + const float u = cu.center(); + const float v = cv.center(); + TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); + const float t = curve_z.eval(u,v); + if (ray.tnear() <= t && t <= ray.tfar) { + const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); + BezierCurveHit hit(t,u,v,Ng); + isHit |= epilog(hit); + } + return; + } + + const Vec2fa dv = curve2.axis_v(); + const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); + LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u(); + if (!curve0v.hasRoot()) return; + + const Interval1f v = bezier_clipping(curve0v); + if (isEmpty(v)) return; + TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + + const Vec2fa du = curve2.axis_u(); + const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du); + CubicBezierCurve<Interval1f> curve0u = curve1u.reduce_v(); + int roots = curve0u.maxRoots(); + if (roots == 0) return; + + if (roots == 1) + { + const Interval1f u = 
bezier_clipping(curve0u); + if (isEmpty(u)) return; + TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u); + cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper)); + solve_bezier_clipping(cu,cv,curve2b); + return; + } + + TensorLinearCubicBezierSurface2fa curve2l, curve2r; + curve2a.split_u(curve2l,curve2r); + solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l); + solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r); + } + + __forceinline bool solve_bezier_clipping() + { + solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d); + return isHit; + } + + __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv) + { + Vec2fa uv(cu.center(),cv.center()); + const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y); + const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y); + const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); + solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J); + } + + __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J) + { + Vec2fa uv = uv_in; + + for (size_t i=0; i<200; i++) + { + const Vec2fa f = curve2d.eval(uv.x,uv.y); + const Vec2fa duv = rcp_J*f; + uv -= duv; + + if (max(abs(f.x),abs(f.y)) < eps) + { + const float u = uv.x; + const float v = uv.y; + if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs + if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs + const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); + const float t = curve_z.eval(u,v); + if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs + const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); + BezierCurveHit hit(t,u,v,Ng); + isHit |= epilog(hit); + return; + } + } + } + + __forceinline bool clip_v(BBox1f& cu, BBox1f& cv) + { + const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower); + const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv); + LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u(); + if (!curve0v.hasRoot()) return false; + Interval1f v = bezier_clipping(curve0v); + if (isEmpty(v)) return false; + v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + return true; + } + + __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv) + { + /* perform bezier clipping in v-direction to get tight v-bounds */ + TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv); + const Vec2fa dv = curve2.axis_v(); + const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); + LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u(); + if (unlikely(!curve0v.hasRoot())) return true; + Interval1f v = bezier_clipping(curve0v); + if (unlikely(isEmpty(v))) return true; + v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); + curve2 = curve2.clip_v(v); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + + /* perform one newton raphson iteration */ + Vec2fa c(cu.center(),cv.center()); + Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv); + const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); + const Vec2fa c1 = c - rcp_J*f; + + /* calculate bounds of derivatives */ + const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds(); + const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds(); + + /* calculate krawczyk test */ + LinearSpace2<Vec2<Interval1f>> 
I(Interval1f(1.0f), Interval1f(0.0f), + Interval1f(0.0f), Interval1f(1.0f)); + + LinearSpace2<Vec2<Interval1f>> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x), + Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y)); + + const LinearSpace2<Vec2f> rcp_J2(rcp_J); + const LinearSpace2<Vec2<Interval1f>> rcp_Ji(rcp_J2); + + const Vec2<Interval1f> x(cu,cv); + const Vec2<Interval1f> K = Vec2<Interval1f>(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2<Interval1f>(Vec2f(c))); + + /* test if there is no solution */ + const Vec2<Interval1f> KK = intersect(K,x); + if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true; + + /* exit if convergence cannot get proven, but terminate if we are very small */ + if (unlikely(!subset(K,x) && !very_small)) return false; + + /* solve using newton raphson iteration of convergence is guarenteed */ + solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); + return true; + } + + __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv) + { + if (!clip_v(cu,cv)) return; + return solve_newton_raphson(cu,cv); + } + + __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv) + { + unsigned int sptr = 0; + const unsigned int stack_size = 4; + unsigned int mask_stack[stack_size]; + BBox1f cu_stack[stack_size]; + BBox1f cv_stack[stack_size]; + goto entry; + + /* terminate if stack is empty */ + while (sptr) + { + /* pop from stack */ + { + sptr--; + size_t mask = mask_stack[sptr]; + cu = cu_stack[sptr]; + cv = cv_stack[sptr]; + const size_t i = bscf(mask); + mask_stack[sptr] = mask; + if (mask) sptr++; // there are still items on the stack + + /* process next element recurse into each hit curve segment */ + const float u0 = float(i+0)*(1.0f/(VSIZEX-1)); + const float u1 = float(i+1)*(1.0f/(VSIZEX-1)); + const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1)); + cu = cui; + } + +#if 0 + solve_newton_raphson_no_recursion(cu,cv); + continue; + +#else + /* we assume convergence for small u ranges and verify using krawczyk */ + if (cu.size() < 1.0f/6.0f) { + const bool very_small = cu.size() < 0.001f || sptr >= stack_size; + if (solve_krawczyk(very_small,cu,cv)) { + continue; + } + } +#endif + + entry: + + /* split the curve into VSIZEX-1 segments in u-direction */ + vboolx valid = true; + TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu); + + /* slabs test in u-direction */ + Vec2vfx ndv = cross(subcurves.axis_v()); + BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds(); + valid &= boundsv.lower <= eps; + valid &= boundsv.upper >= -eps; + if (none(valid)) continue; + + /* slabs test in v-direction */ + Vec2vfx ndu = cross(subcurves.axis_u()); + BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds(); + valid &= boundsu.lower <= eps; + valid &= boundsu.upper >= -eps; + if (none(valid)) continue; + + /* push valid segments to stack */ + assert(sptr < stack_size); + mask_stack [sptr] = movemask(valid); + cu_stack [sptr] = cu; + cv_stack [sptr] = cv; + sptr++; + } + } + + __forceinline bool solve_newton_raphson_main() + { + BBox1f vu(0.0f,1.0f); + BBox1f vv(0.0f,1.0f); + solve_newton_raphson_recursion(vu,vv); + return isHit; + } + }; + + + template<template<typename Ty> class SourceCurve> + struct OrientedCurve1Intersector1 + { + //template<typename Ty> using Curve = SourceCurve<Ty>; + typedef SourceCurve<Vec3ff> SourceCurve3ff; + typedef SourceCurve<Vec3fa> SourceCurve3fa; + + __forceinline 
OrientedCurve1Intersector1() {} + + __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, + const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, + const Epilog& epilog) const + { + STAT3(normal.trav_prims,1,1,1); + + SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); + SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); + ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); + TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); + } + + template<typename Epilog> + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const + { + STAT3(normal.trav_prims,1,1,1); + //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); + } + }; + + template<template<typename Ty> class SourceCurve, int K> + struct OrientedCurve1IntersectorK + { + //template<typename Ty> using Curve = SourceCurve<Ty>; + typedef SourceCurve<Vec3ff> SourceCurve3ff; + typedef SourceCurve<Vec3fa> SourceCurve3fa; + + struct Ray1 + { + __forceinline Ray1(RayK<K>& ray, size_t k) + : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} + + Vec3fa org; + Vec3fa dir; + float _tnear; + float& tfar; + + __forceinline float& tnear() { return _tnear; } + //__forceinline float& tfar() { return _tfar; } + __forceinline const float& tnear() const { return _tnear; } + //__forceinline const float& tfar() const { return _tfar; } + }; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, + const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); + SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); + ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); + TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); + } + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const 
TensorLinearCubicBezierSurface3fa& curve, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h new file mode 100644 index 0000000000..de6b70be1b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/geometry.h" + +namespace embree +{ + namespace isa + { + struct CurvePrecalculations1 + { + float depth_scale; + LinearSpace3fa ray_space; + + __forceinline CurvePrecalculations1() {} + + __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr) + { + depth_scale = rsqrt(dot(ray.dir,ray.dir)); + LinearSpace3fa space = frame(depth_scale*ray.dir); + space.vz *= depth_scale; + ray_space = space.transposed(); + } + }; + + template<int K> + struct CurvePrecalculationsK + { + vfloat<K> depth_scale; + LinearSpace3fa ray_space[K]; + + __forceinline CurvePrecalculationsK(const vbool<K>& valid, const RayK<K>& ray) + { + size_t mask = movemask(valid); + depth_scale = rsqrt(dot(ray.dir,ray.dir)); + while (mask) { + size_t k = bscf(mask); + Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k); + ray_space_k.vz *= depth_scale[k]; + ray_space[k] = ray_space_k.transposed(); + } + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h new file mode 100644 index 0000000000..c3272e99fd --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h @@ -0,0 +1,216 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "quad_intersector.h" +#include "curve_intersector_precalculations.h" + +#define Bezier1Intersector1 RibbonCurve1Intersector1 +#define Bezier1IntersectorK RibbonCurve1IntersectorK + +namespace embree +{ + namespace isa + { + template<typename NativeCurve3ff, int M> + struct RibbonHit + { + __forceinline RibbonHit() {} + + __forceinline RibbonHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N, + const NativeCurve3ff& curve3D) + : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} + + __forceinline void finalize() + { + vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N)); + vv = V; + vt = T; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return curve3D.eval_du(vu[i]); } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return (Vec3vf<M>) curve3D.template veval_du<M>(vu); } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + int i, N; + NativeCurve3ff curve3D; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + 
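The workhorse of the normal-oriented path is `solve_newton_raphson_loop` in `curve_intersector_oriented.h` above: the 2D residual f(u,v) (the projected surface point in ray space) is driven to zero with steps uv -= J⁻¹·f, where the inverse Jacobian is computed once at the cell center and then reused for every iteration. A compact sketch of that kind of chord (frozen-Jacobian) 2D Newton iteration on an arbitrary residual; the names, the `std::function` interface, and the singular-Jacobian guard are assumptions, not the library API:

```
#include <cmath>
#include <functional>
#include <optional>

struct Vec2 { float x, y; };

// Inverse of the 2x2 Jacobian whose columns are dFdu and dFdv.
static inline bool invert2x2(const Vec2& dFdu, const Vec2& dFdv, float inv[2][2])
{
  const float det = dFdu.x * dFdv.y - dFdv.x * dFdu.y;
  if (std::fabs(det) < 1e-20f) return false;          // singular: give up
  const float r = 1.0f / det;
  inv[0][0] =  dFdv.y * r;  inv[0][1] = -dFdv.x * r;
  inv[1][0] = -dFdu.y * r;  inv[1][1] =  dFdu.x * r;
  return true;
}

// Chord Newton in 2D: the Jacobian is evaluated once at the starting point
// and reused, as solve_newton_raphson_loop does with its precomputed rcp_J.
std::optional<Vec2> newton2d(const std::function<Vec2(Vec2)>& F,
                             const std::function<void(Vec2, Vec2&, Vec2&)>& dF,
                             Vec2 uv, float eps, int maxIter = 200)
{
  Vec2 dFdu, dFdv;
  dF(uv, dFdu, dFdv);
  float invJ[2][2];
  if (!invert2x2(dFdu, dFdv, invJ)) return std::nullopt;

  for (int i = 0; i < maxIter; i++) {
    const Vec2 f = F(uv);
    if (std::fabs(f.x) < eps && std::fabs(f.y) < eps) return uv;  // converged
    uv.x -= invJ[0][0] * f.x + invJ[0][1] * f.y;                  // uv -= J^-1 * f
    uv.y -= invJ[1][0] * f.x + invJ[1][1] * f.y;
  }
  return std::nullopt;  // no convergence within the iteration budget
}
```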
vfloat<M> vt; + }; + + /* calculate squared distance of point p0 to line p1->p2 */ + __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2) + { + const vfloatx num = det(p2-p1,p1-p0); + const vfloatx den2 = dot(p2-p1,p2-p1); + return std::make_pair(num*num,den2); + } + + /* performs culling against a cylinder */ + __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r) + { + const std::pair<vfloatx,vfloatx> d = sqr_point_line_distance(p0,p1,p2); + return d.first <= r*r*d.second; + } + + template<typename NativeCurve3ff, typename Epilog> + __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar, + const LinearSpace3fa& ray_space, const float& depth_scale, + const NativeCurve3ff& curve3D, const int N, + const Epilog& epilog) + { + /* transform control points into ray space */ + const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org); + float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3))); + + /* evaluate the bezier curve */ + bool ishit = false; + vboolx valid = vfloatx(step) < vfloatx(float(N)); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N); + valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); + + if (any(valid)) + { + Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N); + Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N); + dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); + dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); + const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); + const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); + const Vec3vfx nn0 = normalize(n0); + const Vec3vfx nn1 = normalize(n1); + const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); + const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); + + vfloatx vu,vv,vt; + vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); + + if (any(valid0)) + { + /* ignore self intersections */ + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { + vfloatx r = lerp(p0.w, p1.w, vu); + valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; + } + + if (any(valid0)) + { + vv = madd(2.0f,vv,vfloatx(-1.0f)); + RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D); + ishit |= epilog(bhit.valid,bhit); + } + } + } + + if (unlikely(VSIZEX < N)) + { + /* process SIMD-size many segments per iteration */ + for (int i=VSIZEX; i<N; i+=VSIZEX) + { + /* evaluate the bezier curve */ + vboolx valid = vintx(i)+vintx(step) < vintx(N); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N); + valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); + if (none(valid)) continue; + + Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N); + Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N); + dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); + dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); + const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); + const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); + const 
Vec3vfx nn0 = normalize(n0); + const Vec3vfx nn1 = normalize(n1); + const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); + const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); + + vfloatx vu,vv,vt; + vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); + + if (any(valid0)) + { + /* ignore self intersections */ + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { + vfloatx r = lerp(p0.w, p1.w, vu); + valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; + } + + if (any(valid0)) + { + vv = madd(2.0f,vv,vfloatx(-1.0f)); + RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D); + ishit |= epilog(bhit.valid,bhit); + } + } + } + } + return ishit; + } + + template<template<typename Ty> class NativeCurve> + struct RibbonCurve1Intersector1 + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + NativeCurve3ff curve(v0,v1,v2,v3); + curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve); + return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar, + pre.ray_space,pre.depth_scale, + curve,N, + epilog); + } + }; + + template<template<typename Ty> class NativeCurve, int K> + struct RibbonCurve1IntersectorK + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + NativeCurve3ff curve(v0,v1,v2,v3); + curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve); + return intersect_ribbon<NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k], + pre.ray_space[k],pre.depth_scale[k], + curve,N, + epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h new file mode 100644 index 0000000000..2d4abd73ac --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h @@ -0,0 +1,364 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "cylinder.h" +#include "plane.h" +#include "line_intersector.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + static const size_t numJacobianIterations = 5; +#if defined(__AVX__) + static const size_t numBezierSubdivisions = 2; +#else + static const size_t numBezierSubdivisions = 3; +#endif + + struct BezierCurveHit + { + __forceinline BezierCurveHit() {} + + __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng) + : t(t), u(u), v(0.0f), Ng(Ng) {} + + __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng) + : t(t), u(u), v(v), Ng(Ng) {} + + 
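The ribbon path rejects whole batches of segments early with `cylinder_culling_test` in `curve_intersector_ribbon.h` above: the squared distance from the ray (the 2D origin in ray space) to the segment's supporting line is compared against the maximum radius, and the division in dist² = det(p2−p1, p1−p0)² / |p2−p1|² is avoided by cross-multiplying, i.e. testing num² ≤ r²·|p2−p1|². A scalar sketch of that division-free test:

```
struct Vec2 { float x, y; };

static inline Vec2  sub(const Vec2& a, const Vec2& b)  { return { a.x - b.x, a.y - b.y }; }
static inline float det2(const Vec2& a, const Vec2& b) { return a.x * b.y - a.y * b.x; }
static inline float dot2(const Vec2& a, const Vec2& b) { return a.x * b.x + a.y * b.y; }

// True if p0 lies within distance r of the infinite line through p1 and p2.
// dist^2 = det(p2-p1, p1-p0)^2 / |p2-p1|^2, but instead of dividing we compare
// num^2 <= r^2 * |p2-p1|^2, exactly as sqr_point_line_distance /
// cylinder_culling_test do above.
bool within_line_distance(const Vec2& p0, const Vec2& p1, const Vec2& p2, float r)
{
  const float num  = det2(sub(p2, p1), sub(p1, p0));
  const float den2 = dot2(sub(p2, p1), sub(p2, p1));
  return num * num <= r * r * den2;
}
```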
__forceinline void finalize() {} + + public: + float t; + float u; + float v; + Vec3fa Ng; + }; + + template<typename NativeCurve3ff, typename Ray, typename Epilog> + __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i, + const vfloatx& u, const BBox<vfloatx>& tp, const BBox<vfloatx>& h0, const BBox<vfloatx>& h1, + const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du, + const Epilog& epilog) + { + if (tp.lower[i]+dt > ray.tfar) return false; + Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]); + if (h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]); + if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]); + BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o); + return epilog(hit); + } + + template<typename NativeCurve3ff, typename Ray, typename Epilog> + __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog) + { + const Vec3fa org = zero; + const Vec3fa dir = ray.dir; + const float length_ray_dir = length(dir); + + /* error of curve evaluations is propertional to largest coordinate */ + const BBox3ff box = curve.bounds(); + const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper))); + + for (size_t i=0; i<numJacobianIterations; i++) + { + const Vec3fa Q = madd(Vec3fa(t),dir,org); + //const Vec3fa dQdu = zero; + const Vec3fa dQdt = dir; + const float Q_err = 16.0f*float(ulp)*length_ray_dir*t; // works as org=zero here + + Vec3ff P,dPdu,ddPdu; curve.eval(u,P,dPdu,ddPdu); + //const Vec3fa dPdt = zero; + + const Vec3fa R = Q-P; + const float len_R = length(R); //reduce_max(abs(R)); + const float R_err = max(Q_err,P_err); + const Vec3fa dRdu = /*dQdu*/-dPdu; + const Vec3fa dRdt = dQdt;//-dPdt; + + const Vec3fa T = normalize(dPdu); + const Vec3fa dTdu = dnormalize(dPdu,ddPdu); + //const Vec3fa dTdt = zero; + const float cos_err = P_err/length(dPdu); + + /* Error estimate for dot(R,T): + + dot(R,T) = cos(R,T) |R| |T| + = (cos(R,T) +- cos_error) * (|R| +- |R|_err) * (|T| +- |T|_err) + = cos(R,T)*|R|*|T| + +- cos(R,T)*(|R|*|T|_err + |T|*|R|_err) + +- cos_error*(|R| + |T|) + +- lower order terms + with cos(R,T) being in [0,1] and |T| = 1 we get: + dot(R,T)_err = |R|*|T|_err + |R|_err = cos_error*(|R|+1) + */ + + const float f = dot(R,T); + const float f_err = len_R*P_err + R_err + cos_err*(1.0f+len_R); + const float dfdu = dot(dRdu,T) + dot(R,dTdu); + const float dfdt = dot(dRdt,T);// + dot(R,dTdt); + + const float K = dot(R,R)-sqr(f); + const float dKdu = /*2.0f*/(dot(R,dRdu)-f*dfdu); + const float dKdt = /*2.0f*/(dot(R,dRdt)-f*dfdt); + const float rsqrt_K = rsqrt(K); + + const float g = sqrt(K)-P.w; + const float g_err = R_err + f_err + 16.0f*float(ulp)*box.upper.w; + const float dgdu = /*0.5f*/dKdu*rsqrt_K-dPdu.w; + const float dgdt = /*0.5f*/dKdt*rsqrt_K;//-dPdt.w; + + const LinearSpace2f J = LinearSpace2f(dfdu,dfdt,dgdu,dgdt); + const Vec2f dut = rcp(J)*Vec2f(f,g); + const Vec2f ut = Vec2f(u,t) - dut; + u = ut.x; t = ut.y; + + if (abs(f) < f_err && abs(g) < g_err) + { + t+=dt; + if (!(ray.tnear() <= t && t <= ray.tfar)) return false; // rejects NaNs + if (!(u >= 0.0f && u <= 1.0f)) return false; // rejects NaNs + const Vec3fa R = normalize(Q-P); + const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu); + const Vec3fa V = cross(dPdu,R); + BezierCurveHit hit(t,u,cross(V,U)); + return epilog(hit); + } + } + return false; + } + + template<typename NativeCurve3ff, 
typename Ray, typename Epilog> + bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, + float u0, float u1, unsigned int depth, const Epilog& epilog) + { +#if defined(__AVX__) + enum { VSIZEX_ = 8 }; + typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues + typedef vint8 vintx; + typedef vfloat8 vfloatx; +#else + enum { VSIZEX_ = 4 }; + typedef vbool4 vboolx; + typedef vint4 vintx; + typedef vfloat4 vfloatx; +#endif + typedef Vec3<vfloatx> Vec3vfx; + typedef Vec4<vfloatx> Vec4vfx; + + unsigned int maxDepth = numBezierSubdivisions; + bool found = false; + const Vec3fa org = zero; + const Vec3fa dir = ray.dir; + + unsigned int sptr = 0; + const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below + struct StackEntry { + vboolx valid; + vfloatx tlower; + float u0; + float u1; + unsigned int depth; + }; + StackEntry stack[stack_size]; + goto entry; + + /* terminate if stack is empty */ + while (sptr) + { + /* pop from stack */ + { + sptr--; + vboolx valid = stack[sptr].valid; + const vfloatx tlower = stack[sptr].tlower; + valid &= tlower+dt <= ray.tfar; + if (none(valid)) continue; + u0 = stack[sptr].u0; + u1 = stack[sptr].u1; + depth = stack[sptr].depth; + const size_t i = select_min(valid,tlower); clear(valid,i); + stack[sptr].valid = valid; + if (any(valid)) sptr++; // there are still items on the stack + + /* process next segment */ + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); + u0 = vu0[i+0]; + u1 = vu0[i+1]; + } + entry: + + /* subdivide curve */ + const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); + Vec4vfx P0, dP0du; curve.template veval<VSIZEX_>(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale); + const Vec4vfx P3 = shift_right_1(P0); + const Vec4vfx dP3du = shift_right_1(dP0du); + const Vec4vfx P1 = P0 + dP0du; + const Vec4vfx P2 = P3 - dP3du; + + /* calculate bounding cylinders */ + const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0)); + const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0)); + const vfloatx maxr12 = sqrt(max(rr1,rr2)); + const vfloatx one_plus_ulp = 1.0f+2.0f*float(ulp); + const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp); + vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12; + vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12; + r_outer = one_plus_ulp*r_outer; + r_inner = max(0.0f,one_minus_ulp*r_inner); + const CylinderN<vfloatx::size> cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer); + const CylinderN<vfloatx::size> cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner); + vboolx valid = true; clear(valid,vfloatx::size-1); + + /* intersect with outer cylinder */ + BBox<vfloatx> tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1; + valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1); + if (none(valid)) continue; + + /* intersect with cap-planes */ + BBox<vfloatx> tp(ray.tnear()-dt,ray.tfar-dt); + tp = embree::intersect(tp,tc_outer); + BBox<vfloatx> h0 = HalfPlaneN<vfloatx::size>(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir); + tp = embree::intersect(tp,h0); + BBox<vfloatx> h1 = HalfPlaneN<vfloatx::size>(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir); + tp = embree::intersect(tp,h1); + valid &= tp.lower <= tp.upper; + if (none(valid)) continue; + + /* clamp and correct u parameter */ + u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f)); 
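      // (u_outer0/u_outer1 are the per-segment cylinder hit parameters; after the
      //  clamps they are lerped back into the global curve parameter range [u0,u1])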
+ u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f)); + u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size))); + u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size))); + + /* intersect with inner cylinder */ + BBox<vfloatx> tc_inner; + vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero; + const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1); + + /* at the unstable area we subdivide deeper */ + const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f); + const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f); + + /* subtract the inner interval from the current hit interval */ + BBox<vfloatx> tp0, tp1; + subtract(tp,tc_inner,tp0,tp1); + vboolx valid0 = valid & (tp0.lower <= tp0.upper); + vboolx valid1 = valid & (tp1.lower <= tp1.upper); + if (none(valid0 | valid1)) continue; + + /* iterate over all first hits front to back */ + const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth)); + vboolx recursion_valid0 = valid0 & (depth < termDepth0); + valid0 &= depth >= termDepth0; + + while (any(valid0)) + { + const size_t i = select_min(valid0,tp0.lower); clear(valid0,i); + found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog); + //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog); + valid0 &= tp0.lower+dt <= ray.tfar; + } + valid1 &= tp1.lower+dt <= ray.tfar; + + /* iterate over all second hits front to back */ + const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth)); + vboolx recursion_valid1 = valid1 & (depth < termDepth1); + valid1 &= depth >= termDepth1; + while (any(valid1)) + { + const size_t i = select_min(valid1,tp1.lower); clear(valid1,i); + found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog); + //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog); + valid1 &= tp1.lower+dt <= ray.tfar; + } + + /* push valid segments to stack */ + recursion_valid0 &= tp0.lower+dt <= ray.tfar; + recursion_valid1 &= tp1.lower+dt <= ray.tfar; + const vboolx recursion_valid = recursion_valid0 | recursion_valid1; + if (any(recursion_valid)) + { + assert(sptr < stack_size); + stack[sptr].valid = recursion_valid; + stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower); + stack[sptr].u0 = u0; + stack[sptr].u1 = u1; + stack[sptr].depth = depth+1; + sptr++; + } + } + return found; + } + + template<template<typename Ty> class NativeCurve> + struct SweepCurve1Intersector1 + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + template<typename Epilog> + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + + /* move ray closer to make intersection stable */ + NativeCurve3ff curve0(v0,v1,v2,v3); + curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); + const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); + const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); + const NativeCurve3ff curve1 = curve0-ref; + return 
intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); + } + }; + + template<template<typename Ty> class NativeCurve, int K> + struct SweepCurve1IntersectorK + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + struct Ray1 + { + __forceinline Ray1(RayK<K>& ray, size_t k) + : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} + + Vec3fa org; + Vec3fa dir; + float _tnear; + float& tfar; + + __forceinline float& tnear() { return _tnear; } + //__forceinline float& tfar() { return _tfar; } + __forceinline const float& tnear() const { return _tnear; } + //__forceinline const float& tfar() const { return _tfar; } + + }; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + + /* move ray closer to make intersection stable */ + NativeCurve3ff curve0(v0,v1,v2,v3); + curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); + const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); + const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); + const NativeCurve3ff curve1 = curve0-ref; + return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h new file mode 100644 index 0000000000..cffa8e46ad --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h @@ -0,0 +1,671 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../subdiv/bezier_curve.h" +#include "../common/primref.h" +#include "curve_intersector_precalculations.h" +#include "../bvh/node_intersector1.h" +#include "../bvh/node_intersector_packet.h" + +#include "intersector_epilog.h" + +#include "../subdiv/bezier_curve.h" +#include "../subdiv/bspline_curve.h" +#include "../subdiv/hermite_curve.h" +#include "../subdiv/catmullrom_curve.h" + +#include "spherei_intersector.h" +#include "disci_intersector.h" + +#include "linei_intersector.h" +#include "roundlinei_intersector.h" +#include "conelinei_intersector.h" + +#include "curveNi_intersector.h" +#include "curveNv_intersector.h" +#include "curveNi_mb_intersector.h" + +#include "curve_intersector_distance.h" +#include "curve_intersector_ribbon.h" +#include "curve_intersector_oriented.h" +#include "curve_intersector_sweep.h" + +namespace embree +{ + struct VirtualCurveIntersector + { + typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive); + typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive); + + typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + typedef void (*Intersect16Ty)(void* pre, void* 
ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + public: + struct Intersectors + { + Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp. + + template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive); + template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive); + + template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + public: + Intersect1Ty intersect1; + Occluded1Ty occluded1; + Intersect4Ty intersect4; + Occluded4Ty occluded4; + Intersect8Ty intersect8; + Occluded8Ty occluded8; + Intersect16Ty intersect16; + Occluded16Ty occluded16; + }; + + Intersectors vtbl[Geometry::GTY_END]; + }; + + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); } + + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); } + +#if defined(__AVX__) + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); } +#endif + +#if defined(__AVX512F__) + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); } +#endif + + namespace isa + { + struct VirtualCurveIntersector1 + { + typedef unsigned char Primitive; + typedef CurvePrecalculations1 Precalculations; + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + 
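        // Leaf dispatch: the first byte of the leaf primitive encodes its RTCGeometryType,
        // which indexes the per-type vtbl[] of the VirtualCurveIntersector stored in
        // This->leafIntersector; the call is then forwarded through the function pointer
        // registered for the requested ray width.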
assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<1>(&pre,&ray,context,prim); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<1>(&pre,&ray,context,prim); + } + }; + + template<int K> + struct VirtualCurveIntersectorK + { + typedef unsigned char Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + size_t mask = movemask(valid_i); + while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim); + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + vbool<K> valid_o = false; + size_t mask = movemask(valid_i); + while (mask) { + size_t k = bscf(mask); + if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim)) + set(valid_o, k); + } + return valid_o; + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<K>(&pre,&ray,k,context,prim); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<K>(&pre,&ray,k,context,prim); + } + }; + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = 
(VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors 
intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors SphereNiIntersectors() + { + 
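      // Like the other *Intersectors() factories in this header, this builds the table of
      // function pointers for one geometry type, one entry per supported ray packet width;
      // the 8- and 16-wide slots are only populated when the AVX / AVX-512 variants are
      // compiled in.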
VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors DiscNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1<N,true>::intersect; + 
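      // (the *MB factories differ from their non-MB counterparts above only in selecting
      //  the motion-blur leaf intersectors, which evaluate the primitive at the ray's
      //  time value)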
intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template 
intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNvIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template 
intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNvIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + 
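      // CurveNv* leaves appear to store the curve vertices directly in the leaf, whereas
      // the CurveNi* leaves above reference them by index; both variants are handed to the
      // same sweep intersector here.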
intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNvIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_n 
<OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + 
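      // The intersect_h/occluded_h entry points used by the Hermite factories appear to
      // gather Hermite control data (positions plus tangents) for the selected curve
      // before invoking the ribbon or sweep intersector.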
intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + 
intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template 
intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + } +} diff --git a/thirdparty/embree/kernels/geometry/cylinder.h b/thirdparty/embree/kernels/geometry/cylinder.h new file mode 100644 index 0000000000..dab02989ce --- /dev/null +++ b/thirdparty/embree/kernels/geometry/cylinder.h @@ -0,0 +1,223 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct Cylinder + { + const Vec3fa p0; //!< start location + const Vec3fa p1; //!< end position + const float rr; //!< squared radius of cylinder + + __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) + : p0(p0), p1(p1), rr(sqr(r)) {} + + __forceinline 
Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) + : p0(p0), p1(p1), rr(rr) {} + + __forceinline bool intersect(const Vec3fa& org, + const Vec3fa& dir, + BBox1f& t_o, + float& u0_o, Vec3fa& Ng0_o, + float& u1_o, Vec3fa& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const float rl = rcp_length(p1-p0); + const Vec3fa P0 = p0, dP = (p1-p0)*rl; + const Vec3fa O = org-P0, dO = dir; + + const float dOdO = dot(dO,dO); + const float OdO = dot(dO,O); + const float OO = dot(O,O); + const float dOz = dot(dP,dO); + const float Oz = dot(dP,O); + + const float A = dOdO - sqr(dOz); + const float B = 2.0f * (OdO - dOz*Oz); + const float C = OO - sqr(Oz) - rr; + + /* we miss the cylinder if determinant is smaller than zero */ + const float D = B*B - 4.0f*A*C; + if (D < 0.0f) { + t_o = BBox1f(pos_inf,neg_inf); + return false; + } + + /* special case for rays that are parallel to the cylinder */ + const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + if (abs(A) < eps) + { + if (C <= 0.0f) { + t_o = BBox1f(neg_inf,pos_inf); + return true; + } else { + t_o = BBox1f(pos_inf,neg_inf); + return false; + } + } + + /* standard case for rays that are not parallel to the cylinder */ + const float Q = sqrt(D); + const float rcp_2A = rcp(2.0f*A); + const float t0 = (-B-Q)*rcp_2A; + const float t1 = (-B+Q)*rcp_2A; + + /* calculates u and Ng for near hit */ + { + u0_o = madd(t0,dOz,Oz)*rl; + const Vec3fa Pr = t0*dir; + const Vec3fa Pl = madd(u0_o,p1-p0,p0); + Ng0_o = Pr-Pl; + } + + /* calculates u and Ng for far hit */ + { + u1_o = madd(t1,dOz,Oz)*rl; + const Vec3fa Pr = t1*dir; + const Vec3fa Pl = madd(u1_o,p1-p0,p0); + Ng1_o = Pr-Pl; + } + + t_o.lower = t0; + t_o.upper = t1; + return true; + } + + __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const + { + float u0_o; Vec3fa Ng0_o; + float u1_o; Vec3fa Ng1_o; + return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + + static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1) + { + float eps = 0.001f; + BBox1f t; bool hit; + hit = cylinder.intersect(ray.org,ray.dir,t); + + bool failed = hit != shouldhit; + if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps; + if (shouldhit) failed |= std::isinf(t1) ? 
t1 != t.upper : abs(t1-t.upper) > eps; + if (!failed) return true; + embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; + return false; + } + + /* verify cylinder class */ + static bool verify() + { + bool passed = true; + const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + return passed; + } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) { + return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}"; + } + }; + + template<int N> + struct CylinderN + { + const Vec3vf<N> p0; //!< start location + const Vec3vf<N> p1; //!< end position + const vfloat<N> rr; //!< squared radius of cylinder + + __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& r) + : p0(p0), p1(p1), rr(sqr(r)) {} + + __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& rr, bool) + : p0(p0), p1(p1), rr(rr) {} + + + __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, + BBox<vfloat<N>>& t_o, + vfloat<N>& u0_o, Vec3vf<N>& Ng0_o, + vfloat<N>& u1_o, Vec3vf<N>& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const vfloat<N> rl = rcp_length(p1-p0); + const Vec3vf<N> P0 = p0, dP = (p1-p0)*rl; + const Vec3vf<N> O = Vec3vf<N>(org)-P0, dO = dir; + + const vfloat<N> dOdO = dot(dO,dO); + const vfloat<N> OdO = dot(dO,O); + const vfloat<N> OO = dot(O,O); + const vfloat<N> dOz = dot(dP,dO); + const vfloat<N> Oz = dot(dP,O); + + const vfloat<N> A = dOdO - sqr(dOz); + const vfloat<N> B = 2.0f * (OdO - dOz*Oz); + const vfloat<N> C = OO - sqr(Oz) - rr; + + /* we miss the cylinder if determinant is smaller than zero */ + const vfloat<N> D = B*B - 4.0f*A*C; + vbool<N> valid = D >= 0.0f; + if (none(valid)) { + t_o = BBox<vfloat<N>>(empty); + return valid; + } + + /* standard case for rays that are not parallel to the cylinder */ + const vfloat<N> Q = sqrt(D); + const vfloat<N> rcp_2A = rcp(2.0f*A); + const vfloat<N> t0 = (-B-Q)*rcp_2A; + const vfloat<N> t1 = (-B+Q)*rcp_2A; + + /* calculates u and Ng for near hit */ + { + u0_o = madd(t0,dOz,Oz)*rl; + const Vec3vf<N> Pr = t0*Vec3vf<N>(dir); + const Vec3vf<N> Pl = madd(u0_o,p1-p0,p0); + Ng0_o = Pr-Pl; + } + + /* calculates u and Ng for far hit */ + { + u1_o = madd(t1,dOz,Oz)*rl; + const Vec3vf<N> Pr = t1*Vec3vf<N>(dir); + const Vec3vf<N> Pl = madd(u1_o,p1-p0,p0); + Ng1_o = Pr-Pl; + } + + t_o.lower = select(valid, t0, vfloat<N>(pos_inf)); + t_o.upper = select(valid, t1, vfloat<N>(neg_inf)); + + /* special case for rays 
that are parallel to the cylinder */ + const vfloat<N> eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + vbool<N> validt = valid & (abs(A) < eps); + if (unlikely(any(validt))) + { + vbool<N> inside = C <= 0.0f; + t_o.lower = select(validt,select(inside,vfloat<N>(neg_inf),vfloat<N>(pos_inf)),t_o.lower); + t_o.upper = select(validt,select(inside,vfloat<N>(pos_inf),vfloat<N>(neg_inf)),t_o.upper); + valid &= !validt | inside; + } + return valid; + } + + __forceinline vbool<N> intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const + { + vfloat<N> u0_o; Vec3vf<N> Ng0_o; + vfloat<N> u1_o; Vec3vf<N> Ng1_o; + return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + }; + } +} + diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h new file mode 100644 index 0000000000..816c066899 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/disc_intersector.h @@ -0,0 +1,216 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_points.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct DiscIntersectorHitM + { + __forceinline DiscIntersectorHitM() {} + + __forceinline DiscIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) + { + } + + __forceinline void finalize() {} + + __forceinline Vec2f uv(const size_t i) const + { + return Vec2f(vu[i], vv[i]); + } + __forceinline float t(const size_t i) const + { + return vt[i]; + } + __forceinline Vec3fa Ng(const size_t i) const + { + return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); + } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct DiscIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Epilog> + static __forceinline bool intersect( + const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + + valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections + if (unlikely(none(valid))) + return false; + + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); + return epilog(valid, hit); + } + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Vec3vf<M>& normal, + const Epilog& epilog) + { + vbool<M> valid = valid_i; 
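+        // This overload handles normal-oriented discs: the code below intersects the ray with
+        // the plane through `center` with normal `normal` (t = dot(center - org, normal) / dot(dir, normal),
+        // guarding the divisor against rays parallel to the plane), clips t against [tnear, tfar],
+        // and accepts the hit only if the squared distance from the intersection point to `center`
+        // is below the squared radius.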
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + vfloat<M> divisor = dot(Vec3vf<M>((Vec3fa)ray.dir), normal); + const vbool<M> parallel = divisor == vfloat<M>(0.f); + valid &= !parallel; + divisor = select(parallel, 1.f, divisor); // prevent divide by zero + + vfloat<M> t = dot(center - Vec3vf<M>((Vec3fa)ray.org), Vec3vf<M>(normal)) / divisor; + + valid &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar)); + if (unlikely(none(valid))) + return false; + + Vec3vf<M> intersection = Vec3vf<M>((Vec3fa)ray.org) + Vec3vf<M>((Vec3fa)ray.dir) * t; + vfloat<M> dist2 = dot(intersection - center, intersection - center); + valid &= dist2 < radius * radius; + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, t, normal); + return epilog(valid, hit); + } + }; + + template<int M, int K> + struct DiscIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, + size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + + valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k])); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections + if (unlikely(none(valid))) + return false; + + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); + return epilog(valid, hit); + } + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, + size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Vec3vf<M>& normal, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + vfloat<M> divisor = dot(Vec3vf<M>(ray_dir), normal); + const vbool<M> parallel = divisor == vfloat<M>(0.f); + valid &= !parallel; + divisor = select(parallel, 1.f, divisor); // prevent divide by zero + + vfloat<M> t = dot(center - Vec3vf<M>(ray_org), Vec3vf<M>(normal)) / divisor; + + valid &= (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k])); + if (unlikely(none(valid))) + return false; + + Vec3vf<M> intersection = Vec3vf<M>(ray_org) + Vec3vf<M>(ray_dir) * t; + vfloat<M> dist2 = dot(intersection - center, intersection - 
center); + valid &= dist2 < radius * radius; + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, t, normal); + return epilog(valid, hit); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/disci_intersector.h b/thirdparty/embree/kernels/geometry/disci_intersector.h new file mode 100644 index 0000000000..bb9d396f6e --- /dev/null +++ b/thirdparty/embree/kernels/geometry/disci_intersector.h @@ -0,0 +1,277 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "disc_intersector.h" +#include "intersector_epilog.h" +#include "pointi.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct DiscMiIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, bool filter> + struct DiscMiMBIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct DiscMiIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, 
filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct DiscMiMBIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, bool filter> + struct OrientedDiscMiIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, bool filter> + struct OrientedDiscMiMBIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, filter>(ray, context, 
Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct OrientedDiscMiIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct OrientedDiscMiMBIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h new file mode 100644 index 0000000000..3b4d924ea7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/filter.h @@ -0,0 +1,204 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" +#include "../common/ray.h" +#include "../common/hit.h" +#include "../common/context.h" + +namespace embree +{ + 
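+  // The helpers below invoke the optional per-geometry filter callbacks (intersectionFilterN /
+  // occlusionFilterN) and the per-context callback (context->user->filter) for single rays and
+  // for ray packets of width K. They then translate the resulting valid masks back into hit
+  // acceptance (copying the filtered hit into the ray) or occlusion (setting tfar to -inf for
+  // occluded lanes).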
namespace isa + { + __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + if (geometry->intersectionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(args); + + if (args->valid[0] == 0) + return false; + } + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + + if (args->valid[0] == 0) + return false; + } + + copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit); + return true; + } + + __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit) + { + RTCFilterFunctionNArguments args; + int mask = -1; + args.valid = &mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = 1; + return runIntersectionFilter1Helper(&args,geometry,context); + } + + __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) + { +#if defined(EMBREE_FILTER_FUNCTION) + IntersectContext* MAYBE_UNUSED context = args->internal_context; + const Geometry* const geometry = args->geometry; + if (geometry->intersectionFilterN) { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(filter_args); + } + + //if (args->valid[0] == 0) + // return; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(filter_args); + } +#endif + } + + __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + if (geometry->occlusionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(args); + + if (args->valid[0] == 0) + return false; + } + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + + if (args->valid[0] == 0) + return false; + } + return true; + } + + __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit) + { + RTCFilterFunctionNArguments args; + int mask = -1; + args.valid = &mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = 1; + return runOcclusionFilter1Helper(&args,geometry,context); + } + + __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) + { +#if defined(EMBREE_FILTER_FUNCTION) + IntersectContext* MAYBE_UNUSED context = args->internal_context; + const Geometry* const geometry = args->geometry; + if (geometry->occlusionFilterN) { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(filter_args); + } + + //if (args->valid[0] == 0) + // return false; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(filter_args); + } +#endif + } + + template<int K> + __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + vint<K>* mask = (vint<K>*) args->valid; + if (geometry->intersectionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(args); + } + + vbool<K> valid_o = *mask != 
vint<K>(zero); + if (none(valid_o)) return valid_o; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + } + + valid_o = *mask != vint<K>(zero); + if (none(valid_o)) return valid_o; + + copyHitToRay(valid_o,*(RayHitK<K>*)args->ray,*(HitK<K>*)args->hit); + return valid_o; + } + + template<int K> + __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit) + { + RTCFilterFunctionNArguments args; + vint<K> mask = valid.mask32(); + args.valid = (int*)&mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = K; + return runIntersectionFilterHelper<K>(&args,geometry,context); + } + + template<int K> + __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + vint<K>* mask = (vint<K>*) args->valid; + if (geometry->occlusionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(args); + } + + vbool<K> valid_o = *mask != vint<K>(zero); + + if (none(valid_o)) return valid_o; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + } + + valid_o = *mask != vint<K>(zero); + + RayK<K>* ray = (RayK<K>*) args->ray; + ray->tfar = select(valid_o, vfloat<K>(neg_inf), ray->tfar); + return valid_o; + } + + template<int K> + __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit) + { + RTCFilterFunctionNArguments args; + vint<K> mask = valid.mask32(); + args.valid = (int*)&mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = K; + return runOcclusionFilterHelper<K>(&args,geometry,context); + } + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_intersector.h b/thirdparty/embree/kernels/geometry/grid_intersector.h new file mode 100644 index 0000000000..9c59cef119 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_intersector.h @@ -0,0 +1,99 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "grid_soa_intersector1.h" +#include "grid_soa_intersector_packet.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template<typename T> + class SubdivPatch1Precalculations : public T + { + public: + __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) + : T(ray,ptr) {} + }; + + template<int K, typename T> + class SubdivPatch1PrecalculationsK : public T + { + public: + __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) + : T(valid,ray) {} + }; + + class Grid1Intersector1 + { + public: + typedef GridSOA Primitive; + typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations; + + /*! Intersect a ray with the primitive. 
*/ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); + } + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + intersect(pre,ray,context,prim,ty,lazy_node); + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); + } + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + return occluded(pre,ray,context,prim,ty,lazy_node); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) { + assert(false && "not implemented"); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + assert(false && "not implemented"); + return false; + } + }; + + template <int K> + struct GridIntersectorK + { + typedef GridSOA Primitive; + typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations; + + + static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node); + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node); + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node); + } + }; + + typedef Grid1IntersectorK<4> SubdivPatch1Intersector4; + typedef Grid1IntersectorK<8> SubdivPatch1Intersector8; + typedef Grid1IntersectorK<16> SubdivPatch1Intersector16; + + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_soa.h b/thirdparty/embree/kernels/geometry/grid_soa.h new file mode 100644 index 0000000000..cea90aedf6 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_soa.h @@ -0,0 +1,275 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_subdiv_mesh.h" +#include "../bvh/bvh.h" +#include "../subdiv/tessellation.h" +#include "../subdiv/tessellation_cache.h" +#include "subdivpatch1.h" + +namespace embree +{ + namespace isa + { + class GridSOA + { + public: + + /*! 
GridSOA constructor */ + GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr); + + /*! Subgrid creation */ + template<typename Allocator> + static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps, + unsigned x0, unsigned x1, unsigned y0, unsigned y1, + const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr) + { + const unsigned width = x1-x0+1; + const unsigned height = y1-y0+1; + const GridRange range(0,width-1,0,height-1); + size_t bvhBytes = 0; + if (time_steps == 1) + bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0); + else { + bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0); + bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D)); + } + const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float); + size_t rootBytes = time_steps*sizeof(BVH4::NodeRef); +#if !defined(__64BIT__) + rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding. +#endif + void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes); + assert(data); + return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get<SubdivMesh>(patches->geomID()),bvhBytes,gridBytes,bounds_o); + } + + /*! Grid creation */ + template<typename Allocator> + static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps, + const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) + { + return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o); + } + + /*! returns reference to root */ + __forceinline BVH4::NodeRef& root(size_t t = 0) { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } + __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } + + /*! returns pointer to BVH array */ + __forceinline char* bvhData() { return &data[0]; } + __forceinline const char* bvhData() const { return &data[0]; } + + /*! returns pointer to Grid array */ + __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; } + __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; } + + __forceinline void* encodeLeaf(size_t u, size_t v) { + return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf + } + __forceinline float* decodeLeaf(size_t t, const void* ptr) { + return gridData(t) + (((size_t) (ptr) >> 4) - 1); + } + + /*! returns the size of the BVH over the grid in bytes */ + static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes); + + /*! returns the size of the temporal BVH over the time range BVHs */ + static size_t getTemporalBVHBytes(const range<int> time_range, const size_t nodeBytes); + + /*! 
calculates bounding box of grid range */ + __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const + { + const float* const grid_array = gridData(time); + const float* const grid_x_array = grid_array + 0 * dim_offset; + const float* const grid_y_array = grid_array + 1 * dim_offset; + const float* const grid_z_array = grid_array + 2 * dim_offset; + + /* compute the bounds just for the range! */ + BBox3fa bounds( empty ); + for (unsigned v = range.v_start; v<=range.v_end; v++) + { + for (unsigned u = range.u_start; u<=range.u_end; u++) + { + const float x = grid_x_array[ v * width + u]; + const float y = grid_y_array[ v * width + u]; + const float z = grid_z_array[ v * width + u]; + bounds.extend( Vec3fa(x,y,z) ); + } + } + assert(is_finite(bounds)); + return bounds; + } + + /*! Evaluates grid over patch and builds BVH4 tree over the grid. */ + std::pair<BVH4::NodeRef,BBox3fa> buildBVH(BBox3fa* bounds_o); + + /*! Create BVH4 tree over grid. */ + std::pair<BVH4::NodeRef,BBox3fa> buildBVH(const GridRange& range, size_t& allocator); + + /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */ + std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, BBox3fa* bounds_o); + + /*! Create MBlur BVH4 tree over grid. */ + std::pair<BVH4::NodeRef,LBBox3fa> buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator); + + /*! Create MSMBlur BVH4 tree over grid. */ + std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, size_t& allocator, BBox3fa* bounds_o); + + template<typename Loader> + struct MapUV + { + typedef typename Loader::vfloat vfloat; + const float* const grid_uv; + size_t line_offset; + size_t lines; + + __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines) + : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {} + + __forceinline void operator() (vfloat& u, vfloat& v, Vec3<vfloat>& Ng) const { + const Vec3<vfloat> tri_v012_uv = Loader::gather(grid_uv,line_offset,lines); + const Vec2<vfloat> uv0 = GridSOA::decodeUV(tri_v012_uv[0]); + const Vec2<vfloat> uv1 = GridSOA::decodeUV(tri_v012_uv[1]); + const Vec2<vfloat> uv2 = GridSOA::decodeUV(tri_v012_uv[2]); + const Vec2<vfloat> uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0; + u = uv[0];v = uv[1]; + } + }; + + struct Gather2x3 + { + enum { M = 4 }; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines) + { + vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset); + vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid + if (unlikely(line_offset == 2)) + { + r0 = shuffle<0,1,1,1>(r0); + r1 = shuffle<0,1,1,1>(r1); + } + return Vec3vf4(unpacklo(r0,r1), // r00, r10, r01, r11 + shuffle<1,1,2,2>(r0), // r01, r01, r02, r02 + shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12 + } + + static __forceinline void gather(const float* const grid_x, + const float* const grid_y, + const float* const grid_z, + const size_t line_offset, + const size_t lines, + Vec3vf4& v0_o, + Vec3vf4& v1_o, + Vec3vf4& v2_o) + { + const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines); + const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines); + const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines); + v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); + v1_o = 
Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); + v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); + } + }; + +#if defined (__AVX__) + struct Gather3x3 + { + enum { M = 8 }; + typedef vbool8 vbool; + typedef vint8 vint; + typedef vfloat8 vfloat; + + static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines) + { + vfloat4 ra = vfloat4::loadu(grid + 0*line_offset); + vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid + vfloat4 rc; + if (likely(lines > 2)) + rc = vfloat4::loadu(grid + 2*line_offset); + else + rc = rb; + + if (unlikely(line_offset == 2)) + { + ra = shuffle<0,1,1,1>(ra); + rb = shuffle<0,1,1,1>(rb); + rc = shuffle<0,1,1,1>(rc); + } + + const vfloat8 r0 = vfloat8(ra,rb); + const vfloat8 r1 = vfloat8(rb,rc); + return Vec3vf8(unpacklo(r0,r1), // r00, r10, r01, r11, r10, r20, r11, r21 + shuffle<1,1,2,2>(r0), // r01, r01, r02, r02, r11, r11, r12, r12 + shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12, r20, r21, r21, r22 + } + + static __forceinline void gather(const float* const grid_x, + const float* const grid_y, + const float* const grid_z, + const size_t line_offset, + const size_t lines, + Vec3vf8& v0_o, + Vec3vf8& v1_o, + Vec3vf8& v2_o) + { + const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines); + const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines); + const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines); + v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); + v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); + v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); + } + }; +#endif + + template<typename vfloat> + static __forceinline Vec2<vfloat> decodeUV(const vfloat& uv) + { + typedef typename vfloat::Int vint; + const vint iu = asInt(uv) & 0xffff; + const vint iv = srl(asInt(uv),16); + const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000); + const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000); + return Vec2<vfloat>(u,v); + } + + __forceinline unsigned int geomID() const { + return _geomID; + } + + __forceinline unsigned int primID() const { + return _primID; + } + + public: + BVH4::NodeRef troot; +#if !defined(__64BIT__) + unsigned align1; +#endif + unsigned time_steps; + unsigned width; + + unsigned height; + unsigned dim_offset; + unsigned _geomID; + unsigned _primID; + + unsigned align2; + unsigned gridOffset; + unsigned gridBytes; + unsigned rootOffset; + + char data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h new file mode 100644 index 0000000000..8fbf0d4bdf --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h @@ -0,0 +1,207 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "../common/ray.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + class GridSOAIntersector1 + { + public: + typedef void Primitive; + + class Precalculations + { + public: + __forceinline Precalculations (const Ray& ray, const void* ptr) + : grid(nullptr) {} + + public: + GridSOA* grid; + int itime; + float ftime; + }; + + template<typename Loader> + static __forceinline void intersect(RayHit& ray, + IntersectContext* context, + const float* const 
grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3<vfloat> v0, v1, v2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(Ray& ray, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> v0, v1, v2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, context, grid_x , line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! 
Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + + class GridSOAMBIntersector1 + { + public: + typedef void Primitive; + typedef GridSOAIntersector1::Precalculations Precalculations; + + template<typename Loader> + static __forceinline void intersect(RayHit& ray, const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(Ray& ray, const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. 
*/ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); + +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x, line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h new file mode 100644 index 0000000000..14cacab5fe --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h @@ -0,0 +1,445 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "../common/ray.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + template<int K> + struct MapUV0 + { + const float* const grid_uv; + size_t ofs00, ofs01, ofs10, ofs11; + + __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) + : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} + + __forceinline void operator() (vfloat<K>& u, vfloat<K>& v, Vec3vf<K>& Ng) const { + const vfloat<K> uv00(grid_uv[ofs00]); + const vfloat<K> uv01(grid_uv[ofs01]); + const vfloat<K> uv10(grid_uv[ofs10]); + const vfloat<K> uv11(grid_uv[ofs11]); + const Vec2vf<K> uv0 = GridSOA::decodeUV(uv00); + const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01); + const Vec2vf<K> uv2 = GridSOA::decodeUV(uv10); + const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); + u = uv[0]; v = uv[1]; + } + }; + + template<int K> + struct MapUV1 + { + const float* const grid_uv; + size_t ofs00, ofs01, ofs10, ofs11; + + __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) + : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} + + __forceinline void operator() (vfloat<K>& u, vfloat<K>& v, Vec3vf<K>& Ng) const { + const vfloat<K> uv00(grid_uv[ofs00]); + const vfloat<K> uv01(grid_uv[ofs01]); + const vfloat<K> uv10(grid_uv[ofs10]); + const vfloat<K> uv11(grid_uv[ofs11]); + const Vec2vf<K> uv0 = GridSOA::decodeUV(uv10); + const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01); + const Vec2vf<K> uv2 = GridSOA::decodeUV(uv11); + const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); + u = uv[0]; v = uv[1]; + } + }; + + template<int 
K> + class GridSOAIntersectorK + { + public: + typedef void Primitive; + + class Precalculations + { +#if defined(__AVX__) + static const int M = 8; +#else + static const int M = 4; +#endif + + public: + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) + : grid(nullptr), intersector(valid,ray) {} + + public: + GridSOA* grid; + PlueckerIntersectorK<M,K> intersector; // FIXME: use quad intersector + }; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + const size_t ofs00 = (y+0)*line_offset+(x+0); + const size_t ofs01 = (y+0)*line_offset+(x+1); + const size_t ofs10 = (y+1)*line_offset+(x+0); + const size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + + pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + } + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + vbool<K> valid = valid_i; + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 
1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + const size_t ofs00 = (y+0)*line_offset+(x+0); + const size_t ofs01 = (y+0)*line_offset+(x+1); + const size_t ofs10 = (y+1)*line_offset+(x+0); + const size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + + pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + } + } + return !valid; + } + + template<typename Loader> + static __forceinline void intersect(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(RayK<K>& ray, size_t k, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, k, context, grid_x , line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! 
Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + + template<int K> + class GridSOAMBIntersectorK + { + public: + typedef void Primitive; + typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + vfloat<K> vftime; + vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime); + + vbool<K> valid1 = valid_i; + while (any(valid1)) { + const size_t j = bsf(movemask(valid1)); + const int itime = vitime[j]; + const vbool<K> valid2 = valid1 & (itime == vitime); + valid1 = valid1 & !valid2; + intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node); + } + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 
1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + size_t ofs00 = (y+0)*line_offset+(x+0); + size_t ofs01 = (y+0)*line_offset+(x+1); + size_t ofs10 = (y+1)*line_offset+(x+0); + size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + ofs00 += grid_offset; + ofs01 += grid_offset; + ofs10 += grid_offset; + ofs11 += grid_offset; + const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + const Vec3vf<K> p00 = lerp(a00,b00,ftime); + const Vec3vf<K> p01 = lerp(a01,b01,ftime); + const Vec3vf<K> p10 = lerp(a10,b10,ftime); + const Vec3vf<K> p11 = lerp(a11,b11,ftime); + + pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + } + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + vfloat<K> vftime; + vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime); + + vbool<K> valid_o = valid_i; + vbool<K> valid1 = valid_i; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int itime = vitime[j]; + const vbool<K> valid2 = valid1 & (itime == vitime); + valid1 = valid1 & !valid2; + valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node); + } + return !valid_o; + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + vbool<K> valid = valid_i; + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 
1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + size_t ofs00 = (y+0)*line_offset+(x+0); + size_t ofs01 = (y+0)*line_offset+(x+1); + size_t ofs10 = (y+1)*line_offset+(x+0); + size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + ofs00 += grid_offset; + ofs01 += grid_offset; + ofs10 += grid_offset; + ofs11 += grid_offset; + const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + const Vec3vf<K> p00 = lerp(a00,b00,ftime); + const Vec3vf<K> p01 = lerp(a01,b01,ftime); + const Vec3vf<K> p10 = lerp(a10,b10,ftime); + const Vec3vf<K> p11 = lerp(a11,b11,ftime); + + pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + } + } + return valid; + } + + template<typename Loader> + static __forceinline void intersect(RayHitK<K>& ray, size_t k, + const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(RayK<K>& ray, size_t k, + const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = 
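For motion blur the grid stores one vertex patch per time step, and the intersector maps the ray time onto a segment index `itime` plus a local blend factor `ftime`, then linearly interpolates the two neighbouring patches with `lerp(a,b,ftime)`. A scalar sketch of that mapping, assuming a global time in [0,1] and at least two time steps; the helper name `getTimeSegmentScalar` is made up for the example:

```
#include <algorithm>
#include <cmath>
#include <cstdio>

// Map a global time in [0,1] onto a segment index and a local blend factor,
// mirroring what getTimeSegment() does for time_steps vertex patches.
static int getTimeSegmentScalar(float time, int time_steps, float& ftime) {
    const float f = time * float(time_steps - 1);        // scale into segment space
    float itime_f = std::floor(f);
    itime_f = std::min(itime_f, float(time_steps - 2));  // clamp so itime+1 stays valid
    itime_f = std::max(itime_f, 0.0f);
    ftime = f - itime_f;                                  // local blend factor in [0,1]
    return int(itime_f);
}

static float lerp(float a, float b, float t) { return a + t * (b - a); }

int main() {
    const int time_steps = 4;    // 4 patches -> 3 time segments
    float ftime;
    const int itime = getTimeSegmentScalar(0.8f, time_steps, ftime);
    // Blend one coordinate of a vertex between patch itime and itime+1.
    const float x[time_steps] = {0.0f, 1.0f, 4.0f, 9.0f};
    const float xt = lerp(x[itime], x[itime + 1], ftime);
    std::printf("itime=%d ftime=%.2f x(t)=%.2f\n", itime, ftime, xt);
    return 0;
}
```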
lerp(a2,b2,vfloat(ftime)); + + return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + float ftime; + int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); + + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + float ftime; + int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); + + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/instance.h b/thirdparty/embree/kernels/geometry/instance.h new file mode 100644 index 0000000000..7c0e7e0f49 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/instance.h @@ -0,0 +1,78 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene_instance.h" + +namespace embree +{ + struct InstancePrimitive + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return N; } + + public: + + InstancePrimitive (const Instance* instance, unsigned int instID) + : instance(instance) + , instID_(instID) + {} + + __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) + { + assert(end-i == 1); + const PrimRef& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get<Instance>(geomID); + new (this) InstancePrimitive(instance, geomID); + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) + { + assert(end-i == 1); + const PrimRef& prim = 
prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get<Instance>(geomID); + new (this) InstancePrimitive(instance,geomID); + return instance->linearBounds(0,itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) + { + assert(end-i == 1); + const PrimRefMB& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get<Instance>(geomID); + new (this) InstancePrimitive(instance,geomID); + return instance->linearBounds(0,time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(Instance* instance) { + return instance->bounds(0); + } + + public: + const Instance* instance; + const unsigned int instID_ = std::numeric_limits<unsigned int>::max (); + }; +} diff --git a/thirdparty/embree/kernels/geometry/instance_intersector.h b/thirdparty/embree/kernels/geometry/instance_intersector.h new file mode 100644 index 0000000000..28a7b728e5 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/instance_intersector.h @@ -0,0 +1,84 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "instance.h" +#include "../common/ray.h" +#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + struct InstanceIntersector1 + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); + static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); + static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); + }; + + struct InstanceIntersector1MB + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); + static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); + static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); + }; + + template<int K> + struct InstanceIntersectorK + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} + }; + + static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim); + static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim); + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); + return ray.tfar[k] < 0.0f; + } + }; + + template<int K> + struct InstanceIntersectorKMB + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} + }; + + static void intersect(const vbool<K>& 
valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim); + static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim); + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); + return ray.tfar[k] < 0.0f; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/intersector_epilog.h b/thirdparty/embree/kernels/geometry/intersector_epilog.h new file mode 100644 index 0000000000..7bf134cc54 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/intersector_epilog.h @@ -0,0 +1,979 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/context.h" +#include "filter.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct UVIdentity { + __forceinline void operator() (vfloat<M>& u, vfloat<M>& v, Vec3vf<M>& Ng) const {} + }; + + + template<bool filter> + struct Intersect1Epilog1 + { + RayHit& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1Epilog1(RayHit& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar; + ray.tfar = hit.t; + bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + return found; + } + } +#endif + + /* update hit information */ + ray.tfar = hit.t; + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = primID; + ray.geomID = geomID; + instance_id_stack::copy_UU(context->user->instID, ray.instID); + return true; + } + }; + + template<bool filter> + struct Occluded1Epilog1 + { + Ray& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1Epilog1(Ray& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { + HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const 
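The single-ray entry points of the K-wide instance intersectors simply re-enter the packet variant with only lane k active (mask `1<<k`) and read the result back from the ray, a negative `tfar` marking an occluded lane. A toy sketch of that pattern, using a simplified `RayPacket` and an invented occlusion criterion purely for illustration:

```
#include <cstdio>

const int K = 4;                         // packet width

struct RayPacket {
    float tfar[K];                       // per-lane ray length; negative once occluded
};

// Packet occluded test: marks occluded lanes by writing a negative tfar,
// the convention the single-lane wrappers rely on.
static void occludedPacket(int mask, RayPacket& ray) {
    for (int i = 0; i < K; i++)
        if (mask & (1 << i))
            if (ray.tfar[i] > 2.0f)      // toy criterion standing in for the real test
                ray.tfar[i] = -1.0f;
}

// Single-lane wrapper: activate only lane k, then read the result from tfar[k].
static bool occludedLane(RayPacket& ray, int k) {
    occludedPacket(1 << k, ray);
    return ray.tfar[k] < 0.0f;
}

int main() {
    RayPacket ray = {{1.0f, 3.0f, 5.0f, 0.5f}};
    for (int k = 0; k < K; k++)
        std::printf("lane %d occluded: %d\n", k, (int)occludedLane(ray, k));
    return 0;
}
```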
float old_t = ray.tfar; + ray.tfar = hit.t; + const bool found = runOcclusionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + return found; + } + } +#endif + return true; + } + }; + + template<int K, bool filter> + struct Intersect1KEpilog1 + { + RayHitK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t; + const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + return found; + } + } +#endif + + /* update hit information */ + ray.tfar[k] = hit.t; + ray.Ng.x[k] = hit.Ng.x; + ray.Ng.y[k] = hit.Ng.y; + ray.Ng.z[k] = hit.Ng.z; + ray.u[k] = hit.u; + ray.v[k] = hit.v; + ray.primID[k] = primID; + ray.geomID[k] = geomID; + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k); + return true; + } + }; + + template<int K, bool filter> + struct Occluded1KEpilog1 + { + RayK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { + hit.finalize(); + HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t; + const bool found = any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + return found; + } + } +#endif + return true; + } + }; + + template<int M, bool filter> + struct Intersect1EpilogM + { + RayHit& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Intersect1EpilogM(RayHit& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + vbool<M> valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + unsigned int geomID = geomIDs[i]; + + /* intersection filter test */ +#if 
defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + bool foundhit = false; + goto entry; + while (true) + { + if (unlikely(none(valid))) return foundhit; + i = select_min(valid,hit.vt); + + geomID = geomIDs[i]; + entry: + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + continue; + } + } +#endif + break; + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + ray.tfar = hit.vt[i]; + ray.Ng.x = hit.vNg.x[i]; + ray.Ng.y = hit.vNg.y[i]; + ray.Ng.z = hit.vNg.z[i]; + ray.u = uv.x; + ray.v = uv.y; + ray.primID = primIDs[i]; + ray.geomID = geomID; + instance_id_stack::copy_UU(context->user->instID, ray.instID); + return true; + + } + }; + + template<int M, bool filter> + struct Occluded1EpilogM + { + Ray& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Occluded1EpilogM(Ray& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + if (unlikely(filter)) + hit.finalize(); /* called only once */ + + vbool<M> valid = valid_i; + size_t m=movemask(valid); + goto entry; + while (true) + { + if (unlikely(m == 0)) return false; + entry: + size_t i=bsf(m); + + const unsigned int geomID = geomIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + m=btc(m,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* if we have no filter then the test passed */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + if (runOcclusionFilter1(geometry,ray,context,h)) return true; + ray.tfar = old_t; + m=btc(m,i); + continue; + } + } +#endif + break; + } +#endif + + return true; + } + }; + + template<int M, bool filter> + struct Intersect1EpilogMU + { + RayHit& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1EpilogMU(RayHit& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = 
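Each epilog struct above follows the same shape: the primitive test produces a hit, and the epilog decides whether to accept it (ray mask, filter callbacks) and, for the intersect variants, commits it into the ray. A stripped-down sketch of that functor pattern with simplified `Ray`/`Hit` types; none of this is Embree's real API, it only mirrors the structure:

```
#include <cstdio>

struct Hit { float t, u, v; };          // result of a primitive test
struct Ray {
    float tnear = 0.0f, tfar = 1e30f;   // current ray interval
    float u = 0, v = 0;
    unsigned geomID = ~0u, primID = ~0u;
};

// Simplified "Intersect1Epilog1": commits an accepted hit into the ray.
struct IntersectEpilog {
    Ray& ray;
    unsigned geomID, primID;
    bool operator()(const Hit& hit) const {
        ray.tfar   = hit.t;             // shrink the ray interval
        ray.u      = hit.u;
        ray.v      = hit.v;
        ray.geomID = geomID;
        ray.primID = primID;
        return true;
    }
};

// Simplified "Occluded1Epilog1": any accepted hit terminates the ray.
struct OccludedEpilog {
    bool operator()(const Hit&) const { return true; }
};

// A primitive test that is generic over the epilog, like the intersectors above.
template<typename Epilog>
bool testPrimitive(const Ray& ray, const Epilog& epilog) {
    const Hit hit{0.5f, 0.25f, 0.25f};              // pretend the test found a hit at t=0.5
    if (hit.t < ray.tnear || hit.t > ray.tfar) return false;
    return epilog(hit);                             // epilog decides accept/commit
}

int main() {
    Ray ray;
    const bool found = testPrimitive(ray, IntersectEpilog{ray, /*geomID*/7, /*primID*/3});
    std::printf("found=%d tfar=%.2f geomID=%u primID=%u\n", found, ray.tfar, ray.geomID, ray.primID);
    const bool occluded = testPrimitive(ray, OccludedEpilog{});
    std::printf("occluded=%d\n", occluded);
    return 0;
}
```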
scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + + vbool<M> valid = valid_i; + hit.finalize(); + + size_t i = select_min(valid,hit.vt); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) + { + bool foundhit = false; + while (true) + { + /* call intersection filter function */ + Vec2f uv = hit.uv(i); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + if (unlikely(none(valid))) break; + i = select_min(valid,hit.vt); + } + return foundhit; + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + const Vec3fa Ng = hit.Ng(i); + ray.tfar = hit.t(i); + ray.Ng.x = Ng.x; + ray.Ng.y = Ng.y; + ray.Ng.z = Ng.z; + ray.u = uv.x; + ray.v = uv.y; + ray.primID = primID; + ray.geomID = geomID; + instance_id_stack::copy_UU(context->user->instID, ray.instID); + return true; + } + }; + + template<int M, bool filter> + struct Occluded1EpilogMU + { + Ray& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1EpilogMU(Ray& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid, Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + hit.finalize(); + for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + if (runOcclusionFilter1(geometry,ray,context,h)) return true; + ray.tfar = old_t; + } + return false; + } +#endif + return true; + } + }; + + template<int M, int K, bool filter> + struct IntersectKEpilogM + { + RayHitK<K>& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + const size_t i; + + __forceinline IntersectKEpilogM(RayHitK<K>& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs, + size_t i) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + + vfloat<K> u, v, t; + Vec3vf<K> Ng; + vbool<K> valid = valid_i; + + std::tie(u,v,t,Ng) = hit(); + + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + /* ray masking test */ +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* occlusion filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if 
(unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h); + ray.tfar = select(m_accept,ray.tfar,old_t); + return m_accept; + } + } +#endif + + /* update hit information */ + vfloat<K>::store(valid,&ray.tfar,t); + vfloat<K>::store(valid,&ray.Ng.x,Ng.x); + vfloat<K>::store(valid,&ray.Ng.y,Ng.y); + vfloat<K>::store(valid,&ray.Ng.z,Ng.z); + vfloat<K>::store(valid,&ray.u,u); + vfloat<K>::store(valid,&ray.v,v); + vuint<K>::store(valid,&ray.primID,primID); + vuint<K>::store(valid,&ray.geomID,geomID); + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid); + return valid; + } + }; + + template<int M, int K, bool filter> + struct OccludedKEpilogM + { + vbool<K>& valid0; + RayK<K>& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + const size_t i; + + __forceinline OccludedKEpilogM(vbool<K>& valid0, + RayK<K>& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs, + size_t i) + : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const + { + vbool<K> valid = valid_i; + + /* ray masking test */ + Scene* scene MAYBE_UNUSED = context->scene; + const unsigned int geomID MAYBE_UNUSED = geomIDs[i]; + const unsigned int primID MAYBE_UNUSED = primIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return valid; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + vfloat<K> u, v, t; + Vec3vf<K> Ng; + std::tie(u,v,t,Ng) = hit(); + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + valid = runOcclusionFilter(valid,geometry,ray,context,h); + ray.tfar = select(valid,ray.tfar,old_t); + } + } +#endif + + /* update occlusion */ + valid0 = valid0 & !valid; + return valid; + } + }; + + template<int M, int K, bool filter> + struct IntersectKEpilogMU + { + RayHitK<K>& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline IntersectKEpilogMU(RayHitK<K>& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_org, const Hit& hit) const + { + vbool<K> valid = valid_org; + vfloat<K> u, v, t; + Vec3vf<K> Ng; + std::tie(u,v,t,Ng) = hit(); + + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + /* ray masking test */ +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + const vbool<K> m_accept = 
runIntersectionFilter(valid,geometry,ray,context,h); + ray.tfar = select(m_accept,ray.tfar,old_t); + return m_accept; + } + } +#endif + + /* update hit information */ + vfloat<K>::store(valid,&ray.tfar,t); + vfloat<K>::store(valid,&ray.Ng.x,Ng.x); + vfloat<K>::store(valid,&ray.Ng.y,Ng.y); + vfloat<K>::store(valid,&ray.Ng.z,Ng.z); + vfloat<K>::store(valid,&ray.u,u); + vfloat<K>::store(valid,&ray.v,v); + vuint<K>::store(valid,&ray.primID,primID); + vuint<K>::store(valid,&ray.geomID,geomID); + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid); + return valid; + } + }; + + template<int M, int K, bool filter> + struct OccludedKEpilogMU + { + vbool<K>& valid0; + RayK<K>& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline OccludedKEpilogMU(vbool<K>& valid0, + RayK<K>& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const + { + vbool<K> valid = valid_i; + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* occlusion filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + vfloat<K> u, v, t; + Vec3vf<K> Ng; + std::tie(u,v,t,Ng) = hit(); + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + valid = runOcclusionFilter(valid,geometry,ray,context,h); + ray.tfar = select(valid,ray.tfar,old_t); + } + } +#endif + + /* update occlusion */ + valid0 = valid0 & !valid; + return valid; + } + }; + + template<int M, int K, bool filter> + struct Intersect1KEpilogM + { + RayHitK<K>& ray; + size_t k; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + vbool<M> valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + assert(i<M); + unsigned int geomID = geomIDs[i]; + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + bool foundhit = false; + goto entry; + while (true) + { + if (unlikely(none(valid))) return foundhit; + i = select_min(valid,hit.vt); + assert(i<M); + geomID = geomIDs[i]; + entry: + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask[k]) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + assert(i<M); + const Vec2f uv = hit.uv(i); + HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + const bool found = 
any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + foundhit = foundhit | found; + clear(valid,i); + valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value + continue; + } + } +#endif + break; + } +#endif + assert(i<M); + /* update hit information */ + const Vec2f uv = hit.uv(i); + ray.tfar[k] = hit.t(i); + ray.Ng.x[k] = hit.vNg.x[i]; + ray.Ng.y[k] = hit.vNg.y[i]; + ray.Ng.z[k] = hit.vNg.z[i]; + ray.u[k] = uv.x; + ray.v[k] = uv.y; + ray.primID[k] = primIDs[i]; + ray.geomID[k] = geomID; + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k); + return true; + } + }; + + template<int M, int K, bool filter> + struct Occluded1KEpilogM + { + RayK<K>& ray; + size_t k; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + if (unlikely(filter)) + hit.finalize(); /* called only once */ + + vbool<M> valid = valid_i; + size_t m=movemask(valid); + goto entry; + while (true) + { + if (unlikely(m == 0)) return false; + entry: + size_t i=bsf(m); + + const unsigned int geomID = geomIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask[k]) == 0) { + m=btc(m,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* execute occlusion filer */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true; + ray.tfar[k] = old_t; + m=btc(m,i); + continue; + } + } +#endif + break; + } +#endif + return true; + } + }; + + template<int M, int K, bool filter> + struct Intersect1KEpilogMU + { + RayHitK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + /* ray mask test */ + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* finalize hit calculation */ + vbool<M> valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) + { + bool foundhit = false; + while (true) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + const 
bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + foundhit = foundhit | found; + clear(valid,i); + valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value + if (unlikely(none(valid))) break; + i = select_min(valid,hit.vt); + } + return foundhit; + } + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + const Vec3fa Ng = hit.Ng(i); + ray.tfar[k] = hit.t(i); + ray.Ng.x[k] = Ng.x; + ray.Ng.y[k] = Ng.y; + ray.Ng.z[k] = Ng.z; + ray.u[k] = uv.x; + ray.v[k] = uv.y; + ray.primID[k] = primID; + ray.geomID[k] = geomID; + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k); + return true; + } + }; + + template<int M, int K, bool filter> + struct Occluded1KEpilogMU + { + RayK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + /* ray mask test */ + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + hit.finalize(); + for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true; + ray.tfar[k] = old_t; + } + return false; + } + } +#endif + return true; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/intersector_iterators.h b/thirdparty/embree/kernels/geometry/intersector_iterators.h new file mode 100644 index 0000000000..9cac1cd25c --- /dev/null +++ b/thirdparty/embree/kernels/geometry/intersector_iterators.h @@ -0,0 +1,172 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/ray.h" +#include "../common/point_query.h" +#include "../bvh/node_intersector1.h" +#include "../bvh/node_intersector_packet.h" + +namespace embree +{ + namespace isa + { + template<typename Intersector> + struct ArrayIntersector1 + { + typedef typename Intersector::Primitive Primitive; + typedef typename Intersector::Precalculations Precalculations; + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) + Intersector::intersect(pre,ray,context,prim[i]); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + if (Intersector::occluded(pre,ray,context,prim[i])) + return true; + } + return false; + } + + 
template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0; i<num; i++) + changed |= Intersector::pointQuery(query, context, prim[i]); + return changed; + } + + template<int K> + static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + } + + template<int K> + static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + return valid; + } + }; + + template<int K, typename Intersector> + struct ArrayIntersectorK_1 + { + typedef typename Intersector::Primitive Primitive; + typedef typename Intersector::Precalculations Precalculations; + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + Intersector::intersect(valid,pre,ray,context,prim[i]); + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + vbool<K> valid0 = valid; + for (size_t i=0; i<num; i++) { + valid0 &= !Intersector::occluded(valid0,pre,ray,context,prim[i]); + if (none(valid0)) break; + } + return !valid0; + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + Intersector::intersect(pre,ray,k,context,prim[i]); + } + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + if (Intersector::occluded(pre,ray,k,context,prim[i])) + return true; + } + return false; + } + }; + + // ============================================================================================= + + template<int K, typename IntersectorK> + struct ArrayIntersectorKStream + { + typedef typename IntersectorK::Primitive PrimitiveK; + typedef typename IntersectorK::Precalculations PrecalculationsK; + + static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(valid,ray); // FIXME: might cause trouble + + for (size_t i=0; i<num; i++) { + IntersectorK::intersect(valid,pre,ray,context,prim[i]); + } + } + + static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK 
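The array intersectors iterate over the primitives stored in a leaf: occlusion queries can return as soon as any primitive reports a hit, while intersection queries must visit every primitive because a closer hit may still follow. A minimal sketch of that control flow over a toy `Intersector` (all types here are invented for the example):

```
#include <cstddef>
#include <cstdio>

struct Ray { float tfar = 1e30f; };

// A toy "primitive intersector": each primitive is just a hit distance.
struct PlaneIntersector {
    typedef float Primitive;                      // distance of a hit, or <0 for a miss
    static bool intersect(Ray& ray, const Primitive& prim) {
        if (prim < 0.0f || prim > ray.tfar) return false;
        ray.tfar = prim;                          // keep the closest hit
        return true;
    }
    static bool occluded(const Ray& ray, const Primitive& prim) {
        return prim >= 0.0f && prim <= ray.tfar;  // any hit in range occludes
    }
};

// Same looping structure as ArrayIntersector1.
template<typename Intersector>
struct ArrayIntersector {
    typedef typename Intersector::Primitive Primitive;
    static void intersect(Ray& ray, const Primitive* prim, size_t num) {
        for (size_t i = 0; i < num; i++)
            Intersector::intersect(ray, prim[i]); // visit all: a closer hit may follow
    }
    static bool occluded(const Ray& ray, const Primitive* prim, size_t num) {
        for (size_t i = 0; i < num; i++)
            if (Intersector::occluded(ray, prim[i]))
                return true;                      // early out on the first hit
        return false;
    }
};

int main() {
    const float prims[4] = {-1.0f, 7.0f, 3.0f, 5.0f};
    Ray ray;
    ArrayIntersector<PlaneIntersector>::intersect(ray, prims, 4);
    std::printf("closest t = %.1f, occluded = %d\n", ray.tfar,
                (int)ArrayIntersector<PlaneIntersector>::occluded(ray, prims, 4));
    return 0;
}
```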
pre(valid,ray); // FIXME: might cause trouble + vbool<K> valid0 = valid; + for (size_t i=0; i<num; i++) { + valid0 &= !IntersectorK::occluded(valid0,pre,ray,context,prim[i]); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + for (size_t i=0; i<num; i++) { + IntersectorK::intersect(pre,ray,k,context,prim[i]); + } + } + + static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + for (size_t i=0; i<num; i++) { + if (IntersectorK::occluded(pre,ray,k,context,prim[i])) + return true; + } + return false; + } + + static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + size_t m_occluded = 0; + for (size_t i=0; i<num; i++) { + size_t bits = cur_mask & (~m_occluded); + for (; bits!=0; ) + { + const size_t rayID = bscf(bits); + RayHitK<K> &ray = *inputPackets[rayID / K]; + const size_t k = rayID % K; + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + if (IntersectorK::occluded(pre,ray,k,context,prim[i])) + { + m_occluded |= (size_t)1 << rayID; + ray.tfar[k] = neg_inf; + } + } + } + return m_occluded; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/line_intersector.h b/thirdparty/embree/kernels/geometry/line_intersector.h new file mode 100644 index 0000000000..41096d8794 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/line_intersector.h @@ -0,0 +1,145 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct LineIntersectorHitM + { + __forceinline LineIntersectorHitM() {} + + __forceinline LineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct FlatLinearCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Ray, typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Epilog& epilog) + { + /* transform end points into ray space */ + vbool<M> valid = valid_i; + vfloat<M> depth_scale = pre.depth_scale; + LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space; + + const Vec3vf<M> ray_org 
((Vec3fa)ray.org); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + + Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); + Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); + + /* approximative intersection with cone */ + const Vec4vf<M> v = p1-p0; + const Vec4vf<M> w = -p0; + const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); + const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); + const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); + const Vec4vf<M> p = madd(u,v,p0); + const vfloat<M> t = p.z; + const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); + const vfloat<M> r = p.w; + const vfloat<M> r2 = r*r; + valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections + if (unlikely(none(valid))) return false; + + /* ignore denormalized segments */ + const Vec3vf<M> T = v1.xyz()-v0.xyz(); + valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero)); + if (unlikely(none(valid))) return false; + + /* update hit information */ + LineIntersectorHitM<M> hit(u,zero,t,T); + return epilog(valid,hit); + } + }; + + template<int M, int K> + struct FlatLinearCurveIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Epilog& epilog) + { + /* transform end points into ray space */ + vbool<M> valid = valid_i; + vfloat<M> depth_scale = pre.depth_scale[k]; + LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space[k]; + const Vec3vf<M> ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + + Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); + Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); + + /* approximative intersection with cone */ + const Vec4vf<M> v = p1-p0; + const Vec4vf<M> w = -p0; + const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); + const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); + const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); + const Vec4vf<M> p = madd(u,v,p0); + const vfloat<M> t = p.z; + const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); + const vfloat<M> r = p.w; + const vfloat<M> r2 = r*r; + valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k])); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections + if (unlikely(none(valid))) return false; + + /* ignore denormalized segments */ + const Vec3vf<M> T = v1.xyz()-v0.xyz(); + valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero)); + if (unlikely(none(valid))) return false; + + /* update hit information */ + LineIntersectorHitM<M> hit(u,zero,t,T); + return epilog(valid,hit); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/linei.h b/thirdparty/embree/kernels/geometry/linei.h new file mode 100644 index 
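The flat linear curve test works in ray space: both endpoints are transformed so the ray becomes the +z axis, the closest parameter u on the projected 2D segment to the origin is clamped to [0,1], and the candidate is accepted when that point lies within the interpolated radius and the ray interval. A scalar approximation of the same math, assuming the endpoints are already given in that ray space (`intersectFlatSegment` is an invented helper name):

```
#include <algorithm>
#include <cstdio>

struct Vec4 { float x, y, z, w; };   // xyz = position in ray space, w = radius

// Approximate intersection of the +z ray (origin at 0) with a "flat" round
// segment p0-p1, following the same steps as the curve intersector above.
static bool intersectFlatSegment(const Vec4& p0, const Vec4& p1,
                                 float tnear, float tfar,
                                 float& u_out, float& t_out) {
    const float vx = p1.x - p0.x, vy = p1.y - p0.y;
    const float vz = p1.z - p0.z, vw = p1.w - p0.w;
    const float wx = -p0.x, wy = -p0.y;
    const float d0 = wx * vx + wy * vy;            // project ray origin onto the 2D segment
    const float d1 = vx * vx + vy * vy;
    if (d1 <= 0.0f) return false;                  // degenerate segment
    const float u  = std::min(std::max(d0 / d1, 0.0f), 1.0f);
    const float px = p0.x + u * vx;                // closest point on the segment (in 2D)
    const float py = p0.y + u * vy;
    const float t  = p0.z + u * vz;                // hit distance along the ray (= z)
    const float r  = p0.w + u * vw;                // interpolated radius
    const float d2 = px * px + py * py;            // squared distance to the ray axis
    if (d2 > r * r) return false;                  // outside the capped cone
    if (t < tnear || t > tfar) return false;       // outside the ray interval
    u_out = u; t_out = t;
    return true;
}

int main() {
    // Segment crossing the ray axis at z=5 with radius 0.5.
    const Vec4 p0{-1.0f, 0.0f, 5.0f, 0.5f};
    const Vec4 p1{ 1.0f, 0.0f, 5.0f, 0.5f};
    float u, t;
    if (intersectFlatSegment(p0, p1, 0.0f, 100.0f, u, t))
        std::printf("hit: u=%.2f t=%.2f\n", u, t);
    return 0;
}
```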
0000000000..3ee70ac012 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/linei.h @@ -0,0 +1,705 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + template<int M> + struct LineMi + { + /* Virtual interface to query information about the line segment type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored line segments */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N line segments */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + /* Returns required number of bytes for N line segments */ + static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); } + + public: + + /* Default constructor */ + __forceinline LineMi() { } + + /* Construction from vertices and IDs */ + __forceinline LineMi(const vuint<M>& v0, unsigned short leftExists, unsigned short rightExists, const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype) + : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint<M>(primIDs) != vuint<M>(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs) + { + assert(all(vuint<M>(geomID()) == geomIDs)); + } + + /* Returns a mask that tells which line segments are valid */ + __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); } + + /* Returns if the specified line segment is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; } + + /* Returns the number of stored line segments */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + //template<class T> + //static __forceinline T unmask(T &index) { return index & 0x3fffffff; } + + __forceinline unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; } + //__forceinline vuint<M> geomID() { return unmask(geomIDs); } + //__forceinline const vuint<M> geomID() const { return unmask(geomIDs); } + //__forceinline unsigned int geomID(const size_t i) const { assert(i<M); return unmask(geomIDs[i]); } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* gather the line segments */ + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, + Vec4vf<M>& p1, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + const LineSegments* geom, + float time) const; + + /* gather the line segments with lateral info */ + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + Vec4vf<M>& pL, + Vec4vf<M>& pR, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, + Vec4vf<M>& p1, + Vec4vf<M>& pL, + Vec4vf<M>& pR, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + Vec4vf<M>& 
pL, + Vec4vf<M>& pR, + const LineSegments* geom, + float time) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + vbool<M>& cL, + vbool<M>& cR, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, + Vec4vf<M>& p1, + vbool<M>& cL, + vbool<M>& cR, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + vbool<M>& cL, + vbool<M>& cR, + const LineSegments* geom, + float time) const; + + /* Calculate the bounds of the line segments */ + __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const LineSegments* geom = scene->get<LineSegments>(geomID(i)); + const Vec3ff& p0 = geom->vertex(v0[i]+0,itime); + const Vec3ff& p1 = geom->vertex(v0[i]+1,itime); + BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); + b = enlarge(b,Vec3fa(max(p0.w,p1.w))); + bounds.extend(b); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { + return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const LineSegments* geom = scene->get<LineSegments>(geomID(i)); + allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const LineSegments* geom = scene->get<LineSegments>(geomID((unsigned int)i)); + allBounds.extend(geom->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill line segment from line segment list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); + vuint<M> geomID, primID; + vuint<M> v0; + unsigned short leftExists = 0; + unsigned short rightExists = 0; + const PrimRefT* prim = &prims[begin]; + + for (size_t i=0; i<M; i++) + { + const LineSegments* geom = scene->get<LineSegments>(prim->geomID()); + if (begin<end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); + v0[i] = geom->segment(prim->primID()); + leftExists |= geom->segmentLeftExists(primID[i]) << i; + rightExists |= geom->segmentRightExists(primID[i]) << i; + begin++; + } else { + assert(i); + if (i>0) { + geomID[i] = geomID[i-1]; + primID[i] = -1; + v0[i] = v0[i-1]; + } + } + if (begin<end) prim = &prims[begin]; // FIXME: remove this line + } + new (this) LineMi(v0,leftExists,rightExists,geomID,primID,gty); // FIXME: use non temporal store + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = LineMi::blocks(set.size()); + size_t numbytes = LineMi::bytes(set.size()); + LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + 
fill(prims,begin,end,scene); + return linearBounds(scene,itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims,begin,end,scene); + return linearBounds(scene,time_range); + } + + template<typename BVH, typename SetMB, typename Allocator> + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.begin(); + size_t end = prims.end(); + size_t items = LineMi::blocks(prims.size()); + size_t numbytes = LineMi::bytes(prims.size()); + LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range)); + + return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); + }; + + /* Updates the primitive */ + __forceinline BBox3fa update(LineSegments* geom) + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const Vec3ff& p0 = geom->vertex(v0[i]+0); + const Vec3ff& p1 = geom->vertex(v0[i]+1); + BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); + b = enlarge(b,Vec3fa(max(p0.w,p1.w))); + bounds.extend(b); + } + return bounds; + } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) { + return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; + } + + public: + unsigned char gtype; + unsigned char m; + unsigned int sharedGeomID; + unsigned short leftExists, rightExists; + vuint<M> v0; // index of start vertex + private: + vuint<M> primIDs; // primitive ID + }; + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1; + 
gatheri(a0,a1,geom,itime); + Vec4vf4 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom) const + { + gather(p0,p1,geom); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom, + const int itime) const + { + gatheri(p0,p1,geom,itime); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf4 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); + transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? 
vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); + transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); + transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); + transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1,aL,aR; + gatheri(a0,a1,aL,aR,geom,itime); + Vec4vf4 b0,b1,bL,bR; + gatheri(b0,b1,bL,bR,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + pL = lerp(aL,bL,vfloat4(ftime)); + pR = lerp(aR,bR,vfloat4(ftime)); + } + +#if defined(__AVX__) + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); + const vfloat4 b7 = 
vfloat4::loadu(geom->vertexPtr(v0[7]+1)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf8 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); + const vfloat4 l4 = (leftExists & (1<<4)) ? 
vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf); + const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf); + const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf); + const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf); + transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); + const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf); + const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf); + const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf); + const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf); + transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); + const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf); + const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf); + const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf); + const vfloat4 l7 = (leftExists & (1<<7)) ? 
vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf); + transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); + const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf); + const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf); + const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf); + const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf); + transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1,aL,aR; + gatheri(a0,a1,aL,aR,geom,itime); + Vec4vf8 b0,b1,bL,bR; + gatheri(b0,b1,bL,bR,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + pL = lerp(aL,bL,vfloat8(ftime)); + pR = lerp(aR,bR,vfloat8(ftime)); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom) const + { + gather(p0,p1,geom); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom, + const int itime) const + { + gatheri(p0,p1,geom,itime); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf8 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + +#endif + + template<int M> + typename LineMi<M>::Type LineMi<M>::type; + + typedef LineMi<4> Line4i; + typedef LineMi<8> Line8i; +} diff --git a/thirdparty/embree/kernels/geometry/linei_intersector.h b/thirdparty/embree/kernels/geometry/linei_intersector.h new file mode 100644 index 0000000000..5992827f5b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/linei_intersector.h @@ -0,0 +1,124 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linei.h" +#include "line_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct FlatLinearCurveMiIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; 
line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, bool filter> + struct FlatLinearCurveMiMBIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, int K, bool filter> + struct FlatLinearCurveMiIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + + template<int M, int K, bool filter> + struct FlatLinearCurveMiMBIntersectorK + { + typedef LineMi<M> Primitive; + 
typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/object.h b/thirdparty/embree/kernels/geometry/object.h new file mode 100644 index 0000000000..2a61829ffd --- /dev/null +++ b/thirdparty/embree/kernels/geometry/object.h @@ -0,0 +1,84 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + struct Object + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return N; } + + public: + + /*! constructs a virtual object */ + Object (unsigned geomID, unsigned primID) + : _geomID(geomID), _primID(primID) {} + + __forceinline unsigned geomID() const { + return _geomID; + } + + __forceinline unsigned primID() const { + return _primID; + } + + /*! fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) + { + const PrimRef& prim = prims[i]; i++; + new (this) Object(prim.geomID(), prim.primID()); + } + + /*! fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) + { + const PrimRef& prim = prims[i]; i++; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + new (this) Object(geomID, primID); + AccelSet* accel = (AccelSet*) scene->get(geomID); + return accel->linearBounds(primID,itime); + } + + /*! 
fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) + { + const PrimRefMB& prim = prims[i]; i++; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + new (this) Object(geomID, primID); + AccelSet* accel = (AccelSet*) scene->get(geomID); + return accel->linearBounds(primID,time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(AccelSet* mesh) { + return mesh->bounds(primID()); + } + + private: + unsigned int _geomID; //!< geometry ID + unsigned int _primID; //!< primitive ID + }; +} diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h new file mode 100644 index 0000000000..11ceb2f7fe --- /dev/null +++ b/thirdparty/embree/kernels/geometry/object_intersector.h @@ -0,0 +1,127 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "object.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template<bool mblur> + struct ObjectIntersector1 + { + typedef Object Primitive; + + static const bool validIntersectorK = false; + + struct Precalculations { + __forceinline Precalculations() {} + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + if ((ray.mask & accel->mask) == 0) + return; +#endif + + accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + if ((ray.mask & accel->mask) == 0) + return false; +#endif + + accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + return ray.tfar < 0.0f; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID()); + context->geomID = prim.geomID(); + context->primID = prim.primID(); + return accel->pointQuery(query, context); + } + + template<int K> + static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(false); + } + + template<int K> + static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(false); + return valid; + } + }; + + template<int K, bool mblur> + struct ObjectIntersectorK + { + typedef Object Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} + }; + + static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim) + { + vbool<K> valid = valid_i; + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + valid &= (ray.mask 
& accel->mask) != 0; + if (none(valid)) return; +#endif + accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1); + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim) + { + vbool<K> valid = valid_i; + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + valid &= (ray.mask & accel->mask) != 0; + if (none(valid)) return false; +#endif + accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + return ray.tfar < 0.0f; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); + return ray.tfar[k] < 0.0f; + } + }; + + typedef ObjectIntersectorK<4,false> ObjectIntersector4; + typedef ObjectIntersectorK<8,false> ObjectIntersector8; + typedef ObjectIntersectorK<16,false> ObjectIntersector16; + + typedef ObjectIntersectorK<4,true> ObjectIntersector4MB; + typedef ObjectIntersectorK<8,true> ObjectIntersector8MB; + typedef ObjectIntersectorK<16,true> ObjectIntersector16MB; + } +} diff --git a/thirdparty/embree/kernels/geometry/plane.h b/thirdparty/embree/kernels/geometry/plane.h new file mode 100644 index 0000000000..e447122eab --- /dev/null +++ b/thirdparty/embree/kernels/geometry/plane.h @@ -0,0 +1,57 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct HalfPlane + { + const Vec3fa P; //!< plane origin + const Vec3fa N; //!< plane normal + + __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N) + : P(P), N(N) {} + + __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const + { + Vec3fa O = Vec3fa(ray_org) - P; + Vec3fa D = Vec3fa(ray_dir); + float ON = dot(O,N); + float DN = dot(D,N); + bool eps = abs(DN) < min_rcp_input; + float t = -ON*rcp(DN); + float lower = select(eps || DN < 0.0f, float(neg_inf), t); + float upper = select(eps || DN > 0.0f, float(pos_inf), t); + return BBox1f(lower,upper); + } + }; + + template<int M> + struct HalfPlaneN + { + const Vec3vf<M> P; //!< plane origin + const Vec3vf<M> N; //!< plane normal + + __forceinline HalfPlaneN(const Vec3vf<M>& P, const Vec3vf<M>& N) + : P(P), N(N) {} + + __forceinline BBox<vfloat<M>> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const + { + Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray_org) - P; + Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray_dir); + vfloat<M> ON = dot(O,N); + vfloat<M> DN = dot(D,N); + vbool<M> eps = abs(DN) < min_rcp_input; + vfloat<M> t = -ON*rcp(DN); + vfloat<M> lower = select(eps | DN < 0.0f, vfloat<M>(neg_inf), t); + vfloat<M> upper = select(eps | DN > 0.0f, vfloat<M>(pos_inf), t); + return BBox<vfloat<M>>(lower,upper); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/pointi.h b/thirdparty/embree/kernels/geometry/pointi.h new file mode 100644 index 0000000000..bed04116b0 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/pointi.h @@ -0,0 +1,412 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace 
embree +{ + template<int M> + struct PointMi + { + /* Virtual interface to query information about the line segment type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored line segments */ + static __forceinline size_t max_size() + { + return M; + } + + /* Returns required number of primitive blocks for N line segments */ + static __forceinline size_t blocks(size_t N) + { + return (N + max_size() - 1) / max_size(); + } + + /* Returns required number of bytes for N line segments */ + static __forceinline size_t bytes(size_t N) + { + return blocks(N) * sizeof(PointMi); + } + + public: + /* Default constructor */ + __forceinline PointMi() {} + + /* Construction from vertices and IDs */ + __forceinline PointMi(const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype, uint32_t numPrimitives) + : gtype((unsigned char)gtype), + numPrimitives(numPrimitives), + sharedGeomID(geomIDs[0]), + primIDs(primIDs) + { + assert(all(vuint<M>(geomID()) == geomIDs)); + } + + /* Returns a mask that tells which line segments are valid */ + __forceinline vbool<M> valid() const { + return vint<M>(step) < vint<M>(numPrimitives); + } + + /* Returns if the specified line segment is valid */ + __forceinline bool valid(const size_t i) const + { + assert(i < M); + return i < numPrimitives; + } + + /* Returns the number of stored line segments */ + __forceinline size_t size() const { + return numPrimitives; + } + + __forceinline unsigned int geomID(unsigned int i = 0) const { + return sharedGeomID; + } + + __forceinline vuint<M>& primID() { + return primIDs; + } + __forceinline const vuint<M>& primID() const { + return primIDs; + } + __forceinline unsigned int primID(const size_t i) const { + assert(i < M); + return primIDs[i]; + } + + /* gather the line segments */ + __forceinline void gather(Vec4vf<M>& p0, const Points* geom) const; + __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, const Points* geom, const int itime) const; + __forceinline void gatheri(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, const Points* geom, float time) const; + __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, float time) const; + + /* Calculate the bounds of the line segments */ + __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const + { + BBox3fa bounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get<Points>(geomID(i)); + bounds.extend(geom->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { + return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1)); + } + + __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get<Points>(geomID(i)); + allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa 
linearBounds(const Scene* const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get<Points>(geomID((unsigned int)i)); + allBounds.extend(geom->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill line segment from line segment list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); + vuint<M> geomID, primID; + vuint<M> v0; + const PrimRefT* prim = &prims[begin]; + + int numPrimitives = 0; + for (size_t i = 0; i < M; i++) { + if (begin < end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); + begin++; + numPrimitives++; + } else { + assert(i); + if (i > 0) { + geomID[i] = geomID[i - 1]; + primID[i] = primID[i - 1]; + } + } + if (begin < end) + prim = &prims[begin]; // FIXME: remove this line + } + new (this) PointMi(geomID, primID, gty, numPrimitives); // FIXME: use non temporal store + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh, + const PrimRef* prims, + const range<size_t>& set, + const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = PointMi::blocks(set.size()); + size_t numbytes = PointMi::bytes(set.size()); + PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); + for (size_t i = 0; i < items; i++) { + accel[i].fill(prims, start, set.end(), bvh->scene); + } + return bvh->encodeLeaf((char*)accel, items); + }; + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB( + const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + template<typename BVH, typename SetMB, typename Allocator> + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.object_range.begin(); + size_t end = prims.object_range.end(); + size_t items = PointMi::blocks(prims.object_range.size()); + size_t numbytes = PointMi::bytes(prims.object_range.size()); + PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items); + + LBBox3fa bounds = empty; + for (size_t i = 0; i < items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range)); + + return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range); + }; + + /*! 
output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& line) + { + return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; + } + + public: + unsigned char gtype; + unsigned char numPrimitives; + unsigned int sharedGeomID; + + private: + vuint<M> primIDs; // primitive ID + }; + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0))); + const vfloat4 b1 = vfloat4(geom->normal(primID(1))); + const vfloat4 b2 = vfloat4(geom->normal(primID(2))); + const vfloat4 b3 = vfloat4(geom->normal(primID(3))); + transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); + const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); + const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); + const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); + transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0; gatheri(a0, geom, itime); + Vec4vf4 b0; gatheri(b0, geom, itime + 1); + p0 = lerp(a0, b0, vfloat4(ftime)); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0, b0; + Vec3vf4 norm0, norm1; + gatheri(a0, norm0, geom, itime); + gatheri(b0, norm1, geom, itime + 1); + p0 = lerp(a0, b0, vfloat4(ftime)); + n0 = lerp(norm0, norm1, vfloat4(ftime)); + } + +#if defined(__AVX__) + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const + { + const vfloat4 a0 = 
vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0))); + const vfloat4 b1 = vfloat4(geom->normal(primID(1))); + const vfloat4 b2 = vfloat4(geom->normal(primID(2))); + const vfloat4 b3 = vfloat4(geom->normal(primID(3))); + const vfloat4 b4 = vfloat4(geom->normal(primID(4))); + const vfloat4 b5 = vfloat4(geom->normal(primID(5))); + const vfloat4 b6 = vfloat4(geom->normal(primID(6))); + const vfloat4 b7 = vfloat4(geom->normal(primID(7))); + transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); + const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); + const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); + const vfloat4 b3 = 
vfloat4(geom->normal(primID(3), itime)); + const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime)); + const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime)); + const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime)); + const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime)); + transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0; + gatheri(a0, geom, itime); + Vec4vf8 b0; + gatheri(b0, geom, itime + 1); + p0 = lerp(a0, b0, vfloat8(ftime)); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0, b0; + Vec3vf8 norm0, norm1; + gatheri(a0, norm0, geom, itime); + gatheri(b0, norm1, geom, itime + 1); + p0 = lerp(a0, b0, vfloat8(ftime)); + n0 = lerp(norm0, norm1, vfloat8(ftime)); + } +#endif + + template<int M> + typename PointMi<M>::Type PointMi<M>::type; + + typedef PointMi<4> Point4i; + typedef PointMi<8> Point8i; + +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/primitive.h b/thirdparty/embree/kernels/geometry/primitive.h new file mode 100644 index 0000000000..608d981dd7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/primitive.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene.h" +#include "../../common/simd/simd.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + struct PrimitiveType + { + /*! returns name of this primitive type */ + virtual const char* name() const = 0; + + /*! Returns the number of stored active primitives in a block. */ + virtual size_t sizeActive(const char* This) const = 0; + + /*! Returns the number of stored active and inactive primitives in a block. */ + virtual size_t sizeTotal(const char* This) const = 0; + + /*! Returns the number of bytes of block. 
*/ + virtual size_t getBytes(const char* This) const = 0; + }; + + template<typename Primitive> + struct PrimitivePointQuery1 + { + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) + { + bool changed = false; + for (size_t i = 0; i < Primitive::max_size(); i++) + { + if (!prim.valid(i)) break; + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i)); + context->geomID = prim.geomID(i); + context->primID = prim.primID(i); + changed |= accel->pointQuery(query, context); + } + return changed; + } + + static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { } + }; +} diff --git a/thirdparty/embree/kernels/geometry/primitive4.cpp b/thirdparty/embree/kernels/geometry/primitive4.cpp new file mode 100644 index 0000000000..9c953c5d35 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/primitive4.cpp @@ -0,0 +1,379 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primitive.h" +#include "curveNv.h" +#include "curveNi.h" +#include "curveNi_mb.h" +#include "linei.h" +#include "triangle.h" +#include "trianglev.h" +#include "trianglev_mb.h" +#include "trianglei.h" +#include "quadv.h" +#include "quadi.h" +#include "subdivpatch1.h" +#include "object.h" +#include "instance.h" +#include "subgrid.h" + +namespace embree +{ + /********************** Curve4v **************************/ + + template<> + const char* Curve4v::Type::name () const { + return "curve4v"; + } + + template<> + size_t Curve4v::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4v*)This)->N; + } + + template<> + size_t Curve4v::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4v*)This)->N; + } + + template<> + size_t Curve4v::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4v::bytes(sizeActive(This)); + } + + /********************** Curve4i **************************/ + + template<> + const char* Curve4i::Type::name () const { + return "curve4i"; + } + + template<> + size_t Curve4i::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4i*)This)->N; + } + + template<> + size_t Curve4i::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4i*)This)->N; + } + + template<> + size_t Curve4i::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4i::bytes(sizeActive(This)); + } + + /********************** Curve4iMB **************************/ + + template<> + const char* Curve4iMB::Type::name () const { + return "curve4imb"; + } + + template<> + size_t Curve4iMB::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4iMB*)This)->N; + } 
+ + template<> + size_t Curve4iMB::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4iMB*)This)->N; + } + + template<> + size_t Curve4iMB::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4iMB::bytes(sizeActive(This)); + } + + /********************** Line4i **************************/ + + template<> + const char* Line4i::Type::name () const { + return "line4i"; + } + + template<> + size_t Line4i::Type::sizeActive(const char* This) const { + return ((Line4i*)This)->size(); + } + + template<> + size_t Line4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Line4i::Type::getBytes(const char* This) const { + return sizeof(Line4i); + } + + /********************** Triangle4 **************************/ + + template<> + const char* Triangle4::Type::name () const { + return "triangle4"; + } + + template<> + size_t Triangle4::Type::sizeActive(const char* This) const { + return ((Triangle4*)This)->size(); + } + + template<> + size_t Triangle4::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4::Type::getBytes(const char* This) const { + return sizeof(Triangle4); + } + + /********************** Triangle4v **************************/ + + template<> + const char* Triangle4v::Type::name () const { + return "triangle4v"; + } + + template<> + size_t Triangle4v::Type::sizeActive(const char* This) const { + return ((Triangle4v*)This)->size(); + } + + template<> + size_t Triangle4v::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4v::Type::getBytes(const char* This) const { + return sizeof(Triangle4v); + } + + /********************** Triangle4i **************************/ + + template<> + const char* Triangle4i::Type::name () const { + return "triangle4i"; + } + + template<> + size_t Triangle4i::Type::sizeActive(const char* This) const { + return ((Triangle4i*)This)->size(); + } + + template<> + size_t Triangle4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4i::Type::getBytes(const char* This) const { + return sizeof(Triangle4i); + } + + /********************** Triangle4vMB **************************/ + + template<> + const char* Triangle4vMB::Type::name () const { + return "triangle4vmb"; + } + + template<> + size_t Triangle4vMB::Type::sizeActive(const char* This) const { + return ((Triangle4vMB*)This)->size(); + } + + template<> + size_t Triangle4vMB::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4vMB::Type::getBytes(const char* This) const { + return sizeof(Triangle4vMB); + } + + /********************** Quad4v **************************/ + + template<> + const char* Quad4v::Type::name () const { + return "quad4v"; + } + + template<> + size_t Quad4v::Type::sizeActive(const char* This) const { + return ((Quad4v*)This)->size(); + } + + template<> + size_t Quad4v::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Quad4v::Type::getBytes(const char* This) const { + return sizeof(Quad4v); + } + + /********************** Quad4i **************************/ + + template<> + const char* Quad4i::Type::name () const { + return "quad4i"; + } + + template<> + size_t Quad4i::Type::sizeActive(const char* This) const { + return 
((Quad4i*)This)->size(); + } + + template<> + size_t Quad4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Quad4i::Type::getBytes(const char* This) const { + return sizeof(Quad4i); + } + + /********************** SubdivPatch1 **************************/ + + const char* SubdivPatch1::Type::name () const { + return "subdivpatch1"; + } + + size_t SubdivPatch1::Type::sizeActive(const char* This) const { + return 1; + } + + size_t SubdivPatch1::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t SubdivPatch1::Type::getBytes(const char* This) const { + return sizeof(SubdivPatch1); + } + + SubdivPatch1::Type SubdivPatch1::type; + + /********************** Virtual Object **************************/ + + const char* Object::Type::name () const { + return "object"; + } + + size_t Object::Type::sizeActive(const char* This) const { + return 1; + } + + size_t Object::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t Object::Type::getBytes(const char* This) const { + return sizeof(Object); + } + + Object::Type Object::type; + + /********************** Instance **************************/ + + const char* InstancePrimitive::Type::name () const { + return "instance"; + } + + size_t InstancePrimitive::Type::sizeActive(const char* This) const { + return 1; + } + + size_t InstancePrimitive::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t InstancePrimitive::Type::getBytes(const char* This) const { + return sizeof(InstancePrimitive); + } + + InstancePrimitive::Type InstancePrimitive::type; + + /********************** SubGrid **************************/ + + const char* SubGrid::Type::name () const { + return "subgrid"; + } + + size_t SubGrid::Type::sizeActive(const char* This) const { + return 1; + } + + size_t SubGrid::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t SubGrid::Type::getBytes(const char* This) const { + return sizeof(SubGrid); + } + + SubGrid::Type SubGrid::type; + + /********************** SubGridQBVH4 **************************/ + + template<> + const char* SubGridQBVH4::Type::name () const { + return "SubGridQBVH4"; + } + + template<> + size_t SubGridQBVH4::Type::sizeActive(const char* This) const { + return 1; + } + + template<> + size_t SubGridQBVH4::Type::sizeTotal(const char* This) const { + return 1; + } + + template<> + size_t SubGridQBVH4::Type::getBytes(const char* This) const { + return sizeof(SubGridQBVH4); + } +} diff --git a/thirdparty/embree/kernels/geometry/quad_intersector.h b/thirdparty/embree/kernels/geometry/quad_intersector.h new file mode 100644 index 0000000000..93c9526912 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quad_intersector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + namespace isa + { + /*! Intersects a ray with a quad with backface culling + * enabled. The quad v0,v1,v2,v3 is split into two triangles + * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two + * triangles gets intersected. 
*/ + template<int N> + __forceinline vbool<N> intersect_quad_backface_culling(const vbool<N>& valid0, + const Vec3fa& ray_org, + const Vec3fa& ray_dir, + const float ray_tnear, + const float ray_tfar, + const Vec3vf<N>& quad_v0, + const Vec3vf<N>& quad_v1, + const Vec3vf<N>& quad_v2, + const Vec3vf<N>& quad_v3, + vfloat<N>& u_o, + vfloat<N>& v_o, + vfloat<N>& t_o) + { + /* calculate vertices relative to ray origin */ + vbool<N> valid = valid0; + const Vec3vf<N> O = Vec3vf<N>(ray_org); + const Vec3vf<N> D = Vec3vf<N>(ray_dir); + const Vec3vf<N> va = quad_v0-O; + const Vec3vf<N> vb = quad_v1-O; + const Vec3vf<N> vc = quad_v2-O; + const Vec3vf<N> vd = quad_v3-O; + + const Vec3vf<N> edb = vb-vd; + const vfloat<N> WW = dot(cross(vd,edb),D); + const Vec3vf<N> v0 = select(WW <= 0.0f,va,vc); + const Vec3vf<N> v1 = select(WW <= 0.0f,vb,vd); + const Vec3vf<N> v2 = select(WW <= 0.0f,vd,vb); + + /* calculate edges */ + const Vec3vf<N> e0 = v2-v0; + const Vec3vf<N> e1 = v0-v1; + + /* perform edge tests */ + const vfloat<N> U = dot(cross(v0,e0),D); + const vfloat<N> V = dot(cross(v1,e1),D); + valid &= max(U,V) <= 0.0f; + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<N> Ng = cross(e1,e0); + const vfloat<N> den = dot(Ng,D); + const vfloat<N> rcpDen = rcp(den); + + /* perform depth test */ + const vfloat<N> t = rcpDen*dot(v0,Ng); + valid &= vfloat<N>(ray_tnear) <= t & t <= vfloat<N>(ray_tfar); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat<N>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + t_o = t; + u_o = U * rcpDen; + v_o = V * rcpDen; + u_o = select(WW <= 0.0f,u_o,1.0f-u_o); + v_o = select(WW <= 0.0f,v_o,1.0f-v_o); + return valid; + } + } +} diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h new file mode 100644 index 0000000000..3abc9d6f70 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h @@ -0,0 +1,460 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadv.h" +#include "triangle_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct QuadHitM + { + __forceinline QuadHitM() {} + + __forceinline QuadHitM(const vbool<M>& valid, + const vfloat<M>& U, + const vfloat<M>& V, + const vfloat<M>& T, + const vfloat<M>& absDen, + const Vec3vf<M>& Ng, + const vbool<M>& flags) + : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {} + + __forceinline void finalize() + { + const vfloat<M> rcpAbsDen = rcp(absDen); + vt = T * rcpAbsDen; + const vfloat<M> u = min(U * rcpAbsDen,1.0f); + const vfloat<M> v = min(V * rcpAbsDen,1.0f); + const vfloat<M> u1 = vfloat<M>(1.0f) - u; + const vfloat<M> v1 = vfloat<M>(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z); +#else + const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); +#endif + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return 
Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + vfloat<M> absDen; + Vec3vf<M> tri_Ng; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + + public: + const vbool<M> flags; + }; + + template<int K> + struct QuadHitK + { + __forceinline QuadHitK(const vfloat<K>& U, + const vfloat<K>& V, + const vfloat<K>& T, + const vfloat<K>& absDen, + const Vec3vf<K>& Ng, + const vbool<K>& flags) + : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vfloat<K> rcpAbsDen = rcp(absDen); + const vfloat<K> t = T * rcpAbsDen; + const vfloat<K> u0 = min(U * rcpAbsDen,1.0f); + const vfloat<K> v0 = min(V * rcpAbsDen,1.0f); + const vfloat<K> u1 = vfloat<K>(1.0f) - u0; + const vfloat<K> v1 = vfloat<K>(1.0f) - v0; + const vfloat<K> u = select(flags,u1,u0); + const vfloat<K> v = select(flags,v1,v0); + const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat<K> U; + const vfloat<K> V; + const vfloat<K> T; + const vfloat<K> absDen; + const vbool<K> flags; + const Vec3vf<K> tri_Ng; + }; + + /* ----------------------------- */ + /* -- single ray intersectors -- */ + /* ----------------------------- */ + + + template<int M, bool filter> + struct QuadMIntersector1MoellerTrumbore; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMIntersector1MoellerTrumbore + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Intersect1EpilogM<M,filter> epilog(ray,context,geomID,primID); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + epilog(hit.valid,hit); + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Occluded1EpilogM<M,filter> epilog(ray,context,geomID,primID); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined(__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct QuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + UVIdentity<8> mapUV; + MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV); + MoellerTrumboreIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit))) + { + vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; + +#if !defined(EMBREE_BACKFACE_CULLING) + hit.U = select(flags,absDen-V,U); + hit.V = select(flags,absDen-U,V); + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR +#else + hit.U = select(flags,absDen-U,U); + hit.V = select(flags,absDen-V,V); +#endif + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + + struct MoellerTrumboreIntersector1KTriangleM + { + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + template<int M, int K, typename Epilog> + static __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Vec3vf<M>& tri_Ng, + const vbool<M>& flags, + const Epilog& epilog) + { + /* calculate denominator */ + const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O; + const Vec3vf<M> R = cross(C,D); + const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D); + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen; + const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + QuadHitM<M> hit(valid,U,V,T,absDen,tri_Ng,flags); + return epilog(valid,hit); + } + + template<int M, int K, typename Epilog> + static __forceinline bool intersect1(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const vbool<M>& flags, + const Epilog& epilog) + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + const Vec3vf<M> Ng = cross(e2,e1); + return intersect<M,K>(ray,k,v0,e1,e2,Ng,flags,epilog); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKMoellerTrumboreBase + { + __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Vec3vf<K>& tri_Ng, + const vbool<K>& flags, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool<K> valid = valid0; + const Vec3vf<K> C = tri_v0 - ray.org; + const Vec3vf<K> R = cross(C,ray.dir); + const vfloat<K> den = dot(tri_Ng,ray.dir); + const vfloat<K> absDen = abs(den); + const vfloat<K> sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat<K> U = dot(R,tri_e2) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat<K> V = dot(R,tri_e1) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat<K> W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + QuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags); + return epilog(valid,hit); + } + + /*! 
Intersects K rays with one of M quads. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const vbool<K>& flags, + const Epilog& epilog) const + { + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog); + } + + /*! Intersects K rays with one of M quads. */ + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog); + return none(valid0); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase<M,K,filter> + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Intersect1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog); + MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Occluded1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + if (MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true; + if (MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true; + return false; + } + }; + + +#if defined(__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + return MoellerTrumboreIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + } +} diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h new file mode 100644 index 0000000000..9873ff76ac --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h @@ -0,0 +1,438 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quad_intersector_moeller.h" + +/*! Modified Pluecker ray/triangle intersector. The test first shifts + * the ray origin into the origin of the coordinate system and then + * uses Pluecker coordinates for the intersection. Due to the shift, + * the Pluecker coordinate calculation simplifies and the tests get + * numerically stable. The edge equations are watertight along the + * edge for neighboring triangles. 
*/ + +namespace embree +{ + namespace isa + { + template<int M> + struct QuadHitPlueckerM + { + __forceinline QuadHitPlueckerM() {} + + __forceinline QuadHitPlueckerM(const vbool<M>& valid, + const vfloat<M>& U, + const vfloat<M>& V, + const vfloat<M>& UVW, + const vfloat<M>& t, + const Vec3vf<M>& Ng, + const vbool<M>& flags) + : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {} + + __forceinline void finalize() + { + const vbool<M> invalid = abs(UVW) < min_rcp_input; + const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); + const vfloat<M> u = min(U * rcpUVW,1.0f); + const vfloat<M> v = min(V * rcpUVW,1.0f); + const vfloat<M> u1 = vfloat<M>(1.0f) - u; + const vfloat<M> v1 = vfloat<M>(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z); +#else + const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); +#endif + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + vfloat<M> U; + vfloat<M> V; + vfloat<M> UVW; + Vec3vf<M> tri_Ng; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + + public: + const vbool<M> flags; + }; + + template<int K> + struct QuadHitPlueckerK + { + __forceinline QuadHitPlueckerK(const vfloat<K>& U, + const vfloat<K>& V, + const vfloat<K>& UVW, + const vfloat<K>& t, + const Vec3vf<K>& Ng, + const vbool<K>& flags) + : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vbool<K> invalid = abs(UVW) < min_rcp_input; + const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); + const vfloat<K> u0 = min(U * rcpUVW,1.0f); + const vfloat<K> v0 = min(V * rcpUVW,1.0f); + const vfloat<K> u1 = vfloat<K>(1.0f) - u0; + const vfloat<K> v1 = vfloat<K>(1.0f) - v0; + const vfloat<K> u = select(flags,u1,u0); + const vfloat<K> v = select(flags,v1,v0); + const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat<K> U; + const vfloat<K> V; + const vfloat<K> UVW; + const vfloat<K> t; + const vbool<K> flags; + const Vec3vf<K> tri_Ng; + }; + + struct PlueckerIntersectorTriangle1 + { + template<int M, typename Epilog> + static __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const vbool<M>& flags, + const Epilog& epilog) + { + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org); + const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = max(U,V,W) <= eps; 
+#else + vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); + valid &= den != vfloat<M>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + }; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMIntersector1Pluecker + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Intersect1EpilogM<M,filter> epilog(ray,context,geomID,primID); + PlueckerIntersectorTriangle1::intersect<M>(ray,v0,v1,v3,vbool<M>(false),epilog); + PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true),epilog); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Occluded1EpilogM<M,filter> epilog(ray,context,geomID,primID); + if (PlueckerIntersectorTriangle1::intersect<M>(ray,v0,v1,v3,vbool<M>(false),epilog)) return true; + if (PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true ),epilog)) return true; + return false; + } + }; + +#if defined(__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct QuadMIntersector1Pluecker<4,filter> + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + return PlueckerIntersectorTriangle1::intersect<8>(ray,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + struct PlueckerIntersector1KTriangleM + { + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + template<int M, int K, typename Epilog> + static __forceinline bool intersect1(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const vbool<M>& flags, + const Epilog& epilog) + { + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = max(U,V,W) <= eps; +#else + vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat<M>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKPlueckerBase + { + __forceinline QuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const vbool<K>& flags, + const Epilog& epilog) const + { + /* calculate vertices relative to ray origin */ + vbool<K> valid = valid0; + const Vec3vf<K> O = ray.org; + const Vec3vf<K> D = ray.dir; + const Vec3vf<K> v0 = tri_v0-O; + const Vec3vf<K> v1 = tri_v1-O; + const Vec3vf<K> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<K> e0 = v2-v0; + const Vec3vf<K> e1 = v0-v1; + const Vec3vf<K> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D); + const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D); + const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D); + const vfloat<K> UVW = U+V+W; + const vfloat<K> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D)); + + /* perform depth test */ + const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng))); + const vfloat<K> t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; + + /* calculate hit information */ + QuadHitPlueckerK<K> hit(U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M quads. 
*/ + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog); + return none(valid0); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase<M,K,filter> + { + __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Intersect1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog); + PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Occluded1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + if (PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true; + if (PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true; + return false; + } + }; + +#if defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter> + { + __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); + const vbool8 flags(0,0,0,0,1,1,1,1); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + return PlueckerIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + } +} diff --git 
a/thirdparty/embree/kernels/geometry/quadi.h b/thirdparty/embree/kernels/geometry/quadi.h new file mode 100644 index 0000000000..70a7bdf158 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadi.h @@ -0,0 +1,483 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene.h" + +namespace embree +{ + /* Stores M quads from an indexed face set */ + template <int M> + struct QuadMi + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline QuadMi() { } + + /* Construction from vertices and IDs */ + __forceinline QuadMi(const vuint<M>& v0, + const vuint<M>& v1, + const vuint<M>& v2, + const vuint<M>& v3, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) +#if defined(EMBREE_COMPACT_POLYS) + : geomIDs(geomIDs), primIDs(primIDs) {} +#else + : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {} +#endif + + /* Returns a mask that tells which quads are valid */ + __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); } + + /* Returns if the specified quad is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; } + + /* Returns the number of stored quads */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); assert(geomIDs[i] != -1); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the quads */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i)); + bounds.extend(mesh->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && 
valid(i); i++) + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill quad from quad list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> geomID = -1, primID = -1; + const PrimRefT* prim = &prims[begin]; + vuint<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero; + + for (size_t i=0; i<M; i++) + { + if (begin<end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); +#if !defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get<QuadMesh>(prim->geomID()); + const QuadMesh::Quad& q = mesh->quad(prim->primID()); + unsigned int_stride = mesh->vertices0.getStride()/4; + v0[i] = q.v[0] * int_stride; + v1[i] = q.v[1] * int_stride; + v2[i] = q.v[2] * int_stride; + v3[i] = q.v[3] * int_stride; +#endif + begin++; + } else { + assert(i); + if (likely(i > 0)) { + geomID[i] = geomID[0]; // always valid geomIDs + primID[i] = -1; // indicates invalid data + v0[i] = v0[0]; + v1[i] = v0[0]; + v2[i] = v0[0]; + v3[i] = v0[0]; + } + } + if (begin<end) prim = &prims[begin]; + } + new (this) QuadMi(v0,v1,v2,v3,geomID,primID); // FIXME: use non temporal store + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + friend embree_ostream operator<<(embree_ostream cout, const QuadMi& quad) { + return cout << "QuadMi<" << M << ">( " +#if !defined(EMBREE_COMPACT_POLYS) + << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", " +#endif + << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )"; + } + + protected: +#if !defined(EMBREE_COMPACT_POLYS) + vuint<M> v0_; // 4 byte offset of 1st vertex + vuint<M> v1_; // 4 byte offset of 2nd vertex + vuint<M> v2_; // 4 byte offset of 3rd vertex + vuint<M> v3_; // 4 byte offset of 4th vertex +#endif + vuint<M> geomIDs; // geometry ID of mesh + vuint<M> primIDs; // primitive ID of primitive inside mesh + }; + + namespace isa + { + + template<int M> + struct QuadMi : public embree::QuadMi<M> + { +#if !defined(EMBREE_COMPACT_POLYS) + using embree::QuadMi<M>::v0_; + using embree::QuadMi<M>::v1_; + using embree::QuadMi<M>::v2_; + using embree::QuadMi<M>::v3_; +#endif + using embree::QuadMi<M>::geomIDs; + using embree::QuadMi<M>::primIDs; + using embree::QuadMi<M>::geomID; + using embree::QuadMi<M>::primID; + using embree::QuadMi<M>::valid; + + template<int vid> + __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + return (Vec3f) mesh->vertices[0][quad.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices = scene->vertices[geomID(index)]; + return (Vec3f&) vertices[v[index]]; +#endif + } + + template<int vid, typename T> + __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + const 
QuadMesh::Quad& quad = mesh->quad(primID(index)); + const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + const Vec3<T> p0(v0.x,v0.y,v0.z); + const Vec3<T> p1(v1.x,v1.y,v1.z); + return lerp(p0,p1,ftime); + } + + template<int vid, int K, typename T> + __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const + { + Vec3<T> p0, p1; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + + for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; + p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; + } + return (T(one)-ftime)*p0 + ftime*p1; + } + + struct Quad { + vfloat4 v0,v1,v2,v3; + }; + +#if defined(EMBREE_COMPACT_POLYS) + + __forceinline Quad loadQuad(const int i, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero, zero }; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]]; + const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]]; + return { v0, v1, v2, v3 }; + } + + __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero, zero }; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]]; + const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]]; + return { v0, v1, v2, v3 }; + } + +#else + + __forceinline Quad loadQuad(const int i, const Scene* const scene) const + { + const float* vertices = scene->vertices[geomID(i)]; + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); + return { v0, v1, v2, v3 }; + } + + __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const + { + const unsigned int geomID = 
geomIDs[i]; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID); + const float* vertices = (const float*) mesh->vertexPtr(0,itime); + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); + return { v0, v1, v2, v3 }; + } + +#endif + + /* Gather the quads */ + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + Vec3vf<M>& p3, + const Scene *const scene) const; + +#if defined(__AVX512F__) + __forceinline void gather(Vec3vf16& p0, + Vec3vf16& p1, + Vec3vf16& p2, + Vec3vf16& p3, + const Scene *const scene) const; +#endif + + template<int K> +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 + __noinline +#else + __forceinline +#endif + void gather(const vbool<K>& valid, + Vec3vf<K>& p0, + Vec3vf<K>& p1, + Vec3vf<K>& p2, + Vec3vf<K>& p3, + const size_t index, + const Scene* const scene, + const vfloat<K>& time) const + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(time, ftime); + + const size_t first = bsf(movemask(valid)); + if (likely(all(valid,itime[first] == itime))) + { + p0 = getVertex<0>(index, scene, itime[first], ftime); + p1 = getVertex<1>(index, scene, itime[first], ftime); + p2 = getVertex<2>(index, scene, itime[first], ftime); + p3 = getVertex<3>(index, scene, itime[first], ftime); + } + else + { + p0 = getVertex<0,K>(valid, index, scene, itime, ftime); + p1 = getVertex<1,K>(valid, index, scene, itime, ftime); + p2 = getVertex<2,K>(valid, index, scene, itime, ftime); + p3 = getVertex<3,K>(valid, index, scene, itime, ftime); + } + } + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + Vec3vf<M>& p3, + const QuadMesh* mesh, + const Scene *const scene, + const int itime) const; + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + Vec3vf<M>& p3, + const Scene *const scene, + const float time) const; + + /* Updates the primitive */ + __forceinline BBox3fa update(QuadMesh* mesh) + { + BBox3fa bounds = empty; + for (size_t i=0; i<M; i++) + { + if (!valid(i)) break; + const unsigned primId = primID(i); + const QuadMesh::Quad& q = mesh->quad(primId); + const Vec3fa p0 = mesh->vertex(q.v[0]); + const Vec3fa p1 = mesh->vertex(q.v[1]); + const Vec3fa p2 = mesh->vertex(q.v[2]); + const Vec3fa p3 = mesh->vertex(q.v[3]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); + } + return bounds; + } + + private: +#if !defined(EMBREE_COMPACT_POLYS) + template<int N> const vuint<M>& getVertexOffset() const; +#endif + }; + +#if !defined(EMBREE_COMPACT_POLYS) + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; } +#endif + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene) const + { + prefetchL1(((char*)this)+0*64); + prefetchL1(((char*)this)+1*64); + const Quad tri0 = loadQuad(0,scene); + const Quad tri1 = loadQuad(1,scene); + const Quad tri2 = 
loadQuad(2,scene); + const Quad tri3 = loadQuad(3,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); + } + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const QuadMesh* mesh, + const Scene *const scene, + const int itime) const + { + // FIXME: for trianglei there all geometries are identical, is this the case here too? + + const Quad tri0 = loadQuad(0,itime,scene); + const Quad tri1 = loadQuad(1,itime,scene); + const Quad tri2 = loadQuad(2,itime,scene); + const Quad tri3 = loadQuad(3,itime,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); + } + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene, + const float time) const + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(0)); // in mblur mode all geometries are identical + + float ftime; + const int itime = mesh->timeSegment(time, ftime); + + Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime); + Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + p2 = lerp(a2,b2,vfloat4(ftime)); + p3 = lerp(a3,b3,vfloat4(ftime)); + } + } + + template<int M> + typename QuadMi<M>::Type QuadMi<M>::type; + + typedef QuadMi<4> Quad4i; +} diff --git a/thirdparty/embree/kernels/geometry/quadi_intersector.h b/thirdparty/embree/kernels/geometry/quadi_intersector.h new file mode 100644 index 0000000000..20a98c3406 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadi_intersector.h @@ -0,0 +1,350 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadi.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMiIntersector1Moeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. 
*/ + template<int M, int K, bool filter> + struct QuadMiIntersectorKMoeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMiIntersector1Pluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1Pluecker<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. 
*/ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMiIntersectorKPluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M motion blur quads with 1 ray */ + template<int M, bool filter> + struct QuadMiMBIntersector1Moeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. 
*/ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M motion blur quads with K rays. */ + template<int M, int K, bool filter> + struct QuadMiMBIntersectorKMoeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations; + + /*! Intersects K rays with M quads. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M quads. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M quads and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M quads. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M motion blur quads with 1 ray */ + template<int M, bool filter> + struct QuadMiMBIntersector1Pluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1Pluecker<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. 
*/ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M motion blur quads with K rays. */ + template<int M, int K, bool filter> + struct QuadMiMBIntersectorKPluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations; + + /*! Intersects K rays with M quads. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M quads. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M quads and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M quads. 
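The only difference between these motion-blur (MB) intersectors and the static ones earlier in the file is that the vertex gather takes a time: ray.time() for whole packets and ray.time()[k] for a single ray of a packet. Conceptually that means evaluating each vertex at the ray's time. The sketch below shows the usual linear interpolation between bracketing time steps under simplifying assumptions (uniform time segments over [0,1], a flat vertex-buffer layout); the real handling lives in QuadMi::gather in quadi.h and is not shown in this hunk.

```cpp
#include <algorithm>

struct Vec3 { float x, y, z; };

// Interpolate vertex v at 'time' in [0,1], given numTimeSteps >= 2 vertex buffers laid
// out back to back with verticesPerStep vertices each (layout is illustrative only).
Vec3 vertexAtTime(const Vec3* steps, int numTimeSteps, int verticesPerStep, int v, float time)
{
  const float ft  = time * float(numTimeSteps - 1);
  const int   seg = std::min(int(ft), numTimeSteps - 2);   // bracketing time segment
  const float t   = ft - float(seg);                       // blend factor in [0,1]
  const Vec3  a   = steps[seg       * verticesPerStep + v];
  const Vec3  b   = steps[(seg + 1) * verticesPerStep + v];
  return { (1.0f - t)*a.x + t*b.x,
           (1.0f - t)*a.y + t*b.y,
           (1.0f - t)*a.z + t*b.z };
}
```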
*/ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h new file mode 100644 index 0000000000..2137356ff2 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadv.h @@ -0,0 +1,165 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M quads in struct of array layout */ + template <int M> + struct QuadMv + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline QuadMv() {} + + /* Construction from vertices and IDs */ + __forceinline QuadMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which quads are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns true if the specified quad is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored quads */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M> primID() { return primIDs; } + __forceinline const vuint<M> primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the quads */ + __forceinline BBox3fa bounds() const + { + Vec3vf<M> lower = min(v0,v1,v2,v3); + Vec3vf<M> upper = max(v0,v1,v2,v3); + vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(QuadMv* dst, const QuadMv& src) + { + vfloat<M>::store_nt(&dst->v0.x,src.v0.x); + vfloat<M>::store_nt(&dst->v0.y,src.v0.y); + vfloat<M>::store_nt(&dst->v0.z,src.v0.z); + vfloat<M>::store_nt(&dst->v1.x,src.v1.x); + 
vfloat<M>::store_nt(&dst->v1.y,src.v1.y); + vfloat<M>::store_nt(&dst->v1.z,src.v1.z); + vfloat<M>::store_nt(&dst->v2.x,src.v2.x); + vfloat<M>::store_nt(&dst->v2.y,src.v2.y); + vfloat<M>::store_nt(&dst->v2.z,src.v2.z); + vfloat<M>::store_nt(&dst->v3.x,src.v3.x); + vfloat<M>::store_nt(&dst->v3.y,src.v3.y); + vfloat<M>::store_nt(&dst->v3.z,src.v3.z); + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill quad from quad list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const QuadMesh* __restrict__ const mesh = scene->get<QuadMesh>(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const Vec3fa& p0 = mesh->vertex(quad.v[0]); + const Vec3fa& p1 = mesh->vertex(quad.v[1]); + const Vec3fa& p2 = mesh->vertex(quad.v[2]); + const Vec3fa& p3 = mesh->vertex(quad.v[3]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(QuadMesh* mesh) + { + BBox3fa bounds = empty; + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M; i++) + { + if (primID(i) == -1) break; + const unsigned geomId = geomID(i); + const unsigned primId = primID(i); + const QuadMesh::Quad& quad = mesh->quad(primId); + const Vec3fa p0 = mesh->vertex(quad.v[0]); + const Vec3fa p1 = mesh->vertex(quad.v[1]); + const Vec3fa p2 = mesh->vertex(quad.v[2]); + const Vec3fa p3 = mesh->vertex(quad.v[3]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID); + return bounds; + } + + public: + Vec3vf<M> v0; // 1st vertex of the quads + Vec3vf<M> v1; // 2nd vertex of the quads + Vec3vf<M> v2; // 3rd vertex of the quads + Vec3vf<M> v3; // 4rd vertex of the quads + private: + vuint<M> geomIDs; // geometry ID + vuint<M> primIDs; // primitive ID + }; + + template<int M> + typename QuadMv<M>::Type QuadMv<M>::type; + + typedef QuadMv<4> Quad4v; +} diff --git a/thirdparty/embree/kernels/geometry/quadv_intersector.h b/thirdparty/embree/kernels/geometry/quadv_intersector.h new file mode 100644 index 0000000000..9b28e05614 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadv_intersector.h @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadv.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMvIntersector1Moeller + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations; + + /*! 
Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMvIntersectorKMoeller + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMvIntersector1Pluecker + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersector1Pluecker<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. 
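Unlike the QuadMi variants earlier in this file, which gather vertices from the scene by index, QuadMv (see quadv.h above) stores the vertices of its M quads in struct-of-arrays form. The single-ray path therefore passes quad.v0..v3 through directly, and the packet path only needs to splat lane i of each stored coordinate to all K ray lanes via broadcast. A small scalar illustration of that splat follows; it is not Embree code and the container types are simplified.

```cpp
#include <array>

template <int M>
struct Vec3SoA { std::array<float, M> x, y, z; };     // one array per coordinate

// Replicate vertex i of a stored SoA vertex set across all K ray lanes,
// mirroring broadcast<vfloat<K>>(quad.v0, i) in the packet intersector above.
template <int K, int M>
std::array<std::array<float, K>, 3> broadcastVertex(const Vec3SoA<M>& v, int i)
{
  std::array<std::array<float, K>, 3> out{};
  for (int k = 0; k < K; ++k) {
    out[0][k] = v.x[i];   // every ray lane sees the same x
    out[1][k] = v.y[i];   // ... the same y
    out[2][k] = v.z[i];   // ... and the same z
  }
  return out;
}
```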
*/ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMvIntersectorKPluecker + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h new file mode 100644 index 0000000000..0e9393442b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h @@ -0,0 +1,715 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + + +/* + + This file implements the intersection of a ray with a round linear + curve segment. We define the geometry of such a round linear curve + segment from point p0 with radius r0 to point p1 with radius r1 + using the cone that touches spheres p0/r0 and p1/r1 tangentially + plus the sphere p1/r1. We denote the tangentially touching cone from + p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending + sphere with cone_sphere(p0,r0,p1,r1). + + For multiple connected round linear curve segments this construction + yields a proper shape when viewed from the outside. Using the + following CSG we can also handle the interior in most common cases: + + round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) = + cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr) + + Thus by subtracting the neighboring cone geometries, we cut away + parts of the center cone_sphere surface which lie inside the + combined curve. This approach works as long as the geometry of the + current cone_sphere penetrates into direct neighbor segments only, + and not into segments further away. + + To construct a cone that touches two spheres at p0 and p1 with r0 + and r1, one has to increase the cone radius at r0 and r1 to obtain + larger radii w0 and w1, such that the infinite cone properly touches + the spheres. From the paper "Ray Tracing Generalized Tube + Primitives: Method and Applications" + (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications) + one can derive the following equations for these increased + radii: + + sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0)) + w0 = sr*r0 + w1 = sr*r1 + + Further, we want the cone to start where it touches the sphere at p0 + and to end where it touches the sphere at p1. Therefore, we need to + construct clipping locations y0 and y1 for the start and end of the + cone. These start and end clipping locations of the cone can be + calculated as: + + Y0 = - r0 * (r1-r0) / length(p1-p0) + Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) + + Where the cone starts a distance Y0 and ends a distance Y1 away from + point p0 along the cone center. The distance Y1-Y0 can be + calculated as: + + dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0) + + In the code below, Y will always be scaled by length(p1-p0) to + obtain y and you will find the terms r0*(r1-r0) and + (p1-p0)^2-(r1-r0)^2.
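To make the quantities above easy to check, here is a purely illustrative scalar helper (not part of Embree, with simplified types) that computes the enlarged radii w0/w1 and the clip values y0/y1 already scaled by |p1-p0|, which is the scaled form the clipping code below works with (cap0 = -r0*dr, cap1 = dP.dP - r1*dr).

```cpp
#include <cmath>

struct Vec3 { float x, y, z; };
static float dot(const Vec3& a, const Vec3& b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

struct TangentCone { float w0, w1, y0, y1; };   // enlarged radii and scaled clip values

// Tangent-cone parameters for spheres (p0,r0) and (p1,r1) as described above.
// y0/y1 are already multiplied by |p1-p0|, matching the scaled y used in the kernel.
TangentCone tangentCone(const Vec3& p0, float r0, const Vec3& p1, float r1)
{
  const Vec3  dP   = { p1.x - p0.x, p1.y - p0.y, p1.z - p0.z };
  const float dPdP = dot(dP, dP);                                // |p1-p0|^2
  const float dr   = r1 - r0;
  const float sr   = 1.0f / std::sqrt(1.0f - dr*dr / dPdP);      // radius scale factor
  return { sr * r0,            // w0
           sr * r1,            // w1
           -r0 * dr,           // y0 = Y0 * |p1-p0|
           dPdP - r1 * dr };   // y1 = Y1 * |p1-p0|
}
```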
+ + */ + +namespace embree +{ + namespace isa + { + template<int M> + struct RoundLineIntersectorHitM + { + __forceinline RoundLineIntersectorHitM() {} + + __forceinline RoundLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + namespace __roundline_internal + { + template<int M> + struct ConeGeometry + { + ConeGeometry (const Vec4vf<M>& a, const Vec4vf<M>& b) + : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {} + + /* + + This function tests if a point is accepted by first cone + clipping plane. + + First, we need to project the point onto the line p0->p1: + + Y = (p-p0)*(p1-p0)/length(p1-p0) + + This value y is the distance to the projection point from + p0. The clip distances are calculated as: + + Y0 = - r0 * (r1-r0) / length(p1-p0) + Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) + + Thus to test if the point p is accepted by the first + clipping plane we need to test Y > Y0 and to test if it + is accepted by the second clipping plane we need to test + Y < Y1. + + By multiplying the calculations with length(p1-p0) these + calculation can get simplied to: + + y = (p-p0)*(p1-p0) + y0 = - r0 * (r1-r0) + y1 = (p1-p0)^2 - r1 * (r1-r0) + + and the test y > y0 and y < y1. + + */ + + __forceinline vbool<M> isClippedByPlane (const vbool<M>& valid_i, const Vec3vf<M>& p) const + { + const Vec3vf<M> p0p = p - p0; + const vfloat<M> y = dot(p0p,dP); + const vfloat<M> cap0 = -r0dr; + const vbool<M> inside_cone = y > cap0; + return valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)) & inside_cone; + } + + /* + + This function tests whether a point lies inside the capped cone + tangential to its ending spheres. + + Therefore one has to check if the point is inside the + region defined by the cone clipping planes, which is + performed similar as in the previous function. + + To perform the inside cone test we need to project the + point onto the line p0->p1: + + dP = p1-p0 + Y = (p-p0)*dP/length(dP) + + This value Y is the distance to the projection point from + p0. 
To obtain a parameter value u going from 0 to 1 along + the line p0->p1 we calculate: + + U = Y/length(dP) + + The radii to use at points p0 and p1 are: + + w0 = sr * r0 + w1 = sr * r1 + dw = w1-w0 + + Using these radii and u one can directly test if the point + lies inside the cone using the formula dP*dP < wy*wy with: + + wy = w0 + u*dw + py = p0 + u*dP - p + + By multiplying the calculations with length(p1-p0) and + inserting the definition of w can obtain simpler equations: + + y = (p-p0)*dP + ry = r0 + y/dP^2 * dr + wy = sr*ry + py = p0 + y/dP^2*dP - p + y0 = - r0 * dr + y1 = dP^2 - r1 * dr + + Thus for the in-cone test we get: + + py^2 < wy^2 + <=> py^2 < sr^2 * ry^2 + <=> py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2 + + This can further get simplified to: + + (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y; + + */ + + __forceinline vbool<M> isInsideCappedCone (const vbool<M>& valid_i, const Vec3vf<M>& p) const + { + const Vec3vf<M> p0p = p - p0; + const vfloat<M> y = dot(p0p,dP); + const vfloat<M> cap0 = -r0dr+vfloat<M>(ulp); + const vfloat<M> cap1 = -r1*dr + dPdP; + + vbool<M> inside_cone = valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)); + inside_cone &= y > cap0; // start clipping plane + inside_cone &= y < cap1; // end clipping plane + inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test + return inside_cone; + } + + protected: + Vec3vf<M> p0; + Vec3vf<M> p1; + Vec3vf<M> dP; + vfloat<M> dPdP; + vfloat<M> r0; + vfloat<M> sqr_r0; + vfloat<M> r1; + vfloat<M> dr; + vfloat<M> drdr; + vfloat<M> r0dr; + vfloat<M> g; + }; + + template<int M> + struct ConeGeometryIntersector : public ConeGeometry<M> + { + using ConeGeometry<M>::p0; + using ConeGeometry<M>::p1; + using ConeGeometry<M>::dP; + using ConeGeometry<M>::dPdP; + using ConeGeometry<M>::r0; + using ConeGeometry<M>::sqr_r0; + using ConeGeometry<M>::r1; + using ConeGeometry<M>::dr; + using ConeGeometry<M>::r0dr; + using ConeGeometry<M>::g; + + ConeGeometryIntersector (const Vec3vf<M>& ray_org, const Vec3vf<M>& ray_dir, const vfloat<M>& dOdO, const vfloat<M>& rcp_dOdO, const Vec4vf<M>& a, const Vec4vf<M>& b) + : ConeGeometry<M>(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir), dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)), yp(OdP + r0dr) {} + + /* + + This function intersects a ray with a cone that touches a + start sphere p0/r0 and end sphere p1/r1. + + To find this ray/cone intersections one could just + calculate radii w0 and w1 as described above and use a + standard ray/cone intersection routine with these + radii. However, it turns out that calculations can get + simplified when deriving a specialized ray/cone + intersection for this special case. We perform + calculations relative to the cone origin p0 and define: + + O = ray_org - p0 + dO = ray_dir + dP = p1-p0 + dr = r1-r0 + dw = w1-w0 + + For some t we can compute the potential hit point h = O + t*dO and + project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In + case of an intersection, the squared distance from the hit point + projected onto the cone center line to the hit point should be equal + to the squared cone radius at u: + + (u*dP - h)^2 = (w0 + u*dw)^2 + + Inserting the definition of h, u, w0, and dw into this formula, then + factoring out all terms, and sorting by t^2, t^1, and t^0 terms + yields a quadratic equation to solve. 
+ + Inserting u: + ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2 + + Multiplying by dP^4: + ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2 + + Inserting w0 and dw: + ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2) + ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2 + + Now one can insert the definition of h, factor out, and presort by t: + ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2 + ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2 + + Factoring out further and sorting by t^2, t^1 and t^0 yields: + + 0 = t^2 * [ ((dO*dP)*dP - dO-dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ] + + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ] + + t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ] + + This can be simplified to: + + 0 = t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ] + + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ] + + t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ] + + Solving this quadratic equation yields the values for t at which the + ray intersects the cone. + + */ + + __forceinline bool intersectCone(vbool<M>& valid, vfloat<M>& lower, vfloat<M>& upper) + { + /* return no hit by default */ + lower = pos_inf; + upper = neg_inf; + + /* compute quadratic equation A*t^2 + B*t + C = 0 */ + const vfloat<M> OO = dot(O,O); + const vfloat<M> OdO = dot(dO,O); + const vfloat<M> A = g * dOdO - sqr(dOdP); + const vfloat<M> B = 2.0f * (g*OdO - dOdP*yp); + const vfloat<M> C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP; + + /* we miss the cone if determinant is smaller than zero */ + const vfloat<M> D = B*B - 4.0f*A*C; + valid &= (D >= 0.0f & g > 0.0f); // if g <= 0 then the cone is inside a sphere end + + /* When rays are parallel to the cone surface, then the + * ray may be inside or outside the cone. We just assume a + * miss in that case, which is fine as rays inside the + * cone would anyway hit the ending spheres in that + * case. */ + valid &= abs(A) > min_rcp_input; + if (unlikely(none(valid))) { + return false; + } + + /* compute distance to front and back hit */ + const vfloat<M> Q = sqrt(D); + const vfloat<M> rcp_2A = rcp(2.0f*A); + t_cone_front = (-B-Q)*rcp_2A; + y_cone_front = yp + t_cone_front*dOdP; + lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat<M>(pos_inf)); +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + t_cone_back = (-B+Q)*rcp_2A; + y_cone_back = yp + t_cone_back *dOdP; + upper = select( (y_cone_back > -(float)ulp) & (y_cone_back <= g) & (g > 0.0f), t_cone_back , vfloat<M>(neg_inf)); +#endif + return true; + } + + /* + This function intersects the ray with the end sphere at + p1. We already clip away hits that are inside the + neighboring cone segment. 
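The coefficients assembled in intersectCone() above are the derived t^2, t^1, t^0 terms regrouped around the precomputed members g = dP.dP - dr^2 and y_p = O.dP + r0*dr. Restated compactly (no new math, just the comment's result written in the code's variables):

```latex
\begin{aligned}
g  &= d_P\!\cdot\!d_P - dr^2, \qquad y_p = O\!\cdot\!d_P + r_0\,dr,\\
A  &= g\,(d_O\!\cdot\!d_O) - (d_O\!\cdot\!d_P)^2,\\
B  &= 2\,\bigl(g\,(O\!\cdot\!d_O) - (d_O\!\cdot\!d_P)\,y_p\bigr),\\
C  &= g\,(O\!\cdot\!O) - (O\!\cdot\!d_P)^2 - r_0^2\,(d_P\!\cdot\!d_P) - 2\,r_0\,dr\,(O\!\cdot\!d_P),\\
t  &= \frac{-B \pm \sqrt{B^2 - 4AC}}{2A}.
\end{aligned}
```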
+ + */ + + __forceinline void intersectEndSphere(vbool<M>& valid, + const ConeGeometry<M>& coneR, + vfloat<M>& lower, vfloat<M>& upper) + { + /* calculate front and back hit with end sphere */ + const Vec3vf<M> O1 = org - p1; + const vfloat<M> O1dO = dot(O1,dO); + const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1)); + const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) ); + + /* clip away front hit if it is inside next cone segment */ + t_sph1_front = (-O1dO - rhs1)*rcp_dOdO; + const Vec3vf<M> hit_front = org + t_sph1_front*dO; + vbool<M> valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front); + lower = select(valid_sph1_front, t_sph1_front, vfloat<M>(pos_inf)); + +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + /* clip away back hit if it is inside next cone segment */ + t_sph1_back = (-O1dO + rhs1)*rcp_dOdO; + const Vec3vf<M> hit_back = org + t_sph1_back*dO; + vbool<M> valid_sph1_back = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back); + upper = select(valid_sph1_back, t_sph1_back, vfloat<M>(neg_inf)); +#else + upper = vfloat<M>(neg_inf); +#endif + } + + __forceinline void intersectBeginSphere(const vbool<M>& valid, + vfloat<M>& lower, vfloat<M>& upper) + { + /* calculate front and back hit with end sphere */ + const Vec3vf<M> O1 = org - p0; + const vfloat<M> O1dO = dot(O1,dO); + const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0)); + const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) ); + + /* clip away front hit if it is inside next cone segment */ + t_sph0_front = (-O1dO - rhs1)*rcp_dOdO; + vbool<M> valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0; + lower = select(valid_sph1_front, t_sph0_front, vfloat<M>(pos_inf)); + +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + /* clip away back hit if it is inside next cone segment */ + t_sph0_back = (-O1dO + rhs1)*rcp_dOdO; + vbool<M> valid_sph1_back = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0; + upper = select(valid_sph1_back, t_sph0_back, vfloat<M>(neg_inf)); +#else + upper = vfloat<M>(neg_inf); +#endif + } + + /* + + This function calculates the geometry normal of some cone hit. 
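intersectEndSphere() and intersectBeginSphere() above are the classical ray/sphere quadratic kept in a form scaled by dO.dO, so a single reciprocal (rcp_dOdO) serves both roots; the same computation reappears in sphere_intersector.h further down in this diff. A minimal scalar analogue, without the cone clipping and with simplified types (illustrative only, not Embree code):

```cpp
#include <cmath>
#include <optional>
#include <utility>

struct Vec3 { float x, y, z; };
static float dot(const Vec3& a, const Vec3& b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

// Solve |o + t*d - c|^2 = r^2 for t and return (t_front, t_back), unclipped.
std::optional<std::pair<float, float>>
raySphere(const Vec3& o, const Vec3& d, const Vec3& c, float r)
{
  const Vec3  oc  = { o.x - c.x, o.y - c.y, o.z - c.z };
  const float dd  = dot(d, d);
  const float ocd = dot(oc, d);
  const float h2  = ocd*ocd - dd*(dot(oc, oc) - r*r);   // discriminant, scaled by dd
  if (h2 < 0.0f) return std::nullopt;                   // ray misses the sphere
  const float q = std::sqrt(h2);
  return std::make_pair((-ocd - q) / dd, (-ocd + q) / dd);
}
```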
+ + For a given hit point h (relative to p0) with a cone + starting at p0 with radius w0 and ending at p1 with + radius w1 one normally calculates the geometry normal by + first calculating the parmetric u hit location along the + cone: + + u = dot(h,dP)/dP^2 + + Using this value one can now directly calculate the + geometry normal by bending the connection vector (h-u*dP) + from hit to projected hit with some cone dependent value + dw/sqrt(dP^2) * normalize(dP): + + Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP) + + The length of the vector (h-u*dP) can also get calculated + by interpolating the radii as w0+u*dw which yields: + + Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP + + Multiplying with (w0+u*dw) yield a scaled Ng': + + Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP + + Inserting the definition of w0 and dw and refactoring + yield a furhter scaled Ng'': + + Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP + + Now inserting the definition of u gives and multiplying + with the denominator yields: + + Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP + + Factoring out, cancelling terms, dividing by dP^2, and + factoring again yields finally: + + Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr) + + */ + + __forceinline Vec3vf<M> Ng_cone(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back); + const vfloat<M> t = select(front_hit, t_cone_front, t_cone_back); + const Vec3vf<M> h = O + t*dO; + return g*h-dP*y; +#else + const Vec3vf<M> h = O + t_cone_front*dO; + return g*h-dP*y_cone_front; +#endif + } + + /* compute geometry normal of sphere hit as the difference + * vector from hit point to sphere center */ + + __forceinline Vec3vf<M> Ng_sphere1(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> t_sph1 = select(front_hit, t_sph1_front, t_sph1_back); + return org+t_sph1*dO-p1; +#else + return org+t_sph1_front*dO-p1; +#endif + } + + __forceinline Vec3vf<M> Ng_sphere0(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> t_sph0 = select(front_hit, t_sph0_front, t_sph0_back); + return org+t_sph0*dO-p0; +#else + return org+t_sph0_front*dO-p0; +#endif + } + + /* + This function calculates the u coordinate of a + hit. Therefore we use the hit distance y (which is zero + at the first cone clipping plane) and divide by distance + g between the clipping planes. 
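A small step that is easy to miss: for a hit point h = O + t*dO, the cached value y equals h.dP + r0*dr, which is why Ng_cone() above can return the derived scaled normal simply as g*h - dP*y.

```latex
\begin{aligned}
y   &= y_p + t\,(d_O\!\cdot\!d_P)
     = (O\!\cdot\!d_P + r_0\,dr) + t\,(d_O\!\cdot\!d_P)
     = (O + t\,d_O)\!\cdot\!d_P + r_0\,dr
     = h\!\cdot\!d_P + r_0\,dr,\\
N_g &= (d_P\!\cdot\!d_P - dr^2)\,h - \bigl(h\!\cdot\!d_P + r_0\,dr\bigr)\,d_P
     = g\,h - y\,d_P.
\end{aligned}
```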
+ + */ + + __forceinline vfloat<M> u_cone(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back); + return clamp(y*rcp(g)); +#else + return clamp(y_cone_front*rcp(g)); +#endif + } + + private: + Vec3vf<M> org; + Vec3vf<M> O; + Vec3vf<M> dO; + vfloat<M> dOdO; + vfloat<M> rcp_dOdO; + vfloat<M> OdP; + vfloat<M> dOdP; + + /* for ray/cone intersection */ + private: + vfloat<M> yp; + vfloat<M> y_cone_front; + vfloat<M> t_cone_front; +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat<M> y_cone_back; + vfloat<M> t_cone_back; +#endif + + /* for ray/sphere intersection */ + private: + vfloat<M> t_sph1_front; + vfloat<M> t_sph0_front; +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat<M> t_sph1_back; + vfloat<M> t_sph0_back; +#endif + }; + + + template<int M, typename Epilog, typename ray_tfar_func> + static __forceinline bool intersectConeSphere(const vbool<M>& valid_i, + const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, + const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar, + const Vec4vf<M>& v0, const Vec4vf<M>& v1, + const Vec4vf<M>& vL, const Vec4vf<M>& vR, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + /* move ray origin closer to make calculations numerically stable */ + const vfloat<M> dOdO = sqr(ray_dir); + const vfloat<M> rcp_dOdO = rcp(dOdO); + const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz()); + const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; + const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir; + + /* intersect with cone from v0 to v1 */ + vfloat<M> t_cone_lower, t_cone_upper; + ConeGeometryIntersector<M> cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1); + vbool<M> validCone = valid; + cone.intersectCone(validCone, t_cone_lower, t_cone_upper); + + valid &= (validCone | (cone.g <= 0.0f)); // if cone is entirely in sphere end - check sphere + if (unlikely(none(valid))) + return false; + + /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */ + const ConeGeometry<M> coneL (v0, vL); + const ConeGeometry<M> coneR (v1, vR); +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const Vec3vf<M> hit_lower = ray_org + t_cone_lower*ray_dir; + const Vec3vf<M> hit_upper = ray_org + t_cone_upper*ray_dir; + t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat<M>(pos_inf)); + t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat<M>(neg_inf)); +#endif + + /* intersect ending sphere */ + vfloat<M> t_sph1_lower, t_sph1_upper; + vfloat<M> t_sph0_lower = vfloat<M>(pos_inf); + vfloat<M> t_sph0_upper = vfloat<M>(neg_inf); + cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper); + + const vbool<M> isBeginPoint = valid & (vL[0] == vfloat<M>(pos_inf)); + if (unlikely(any(isBeginPoint))) { + cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper); + } + + /* CSG union of cone and end sphere */ + vfloat<M> t_sph_lower = min(t_sph0_lower, t_sph1_lower); + vfloat<M> t_cone_sphere_lower = min(t_cone_lower, t_sph_lower); +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat<M> t_sph_upper = max(t_sph0_upper, t_sph1_upper); + vfloat<M> t_cone_sphere_upper = max(t_cone_upper, t_sph_upper); + + /* filter out hits that are not in tnear/tfar range */ + const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & 
dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf); + const vbool<M> valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat<M>(neg_inf); + + /* check if there is a first hit */ + const vbool<M> valid_first = valid_lower | valid_upper; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat<M> t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper); + const vbool<M> cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper; + const vbool<M> sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper; + const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); + const vfloat<M> u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first); + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_cone_sphere_upper; + const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar()); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; + const vbool<M> sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper; + const Vec3vf<M> Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false))); + const vfloat<M> u_second = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat<M>(zero), vfloat<M>(one))); + + hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; +#else + /* filter out hits that are not in tnear/tfar range */ + const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf); + + /* check if there is a valid hit */ + if (unlikely(none(valid_lower))) + return false; + + /* construct first hit */ + const vbool<M> cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper; + const vbool<M> sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper; + const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); + const vfloat<M> u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first); + const bool is_hit_first = epilog(valid_lower, hit); + + return is_hit_first; +#endif + } + + } // end namespace __roundline_internal + + template<int M> + struct RoundLinearCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Ray> + struct ray_tfar { + Ray& ray; + __forceinline ray_tfar(Ray& ray) : ray(ray) {} + __forceinline vfloat<M> operator() () const { return ray.tfar; }; + }; + + 
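As the "move ray origin closer to make calculations numerically stable" comment in intersectConeSphere() above indicates, the ray origin is first advanced by dt to the point on the ray closest to the segment center, all cone/sphere quadratics are solved in that local frame, and every hit is then reported as dt + t. A small scalar sketch of that shift (illustrative only; types are simplified and this is not Embree's code):

```cpp
struct Vec3 { float x, y, z; };
static float dot(const Vec3& a, const Vec3& b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

struct Recentered { Vec3 org; float dt; };   // local origin and the shift to undo later

// Advance the ray origin to the point on the ray closest to 'center'. Intersections are
// then computed with coordinates of primitive-sized magnitude, and each local hit
// distance t is converted back to the original ray parameter as dt + t.
Recentered recenter(const Vec3& org, const Vec3& dir, const Vec3& center)
{
  const Vec3  toC = { center.x - org.x, center.y - org.y, center.z - org.z };
  const float dt  = dot(toC, dir) / dot(dir, dir);     // projection onto the ray
  const Vec3  o   = { org.x + dt*dir.x, org.y + dt*dir.y, org.z + dt*dir.z };
  return { o, dt };
}
```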
template<typename Ray, typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Vec4vf<M>& vLi, const Vec4vf<M>& vRi, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat<M> ray_tnear(ray.tnear()); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + const Vec4vf<M> vL = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vLi); + const Vec4vf<M> vR = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vRi); + return __roundline_internal::intersectConeSphere<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar<Ray>(ray),v0,v1,vL,vR,epilog); + } + }; + + template<int M, int K> + struct RoundLinearCurveIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + struct ray_tfar { + RayK<K>& ray; + size_t k; + __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {} + __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }; + }; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Vec4vf<M>& vLi, const Vec4vf<M>& vRi, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> ray_tnear = ray.tnear()[k]; + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + const Vec4vf<M> vL = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vLi); + const Vec4vf<M> vR = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vRi); + return __roundline_internal::intersectConeSphere<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h new file mode 100644 index 0000000000..29061d6475 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h @@ -0,0 +1,123 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "roundline_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct RoundLinearCurveMiIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = 
context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, bool filter> + struct RoundLinearCurveMiMBIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, int K, bool filter> + struct RoundLinearCurveMiIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + + template<int M, int K, bool filter> + struct RoundLinearCurveMiMBIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + 
Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/sphere_intersector.h b/thirdparty/embree/kernels/geometry/sphere_intersector.h new file mode 100644 index 0000000000..2670f9762d --- /dev/null +++ b/thirdparty/embree/kernels/geometry/sphere_intersector.h @@ -0,0 +1,183 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_points.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct SphereIntersectorHitM + { + __forceinline SphereIntersectorHitM() {} + + __forceinline SphereIntersectorHitM(const vfloat<M>& t, const Vec3vf<M>& Ng) + : vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv(const size_t i) const { + return Vec2f(0.0f, 0.0f); + } + __forceinline float t(const size_t i) const { + return vt[i]; + } + __forceinline Vec3fa Ng(const size_t i) const { + return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); + } + + public: + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct SphereIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Epilog> + static __forceinline bool intersect( + const vbool<M>& valid_i, Ray& ray, + const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const vfloat<M> rd2 = rcp(dot(ray.dir, ray.dir)); + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + const vfloat<M> td = sqrt((r2 - l2) * rd2); + const vfloat<M> t_front = projC0 - td; + const vfloat<M> t_back = projC0 + td; + + const vbool<M> valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar); + const vbool<M> valid_back = valid & (ray.tnear() <= t_back ) & (t_back <= ray.tfar); + + /* check if there is a first hit */ + const vbool<M> valid_first = valid_front | valid_back; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat<M> td_front = -td; + const vfloat<M> td_back = +td; + const vfloat<M> t_first = select(valid_front, t_front, t_back); + const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; + SphereIntersectorHitM<M> hit(t_first, Ng_first); + + /* invoke intersection filter for first hit */ + const 
bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_back; + const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const Vec3vf<M> Ng_second = td_back * ray_dir - perp; + hit = SphereIntersectorHitM<M> (t_second, Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + + template<typename Epilog> + static __forceinline bool intersect( + const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom, + const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + return intersect(valid_i,ray,pre,v0,epilog); + } + }; + + template<int M, int K> + struct SphereIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + const vfloat<M> td = sqrt((r2 - l2) * rd2); + const vfloat<M> t_front = projC0 - td; + const vfloat<M> t_back = projC0 + td; + + const vbool<M> valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]); + const vbool<M> valid_back = valid & (ray.tnear()[k] <= t_back ) & (t_back <= ray.tfar[k]); + + /* check if there is a first hit */ + const vbool<M> valid_first = valid_front | valid_back; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat<M> td_front = -td; + const vfloat<M> td_back = +td; + const vfloat<M> t_first = select(valid_front, t_front, t_back); + const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; + SphereIntersectorHitM<M> hit(t_first, Ng_first); + + /* invoke intersection filter for first hit */ + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_back; + const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const Vec3vf<M> Ng_second = td_back * ray_dir - perp; + hit = SphereIntersectorHitM<M> (t_second, Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/spherei_intersector.h 
b/thirdparty/embree/kernels/geometry/spherei_intersector.h new file mode 100644 index 0000000000..7a0b428117 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/spherei_intersector.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intersector_epilog.h" +#include "pointi.h" +#include "sphere_intersector.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct SphereMiIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + return SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, + PointQueryContext* context, + const Primitive& sphere) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere); + } + }; + + template<int M, bool filter> + struct SphereMiMBIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()); + const vbool<M> valid = sphere.valid(); + SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()); + const vbool<M> valid = sphere.valid(); + return SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, + PointQueryContext* context, + const Primitive& sphere) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere); + } + }; + + template<int M, int K, bool filter> + struct SphereMiIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + 
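For readers skimming the diff: the sphere test in `sphere_intersector.h` above (`SphereIntersector1` / `SphereIntersectorK`) avoids the textbook quadratic and works geometrically, projecting the sphere center onto the ray, measuring the squared perpendicular distance, and deriving front and back hit parameters from the half-chord length. Below is a minimal scalar sketch of the same construction; `Vec3` and `SphereHit` are simplified stand-ins, not embree types.

```
// Scalar sketch of the geometric ray/sphere test used by the SIMD code above.
#include <cmath>
#include <optional>

struct Vec3 { float x, y, z; };
static float dot(const Vec3& a, const Vec3& b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
static Vec3 operator-(const Vec3& a, const Vec3& b) { return {a.x-b.x, a.y-b.y, a.z-b.z}; }
static Vec3 operator*(float s, const Vec3& a) { return {s*a.x, s*a.y, s*a.z}; }

struct SphereHit { float t; Vec3 Ng; };

// Returns the nearest hit in [tnear, tfar], if any. The direction need not be
// normalized: rd2 = 1/|d|^2 rescales the projections, exactly as in the SIMD code.
std::optional<SphereHit> intersectSphere(const Vec3& org, const Vec3& dir,
                                         float tnear, float tfar,
                                         const Vec3& center, float radius)
{
  const float rd2    = 1.0f / dot(dir, dir);
  const Vec3  c0     = center - org;
  const float projC0 = dot(c0, dir) * rd2;           // t of closest approach
  const Vec3  perp   = c0 - projC0 * dir;            // from closest point back to center
  const float l2     = dot(perp, perp);
  const float r2     = radius * radius;
  if (l2 > r2) return std::nullopt;                  // ray line misses the sphere

  const float td      = std::sqrt((r2 - l2) * rd2);  // half chord length, in t units
  const float t_front = projC0 - td;
  const float t_back  = projC0 + td;

  if (tnear <= t_front && t_front <= tfar)
    return SphereHit{t_front, (-td) * dir - perp};   // Ng = hit point - center (unnormalized)
  if (tnear <= t_back && t_back <= tfar)
    return SphereHit{t_back, (+td) * dir - perp};
  return std::nullopt;
}
```

The vectorized code does the same for M spheres at once and additionally reports the back hit as a possible second intersection, so that intersection filters invoked by the epilog can reject the front one.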
Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + return SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + }; + + template<int M, int K, bool filter> + struct SphereMiMBIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = sphere.valid(); + SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = sphere.valid(); + return SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1.h b/thirdparty/embree/kernels/geometry/subdivpatch1.h new file mode 100644 index 0000000000..ae0d4e2616 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subdivpatch1.h @@ -0,0 +1,38 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../geometry/primitive.h" +#include "../subdiv/subdivpatch1base.h" + +namespace embree +{ + + struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + + static Type type; + + public: + + /*! 
constructor for cached subdiv patch */ + SubdivPatch1 (const unsigned int gID, + const unsigned int pID, + const unsigned int subPatch, + const SubdivMesh *const mesh, + const size_t time, + const Vec2f uv[4], + const float edge_level[4], + const int subdiv[4], + const int simd_width) + : SubdivPatch1Base(gID,pID,subPatch,mesh,time,uv,edge_level,subdiv,simd_width) {} + }; +} diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h new file mode 100644 index 0000000000..b4b15a1210 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h @@ -0,0 +1,237 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subdivpatch1.h" +#include "grid_soa.h" +#include "grid_soa_intersector1.h" +#include "grid_soa_intersector_packet.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template<typename T> + class SubdivPatch1Precalculations : public T + { + public: + __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) + : T(ray,ptr) {} + }; + + template<int K, typename T> + class SubdivPatch1PrecalculationsK : public T + { + public: + __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) + : T(valid,ray) {} + }; + + class SubdivPatch1Intersector1 + { + public: + typedef GridSOA Primitive; + typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + lazy_node = prim->root(0); + pre.grid = (Primitive*)prim; + return false; + } + + /*! Intersect a ray with the primitive. */ + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + intersect(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + /*! 
Test if the ray is occluded by the primitive */ + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + // TODO: PointQuery implement + assert(false && "not implemented"); + return false; + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) { + return pointQuery(This,query,context,prim,ty,tquery,lazy_node); + } + }; + + class SubdivPatch1MBIntersector1 + { + public: + typedef SubdivPatch1 Primitive; + typedef GridSOAMBIntersector1::Precalculations Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) + { + Primitive* prim = (Primitive*) prim_i; + GridSOA* grid = nullptr; + grid = (GridSOA*) prim->root_ref.get(); + pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime); + lazy_node = grid->root(pre.itime); + pre.grid = grid; + return false; + } + + /*! Intersect a ray with the primitive. */ + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node); + else processLazyNode(pre,ray,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + intersect(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + /*! 
Test if the ray is occluded by the primitive */ + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,ray,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + // TODO: PointQuery implement + assert(false && "not implemented"); + return false; + } + + template<int N, bool robust> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) { + return pointQuery(This,query,context,prim,ty,tquery,lazy_node); + } + }; + + template <int K> + struct SubdivPatch1IntersectorK + { + typedef GridSOA Primitive; + typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + lazy_node = prim->root(0); + pre.grid = (Primitive*)prim; + return false; + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); 
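The `SubdivPatch1*` intersectors above all share one dispatch pattern: a leaf with `ty == 0` is intersected directly, while any other leaf only registers the root of a lazily built grid sub-BVH through `lazy_node` (and caches the grid in the precalculations), leaving the actual descent to the outer traverser. A rough sketch of that pattern follows; `Grid`, `Leaf` and `Pre` are simplified stand-ins, not embree types.

```
// Minimal sketch (not embree API) of the lazy-leaf dispatch used above.
#include <cstddef>
#include <cstdio>

struct Grid { size_t root; };            // stands in for the lazily built sub-BVH
struct Leaf { int ty; Grid* grid; };     // ty == 0: ordinary leaf, else: lazy entry point
struct Pre  { Grid* grid = nullptr; };   // per-traversal scratch (like Precalculations)

static bool intersectGrid(Pre&, const Grid& g) {
  std::printf("intersecting grid rooted at %zu\n", g.root);
  return true;
}

static bool processLazyNode(Pre& pre, const Leaf& leaf, size_t& lazy_node) {
  lazy_node = leaf.grid->root;  // tell the traverser which node to push next
  pre.grid  = leaf.grid;        // remember the grid for the subsequent visits
  return false;                 // no hit reported at this point
}

static bool intersectLeaf(Pre& pre, const Leaf& leaf, size_t& lazy_node) {
  if (leaf.ty == 0) return intersectGrid(pre, *leaf.grid);  // intersect immediately
  return processLazyNode(pre, leaf, lazy_node);             // defer: descend later
}

int main() {
  Grid g{42};
  Pre pre;
  size_t lazy_node = 0;
  Leaf built{0, &g};
  Leaf deferred{1, &g};
  intersectLeaf(pre, built, lazy_node);     // intersects right away
  intersectLeaf(pre, deferred, lazy_node);  // sets lazy_node = 42 for the traverser
  return 0;
}
```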
+ } + }; + + typedef SubdivPatch1IntersectorK<4> SubdivPatch1Intersector4; + typedef SubdivPatch1IntersectorK<8> SubdivPatch1Intersector8; + typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16; + + template <int K> + struct SubdivPatch1MBIntersectorK + { + typedef SubdivPatch1 Primitive; + //typedef GridSOAMBIntersectorK<K>::Precalculations Precalculations; + typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) + { + Primitive* prim = (Primitive*) prim_i; + GridSOA* grid = (GridSOA*) prim->root_ref.get(); + lazy_node = grid->troot; + pre.grid = grid; + return false; + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + }; + + typedef SubdivPatch1MBIntersectorK<4> SubdivPatch1MBIntersector4; + typedef SubdivPatch1MBIntersectorK<8> SubdivPatch1MBIntersector8; + typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16; + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid.h b/thirdparty/embree/kernels/geometry/subgrid.h new file mode 100644 index 0000000000..ce54421cab --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid.h @@ -0,0 +1,517 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_grid_mesh.h" +#include "../bvh/bvh.h" + +namespace embree +{ + /* Stores M quads from an indexed face set */ + struct SubGrid + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* 
primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline SubGrid() { } + + /* Construction from vertices and IDs */ + __forceinline SubGrid(const unsigned int x, + const unsigned int y, + const unsigned int geomID, + const unsigned int primID) + : _x(x), _y(y), _geomID(geomID), _primID(primID) + { + } + + __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); } + __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); } + + /* Gather the quads */ + __forceinline void gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const GridMesh* const mesh, + const GridMesh::Grid &g) const + { + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = vfloat4::loadu(mesh->vertexPtr(vtxID00)); + const vfloat4 vtx01 = vfloat4::loadu(mesh->vertexPtr(vtxID01)); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = vfloat4::loadu(mesh->vertexPtr(vtxID10)); + const vfloat4 vtx11 = vfloat4::loadu(mesh->vertexPtr(vtxID11)); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = vfloat4::loadu(mesh->vertexPtr(vtxID02)); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = vfloat4::loadu(mesh->vertexPtr(vtxID12)); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 
0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = vfloat4::loadu(mesh->vertexPtr(vtxID20)); + const vfloat4 vtx21 = vfloat4::loadu(mesh->vertexPtr(vtxID21)); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = vfloat4::loadu(mesh->vertexPtr(vtxID22)); + + transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); + transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); + transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); + transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); + } + + template<typename T> + __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const + { + const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0)); + const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1)); + return lerp(v0,v1,ftime); + } + + /* Gather the quads */ + __forceinline void gatherMB(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const GridMesh* const mesh, + const GridMesh::Grid &g, + const size_t itime, + const float ftime) const + { + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime); + const vfloat4 vtx01 = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime); + const vfloat4 vtx11 = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 
0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime); + const vfloat4 vtx21 = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime); + + transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); + transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); + transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); + transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); + } + + + + /* Gather the quads */ + __forceinline void gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene) const + { + const GridMesh* const mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + gather(p0,p1,p2,p3,mesh,g); + } + + /* Gather the quads in the motion blur case */ + __forceinline void gatherMB(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene, + const size_t itime, + const float ftime) const + { + const GridMesh* const mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime); + } + + /* Gather the quads */ + __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const + { + const GridMesh* mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const Vec3fa vtx00 = Vec3fa::loadu(mesh->vertexPtr(vtxID00)); + const Vec3fa vtx01 = Vec3fa::loadu(mesh->vertexPtr(vtxID01)); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const Vec3fa vtx10 = Vec3fa::loadu(mesh->vertexPtr(vtxID10)); + const Vec3fa vtx11 = Vec3fa::loadu(mesh->vertexPtr(vtxID11)); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const Vec3fa vtx02 = Vec3fa::loadu(mesh->vertexPtr(vtxID02)); + const size_t vtxID12 = vtxID11 + deltaX; + const Vec3fa vtx12 = Vec3fa::loadu(mesh->vertexPtr(vtxID12)); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 
0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const Vec3fa vtx20 = Vec3fa::loadu(mesh->vertexPtr(vtxID20)); + const Vec3fa vtx21 = Vec3fa::loadu(mesh->vertexPtr(vtxID21)); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const Vec3fa vtx22 = Vec3fa::loadu(mesh->vertexPtr(vtxID22)); + + vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; + vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; + vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; + vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; + } + + /* Gather the quads */ + __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const + { + const GridMesh* mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime); + const vfloat4 vtx01 = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime); + const vfloat4 vtx11 = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 
0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime); + const vfloat4 vtx21 = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime); + + vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; + vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; + vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; + vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; + } + + + /* Calculate the bounds of the subgrid */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + FATAL("not implemented yet"); + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) + { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + FATAL("not implemented yet"); + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + FATAL("not implemented yet"); + return allBounds; + } + + + friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) { + return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )"; + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID() const { return _primID; } + __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; } + __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; } + + private: + unsigned short _x; + unsigned short _y; + unsigned int _geomID; // geometry ID of mesh + unsigned int _primID; // primitive ID of primitive inside mesh + }; + + struct SubGridID { + unsigned short x; + unsigned short y; + unsigned int primID; + + __forceinline SubGridID() {} + __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) : + x(x), y(y), primID(primID) {} + }; + + /* QuantizedBaseNode as large subgrid leaf */ + template<int N> + struct SubGridQBVHN + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + __forceinline size_t size() const + { + for (size_t i=0;i<N;i++) + if (primID(i) == -1) return i; + return N; + } + + __forceinline void clear() { + for (size_t i=0;i<N;i++) + subgridIDs[i] = SubGridID(0,0,(unsigned int)-1); + qnode.clear(); + } + + /* Default constructor */ + __forceinline SubGridQBVHN() { } + + /* Construction from vertices and IDs */ + __forceinline SubGridQBVHN(const unsigned int x[N], + const unsigned int y[N], + const unsigned int primID[N], + const BBox3fa * const subGridBounds, + const unsigned int geomID, + const unsigned int items) + { + clear(); + _geomID = geomID; + + __aligned(64) typename BVHN<N>::AABBNode node; + node.clear(); + for (size_t i=0;i<items;i++) + { + 
subgridIDs[i] = SubGridID(x[i],y[i],primID[i]); + node.setBounds(i,subGridBounds[i]); + } + qnode.init_dim(node); + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; } + __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; } + __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; } + + __forceinline SubGrid subgrid(const size_t i) const { + assert(i < N); + assert(primID(i) != -1); + return SubGrid(x(i),y(i),geomID(),primID(i)); + } + + public: + SubGridID subgridIDs[N]; + + typename BVHN<N>::QuantizedBaseNode qnode; + + unsigned int _geomID; // geometry ID of mesh + + + friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) { + cout << "SubGridQBVHN " << embree_endl; + for (size_t i=0;i<N;i++) + cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl; + cout << "geomID " << sg._geomID << embree_endl; + cout << "lowerX " << sg.qnode.dequantizeLowerX() << embree_endl; + cout << "upperX " << sg.qnode.dequantizeUpperX() << embree_endl; + cout << "lowerY " << sg.qnode.dequantizeLowerY() << embree_endl; + cout << "upperY " << sg.qnode.dequantizeUpperY() << embree_endl; + cout << "lowerZ " << sg.qnode.dequantizeLowerZ() << embree_endl; + cout << "upperZ " << sg.qnode.dequantizeUpperZ() << embree_endl; + return cout; + } + + }; + + template<int N> + typename SubGridQBVHN<N>::Type SubGridQBVHN<N>::type; + + typedef SubGridQBVHN<4> SubGridQBVH4; + typedef SubGridQBVHN<8> SubGridQBVH8; + + + /* QuantizedBaseNode as large subgrid leaf */ + template<int N> + struct SubGridMBQBVHN + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + __forceinline size_t size() const + { + for (size_t i=0;i<N;i++) + if (primID(i) == -1) return i; + return N; + } + + __forceinline void clear() { + for (size_t i=0;i<N;i++) + subgridIDs[i] = SubGridID(0,0,(unsigned int)-1); + qnode.clear(); + } + + /* Default constructor */ + __forceinline SubGridMBQBVHN() { } + + /* Construction from vertices and IDs */ + __forceinline SubGridMBQBVHN(const unsigned int x[N], + const unsigned int y[N], + const unsigned int primID[N], + const BBox3fa * const subGridBounds0, + const BBox3fa * const subGridBounds1, + const unsigned int geomID, + const float toffset, + const float tscale, + const unsigned int items) + { + clear(); + _geomID = geomID; + time_offset = toffset; + time_scale = tscale; + + __aligned(64) typename BVHN<N>::AABBNode node0,node1; + node0.clear(); + node1.clear(); + for (size_t i=0;i<items;i++) + { + subgridIDs[i] = SubGridID(x[i],y[i],primID[i]); + node0.setBounds(i,subGridBounds0[i]); + node1.setBounds(i,subGridBounds1[i]); + } + qnode.node0.init_dim(node0); + qnode.node1.init_dim(node1); + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; } + __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; } + __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; } + + 
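A note on the encoding used by `SubGrid` above: `_x` and `_y` are 16-bit values whose low 15 bits store the subgrid origin inside the parent grid, and whose top bit (`1 << 15`) marks that the 3x3 vertex block is truncated in that direction, which is why `gather()` uses a `deltaX`/`deltaY` of 0 at flagged borders. A small self-contained sketch of the same packing; `PackedCoord` is a hypothetical name, not an embree type.

```
// Sketch of the SubGrid coordinate packing (bit 15 = truncation flag).
#include <cassert>
#include <cstdint>

struct PackedCoord {
  uint16_t raw;
  static PackedCoord make(unsigned origin, bool truncated) {
    assert(origin < 0x8000u);                            // origin must fit in 15 bits
    return { uint16_t(origin | (truncated ? 0x8000u : 0u)) };
  }
  unsigned origin()    const { return raw & 0x7fffu; }   // like SubGrid::x()/y()
  bool     truncated() const { return (raw & 0x8000u) != 0; } // like invalid3x3X()/Y()
};

int main() {
  PackedCoord c = PackedCoord::make(/*origin=*/1234, /*truncated=*/true);
  assert(c.origin() == 1234 && c.truncated());
  // A truncated column/row makes the gather reuse the previous vertex:
  // deltaX = truncated ? 0 : 1, exactly as in SubGrid::gather().
  return 0;
}
```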
__forceinline SubGrid subgrid(const size_t i) const { + assert(i < N); + assert(primID(i) != -1); + return SubGrid(x(i),y(i),geomID(),primID(i)); + } + + __forceinline float adjustTime(const float t) const { return time_scale * (t-time_offset); } + + template<int K> + __forceinline vfloat<K> adjustTime(const vfloat<K> &t) const { return time_scale * (t-time_offset); } + + public: + SubGridID subgridIDs[N]; + + typename BVHN<N>::QuantizedBaseNodeMB qnode; + + float time_offset; + float time_scale; + unsigned int _geomID; // geometry ID of mesh + + + friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) { + cout << "SubGridMBQBVHN " << embree_endl; + for (size_t i=0;i<N;i++) + cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl; + cout << "geomID " << sg._geomID << embree_endl; + cout << "time_offset " << sg.time_offset << embree_endl; + cout << "time_scale " << sg.time_scale << embree_endl; + cout << "lowerX " << sg.qnode.node0.dequantizeLowerX() << embree_endl; + cout << "upperX " << sg.qnode.node0.dequantizeUpperX() << embree_endl; + cout << "lowerY " << sg.qnode.node0.dequantizeLowerY() << embree_endl; + cout << "upperY " << sg.qnode.node0.dequantizeUpperY() << embree_endl; + cout << "lowerZ " << sg.qnode.node0.dequantizeLowerZ() << embree_endl; + cout << "upperZ " << sg.qnode.node0.dequantizeUpperZ() << embree_endl; + cout << "lowerX " << sg.qnode.node1.dequantizeLowerX() << embree_endl; + cout << "upperX " << sg.qnode.node1.dequantizeUpperX() << embree_endl; + cout << "lowerY " << sg.qnode.node1.dequantizeLowerY() << embree_endl; + cout << "upperY " << sg.qnode.node1.dequantizeUpperY() << embree_endl; + cout << "lowerZ " << sg.qnode.node1.dequantizeLowerZ() << embree_endl; + cout << "upperZ " << sg.qnode.node1.dequantizeUpperZ() << embree_endl; + return cout; + } + + }; + +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h new file mode 100644 index 0000000000..ad5fee2e4e --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h @@ -0,0 +1,517 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "subgrid_intersector_moeller.h" +#include "subgrid_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + + // ======================================================================================= + // =================================== SubGridIntersectors =============================== + // ======================================================================================= + + + template<int N, bool filter> + struct SubGridIntersector1Moeller + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = 
context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); + assert(accel); + context->geomID = subgrid.geomID(); + context->primID = subgrid.primID(); + return accel->pointQuery(query, context); + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } else { + mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + changed |= pointQuery(query, context, prim[i].subgrid(ID)); + } + } + return changed; + } + }; + + template<int N, bool filter> + struct SubGridIntersector1Pluecker + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = 
context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); + context->geomID = subgrid.geomID(); + context->primID = subgrid.primID(); + return accel->pointQuery(query, context); + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } else { + mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } +#if defined(__AVX__) + STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + changed |= pointQuery(query, context, prim[i].subgrid(ID)); + } + } + return changed; + } + }; + + template<int N, int K, bool filter> + struct SubGridIntersectorKMoeller + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + 
STAT3(normal.trav_prims,1,popcnt(valid_i),K); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool<K> valid0 = valid_i; + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + vbool<K> valid0 = valid; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { 
+ vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + + + template<int N, int K, bool filter> + struct SubGridIntersectorKPluecker + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool<K> valid0 = valid_i; + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + + break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + 
Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + vbool<K> valid0 = valid; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h new file mode 100644 index 0000000000..64937d34fe --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h @@ -0,0 +1,382 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "quad_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + + /* ----------------------------- */ + /* -- single ray intersectors -- */ + /* ----------------------------- */ + + template<int M> + __forceinline void interpolateUV(MoellerTrumboreHitM<M,UVIdentity<M>> &hit,const GridMesh::Grid &g, 
const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) + { + /* correct U,V interpolation across the entire grid */ + const vint<M> sx((int)subgrid.x()); + const vint<M> sy((int)subgrid.y()); + const vint<M> sxM(sx + stepX); + const vint<M> syM(sy + stepY); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + (vfloat<M>)sxM * hit.absDen) * inv_resX; + hit.V = (hit.V + (vfloat<M>)syM * hit.absDen) * inv_resY; + } + + template<int M, bool filter> + struct SubGridQuadMIntersector1MoellerTrumbore; + + template<int M, bool filter> + struct SubGridQuadMIntersector1MoellerTrumbore + { + __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} + + __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined (__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct SubGridQuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} + + __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + UVIdentity<8> mapUV; + MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV); + MoellerTrumboreIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit))) + { + /* correct U,V interpolation across the entire grid */ + const vfloat8 U = select(flags,hit.absDen - hit.V,hit.U); + const vfloat8 V = select(flags,hit.absDen - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + // ============================================================================================================================ + // ============================================================================================================================ + // ============================================================================================================================ + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + template<int K> + __forceinline void interpolateUV(const vbool<K>& valid, MoellerTrumboreHitK<K,UVIdentity<K>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const unsigned int i) + { + /* correct U,V interpolation across the entire grid */ + const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); + const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); + const float inv_resX = rcp((float)(int)(g.resX-1)); + const float inv_resY = rcp((float)(int)(g.resY-1)); + hit.U = select(valid,(hit.U + vfloat<K>((float)sx) * hit.absDen) * inv_resX,hit.U); + hit.V = select(valid,(hit.V + vfloat<K>((float)sy) * hit.absDen) * inv_resY,hit.V); + } + + template<int M, int K, bool filter> + struct 
SubGridQuadMIntersectorKMoellerTrumboreBase + { + __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {} + + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + MoellerTrumboreIntersectorK<M,K> intersector; + + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + } + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + } + return any(valid0|valid1); + } + + template<typename Epilog> + __forceinline bool occludedK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + MoellerTrumboreIntersectorK<M,K> intersector; + + vbool<K> valid_final = valid; + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + valid_final &= !valid0; + } + if (none(valid_final)) return true; + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + valid_final &= !valid1; + } + return none(valid_final); + } + + static __forceinline bool intersect1(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + MoellerTrumboreHitM<M,UVIdentity<M>> &hit) + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + MoellerTrumboreIntersectorK<8,K> intersector; + UVIdentity<M> mapUV; + return intersector.intersectEdge(ray,k,v0,e1,e2,mapUV,hit); + } + + }; + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter> + { + __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + MoellerTrumboreIntersectorK<M,K> intersector; + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + 
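+          // [editor's note, not upstream code] Each subgrid cell (v0,v1,v2,v3) is tested as
+          // two triangles, (v0,v1,v3) and (v2,v3,v1). The second triangle's barycentric
+          // axes run opposite to the cell's u/v axes, so its hit is mirrored back into
+          // cell space before the grid-level UV correction below. In normalized terms the
+          // flip above is simply (absDen only postpones the division by the determinant):
+          //   u_cell = 1.0f - u_tri;
+          //   v_cell = 1.0f - v_tri;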
interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + MoellerTrumboreIntersectorK<M,K> intersector; + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + return false; + } + }; + + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + + UVIdentity<8> mapUV; + MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV); + if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit)) + { + const vfloat8 U = select(flags,hit.absDen - hit.V,hit.U); + const vfloat8 V = select(flags,hit.absDen - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + + } + return false; + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + + + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h 
b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h new file mode 100644 index 0000000000..5ded56e1f7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h @@ -0,0 +1,367 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + + template<int M> + __forceinline void interpolateUV(PlueckerHitM<M,UVIdentity<M>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) + { + /* correct U,V interpolation across the entire grid */ + const vint<M> sx((int)subgrid.x()); + const vint<M> sy((int)subgrid.y()); + const vint<M> sxM(sx + stepX); + const vint<M> syM(sy + stepY); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + vfloat<M>(sxM) * hit.UVW) * inv_resX; + hit.V = (hit.V + vfloat<M>(syM) * hit.UVW) * inv_resY; + } + + template<int M, bool filter> + struct SubGridQuadMIntersector1Pluecker; + + template<int M, bool filter> + struct SubGridQuadMIntersector1Pluecker + { + __forceinline SubGridQuadMIntersector1Pluecker() {} + + __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + PlueckerIntersector1<M> intersector(ray,nullptr); + + Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + PlueckerIntersector1<M> intersector(ray,nullptr); + Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined (__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct SubGridQuadMIntersector1Pluecker<4,filter> + { + __forceinline SubGridQuadMIntersector1Pluecker() {} + + __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + + UVIdentity<8> mapUV; + PlueckerHitM<8,UVIdentity<8>> hit(mapUV); + PlueckerIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit))) + { + /* correct U,V interpolation across the entire grid */ + const vfloat8 U = select(flags,hit.UVW - hit.V,hit.U); + const vfloat8 V = select(flags,hit.UVW - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + template<int K> + __forceinline void interpolateUV(const vbool<K>& valid, PlueckerHitK<K,UVIdentity<K>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const unsigned int i) + { + /* correct U,V interpolation across the entire grid */ + const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); + const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); + const float inv_resX = rcp((float)(int)(g.resX-1)); + const float inv_resY = rcp((float)(int)(g.resY-1)); + hit.U = select(valid,(hit.U + vfloat<K>((float)sx) * hit.UVW) * inv_resX,hit.U); + hit.V = select(valid,(hit.V + vfloat<K>((float)sy) * hit.UVW) * inv_resY,hit.V); + } + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKPlueckerBase + { + __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} + + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + 
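+        // [editor's note] UVIdentity<K> is used as a pass-through UV mapper here: the
+        // Pluecker barycentrics are left untouched at this stage and only remapped to
+        // grid-global coordinates afterwards by interpolateUV(). A minimal sketch of
+        // such an identity mapper (illustrative only; the real definition lives elsewhere):
+        //   struct IdentityUV { void operator()(vfloat<K>&, vfloat<K>&, Vec3vf<K>&) const {} };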
PlueckerHitK<K,UVIdentity<K>> hit(mapUV); + PlueckerIntersectorK<M,K> intersector; + + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + } + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + } + return any(valid0|valid1); + } + + template<typename Epilog> + __forceinline bool occludedK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + PlueckerHitK<K,UVIdentity<K>> hit(mapUV); + PlueckerIntersectorK<M,K> intersector; + + vbool<K> valid_final = valid; + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + valid_final &= !valid0; + } + if (none(valid_final)) return true; + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + valid_final &= !valid1; + } + return none(valid_final); + } + + + }; + + + + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase<M,K,filter> + { + __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + PlueckerIntersectorK<M,K> intersector; + + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + PlueckerIntersectorK<M,K> intersector; + + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) 
return true; + } + return false; + } + }; + + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct SubGridQuadMIntersectorKPluecker<4,K,filter> : public SubGridQuadMIntersectorKPlueckerBase<4,K,filter> + { + __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + UVIdentity<8> mapUV; + PlueckerHitM<8,UVIdentity<8>> hit(mapUV); + PlueckerIntersectorK<8,K> intersector; + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,k,vtx0,vtx1,vtx2,mapUV,hit))) + { + /* correct U,V interpolation across the entire grid */ + const vfloat8 U = select(flags,hit.UVW - hit.V,hit.U); + const vfloat8 V = select(flags,hit.UVW - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + }; +#endif + + + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h new file mode 100644 index 0000000000..473d656e24 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h @@ -0,0 +1,236 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid_intersector.h" + +namespace embree +{ + namespace isa + { + template<int N, bool filter> + struct SubGridMBIntersector1Pluecker + { + typedef SubGridMBQBVHN<N> Primitive; + typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + float ftime; + const int itime = mesh->timeSegment(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; 
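+        // [editor's note] Motion-blur path: timeSegment() splits ray.time() into a
+        // segment index itime and a local fraction ftime. gatherMB() then fetches this
+        // cell's vertices for that segment; presumably (the helper is defined elsewhere)
+        // it blends the two bracketing vertex keys linearly, roughly
+        //   v = (1.0f - ftime) * v_key[itime] + ftime * v_key[itime + 1];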
subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + float ftime; + const int itime = mesh->timeSegment(ray.time(), ftime); + + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, subgrid); + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + const float time = prim[i].adjustTime(ray.time()); + + assert(time <= 1.0f); + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + for (size_t i=0;i<num;i++) + { + const float time = prim[i].adjustTime(ray.time()); + assert(time <= 1.0f); + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + assert(false && "not implemented"); + return false; + } + }; + + + template<int N, int K, bool filter> + struct SubGridMBIntersectorKPluecker + { + typedef SubGridMBQBVHN<N> Primitive; + typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + size_t m_valid = movemask(valid_i); + while(m_valid) + { + size_t ID = bscf(m_valid); + intersect(pre,ray,ID,context,subgrid); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool<K> valid0 = valid_i; + size_t m_valid = movemask(valid_i); + while(m_valid) + { + size_t ID = bscf(m_valid); + if (occluded(pre,ray,ID,context,subgrid)) + clear(valid0,ID); + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = 
context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + const vfloat<K> time = prim[j].template adjustTime<K>(ray.time()); + + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + + vbool<K> valid0 = valid; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + const vfloat<K> time = prim[j].template adjustTime<K>(ray.time()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + const float time = prim[i].adjustTime(ray.time()[k]); + assert(time <= 1.0f); + + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + const float time = prim[i].adjustTime(ray.time()[k]); + assert(time <= 1.0f); + + size_t mask = 
isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle.h b/thirdparty/embree/kernels/geometry/triangle.h new file mode 100644 index 0000000000..24b758ae48 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle.h @@ -0,0 +1,162 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Precalculated representation for M triangles. Stores for each + triangle a base vertex, two edges, and the geometry normal to + speed up intersection calculations */ + template<int M> + struct TriangleM + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleM() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleM(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns true if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangle */ + __forceinline BBox3fa bounds() const + { + Vec3vf<M> p0 = v0; + Vec3vf<M> p1 = v0-e1; + Vec3vf<M> p2 = v0+e2; + Vec3vf<M> lower = min(p0,p1,p2); + Vec3vf<M> upper = max(p0,p1,p2); + vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(TriangleM* dst, const TriangleM& src) + { + vfloat<M>::store_nt(&dst->v0.x,src.v0.x); + vfloat<M>::store_nt(&dst->v0.y,src.v0.y); + vfloat<M>::store_nt(&dst->v0.z,src.v0.z); + 
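+      // [editor's note] TriangleM holds M triangles in struct-of-arrays form: the base
+      // vertex v0 plus the precomputed edges e1 = v0-v1 and e2 = v2-v0. store_nt writes
+      // each component vector with non-temporal stores so that filling leaf nodes during
+      // BVH build does not pollute the cache; on SSE, vfloat<4>::store_nt presumably
+      // lowers to something like
+      //   _mm_stream_ps(dst, src);  // streaming store, bypasses the cache hierarchy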
vfloat<M>::store_nt(&dst->e1.x,src.e1.x); + vfloat<M>::store_nt(&dst->e1.y,src.e1.y); + vfloat<M>::store_nt(&dst->e1.z,src.e1.z); + vfloat<M>::store_nt(&dst->e2.x,src.e2.x); + vfloat<M>::store_nt(&dst->e2.y,src.e2.y); + vfloat<M>::store_nt(&dst->e2.z,src.e2.z); + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M; i++) + { + if (unlikely(geomID(i) == -1)) break; + const unsigned geomId = geomID(i); + const unsigned primId = primID(i); + const TriangleMesh::Triangle& tri = mesh->triangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); + return bounds; + } + + public: + Vec3vf<M> v0; // base vertex of the triangles + Vec3vf<M> e1; // 1st edge of the triangles (v0-v1) + Vec3vf<M> e2; // 2nd edge of the triangles (v2-v0) + private: + vuint<M> geomIDs; // geometry IDs + vuint<M> primIDs; // primitive IDs + }; + + template<int M> + typename TriangleM<M>::Type TriangleM<M>::type; + + typedef TriangleM<4> Triangle4; +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_intersector.h new file mode 100644 index 0000000000..2cdff78ec8 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector.h @@ -0,0 +1,96 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "triangle_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMIntersector1Moeller + { + typedef TriangleM<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. 
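+     *  [editor's note] The work is split between the precalculated intersector, which
+     *  produces a (valid mask, hit) pair for all M triangles at once, and an epilog
+     *  functor (Intersect1EpilogM / Occluded1EpilogM), which, as the names suggest,
+     *  picks the closest valid lane, applies any intersection filters and writes
+     *  geomID/primID/u/v/t back into the ray. The same epilog pattern recurs throughout
+     *  the intersectors in this folder.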
*/ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMIntersectorKMoeller + { + typedef TriangleM<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri) + { + STAT_USER(0,TriangleM<M>::max_size()); + for (size_t i=0; i<TriangleM<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i); + const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i); + pre.intersectEdgeK(valid_i,ray,p0,e1,e2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<TriangleM<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i); + const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i); + pre.intersectEdgeK(valid0,ray,p0,e1,e2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h new file mode 100644 index 0000000000..0a42d8f08b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +/*! This intersector implements a modified version of the Moeller + * Trumbore intersector from the paper "Fast, Minimum Storage + * Ray-Triangle Intersection". In contrast to the paper we + * precalculate some factors and factor the calculations differently + * to allow precalculating the cross product e1 x e2. The resulting + * algorithm is similar to the fastest one of the paper "Optimizing + * Ray-Triangle Intersection via Automated Search". */ + +namespace embree +{ + namespace isa + { + template<int M, typename UVMapper> + struct MoellerTrumboreHitM + { + __forceinline MoellerTrumboreHitM(const UVMapper& mapUV) : mapUV(mapUV) {} + + __forceinline MoellerTrumboreHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const vfloat<M>& absDen, const Vec3vf<M>& Ng, const UVMapper& mapUV) + : U(U), V(V), T(T), absDen(absDen), mapUV(mapUV), valid(valid), vNg(Ng) {} + + __forceinline void finalize() + { + const vfloat<M> rcpAbsDen = rcp(absDen); + vt = T * rcpAbsDen; + vu = U * rcpAbsDen; + vv = V * rcpAbsDen; + mapUV(vu,vv,vNg); + } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + vfloat<M> absDen; + UVMapper mapUV; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M, bool early_out = true> + struct MoellerTrumboreIntersector1 + { + __forceinline MoellerTrumboreIntersector1() {} + + __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {} + + template<typename UVMapper> + __forceinline bool intersect(const vbool<M>& valid0, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Vec3vf<M>& tri_Ng, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + /* calculate denominator */ + vbool<M> valid = valid0; + const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org); + const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir); + const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O; + const Vec3vf<M> R = cross(C,D); + const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D); + + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen; + const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if 
defined(EMBREE_BACKFACE_CULLING) + valid &= (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + valid &= (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(early_out && none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar)); + if (likely(early_out && none(valid))) return false; + + /* update hit information */ + new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV); + + return true; + } + + template<typename UVMapper> + __forceinline bool intersectEdge(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1); + return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + vbool<M> valid = true; + const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1); + return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,v0,e1,e2,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(valid,ray,v0,e1,e2,mapUV,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& e1, + const Vec3vf<M>& e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,v0,e1,e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + auto mapUV = UVIdentity<M>(); + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(valid,ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return 
false; + } + }; + + template<int K, typename UVMapper> + struct MoellerTrumboreHitK + { + __forceinline MoellerTrumboreHitK(const UVMapper& mapUV) : mapUV(mapUV) {} + __forceinline MoellerTrumboreHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng, const UVMapper& mapUV) + : U(U), V(V), T(T), absDen(absDen), Ng(Ng), mapUV(mapUV) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vfloat<K> rcpAbsDen = rcp(absDen); + const vfloat<K> t = T * rcpAbsDen; + vfloat<K> u = U * rcpAbsDen; + vfloat<K> v = V * rcpAbsDen; + Vec3vf<K> vNg = Ng; + mapUV(u,v,vNg); + return std::make_tuple(u,v,t,vNg); + } + + vfloat<K> U; + vfloat<K> V; + const vfloat<K> T; + const vfloat<K> absDen; + const Vec3vf<K> Ng; + const UVMapper& mapUV; + }; + + template<int M, int K> + struct MoellerTrumboreIntersectorK + { + __forceinline MoellerTrumboreIntersectorK() {} + __forceinline MoellerTrumboreIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + //RayK<K>& ray, + const Vec3vf<K>& ray_org, + const Vec3vf<K>& ray_dir, + const vfloat<K>& ray_tnear, + const vfloat<K>& ray_tfar, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Vec3vf<K>& tri_Ng, + const UVMapper& mapUV, + MoellerTrumboreHitK<K,UVMapper> &hit) const + { + /* calculate denominator */ + vbool<K> valid = valid0; + const Vec3vf<K> C = tri_v0 - ray_org; + const Vec3vf<K> R = cross(C,ray_dir); + const vfloat<K> den = dot(tri_Ng,ray_dir); + const vfloat<K> absDen = abs(den); + const vfloat<K> sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat<K> U = dot(tri_e2,R) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat<K> V = dot(tri_e1,R) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat<K> W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + new (&hit) MoellerTrumboreHitK<K,UVMapper>(U,V,T,absDen,tri_Ng,mapUV); + return valid; + } + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + MoellerTrumboreHitK<K,UVMapper> &hit) const + { + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit); + } + + + /*! Intersects K rays with one of M triangles. 
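+      *  [editor's note] MoellerTrumboreIntersectorK supports two traversal flavours:
+      *  the intersectK()/intersectEdgeK() overloads test K rays against one triangle
+      *  (ray-parallel, used by the packet intersectors above), while the
+      *  intersect(ray,k,...)/intersectEdge(ray,k,...) overloads further down broadcast
+      *  the k-th ray of the packet and test it against M triangles at once
+      *  (triangle-parallel, used by the per-ray paths).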
*/ + template<typename UVMapper, typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit); + return epilog(valid,hit); + } + + + + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper, typename Epilog> + __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1); + const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit); + return epilog(valid,hit); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + template<typename UVMapper> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + /* calculate denominator */ + typedef Vec3vf<M> Vec3vfM; + const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1); + + const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vfM C = Vec3vfM(tri_v0) - O; + const Vec3vfM R = cross(C,D); + const vfloat<M> den = dot(Vec3vfM(tri_Ng),D); + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen; + const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV); + return true; + } + + template<typename UVMapper> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + if 
(likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,mapUV,hit))) + { + hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); + hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper; + return any(hit.valid); + } + return false; + } + + template<typename UVMapper> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,mapUV,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,mapUV,epilog); + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + return intersect(ray,k,v0,v1,v2,UVIdentity<M>(),epilog); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,time_range,v0,e1,e2,mapUV,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h new file mode 100644 index 0000000000..8fbefcea88 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h @@ -0,0 +1,407 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "trianglev.h" +#include "trianglev_mb.h" +#include "intersector_epilog.h" + +/*! Modified Pluecker ray/triangle intersector. The test first shifts + * the ray origin into the origin of the coordinate system and then + * uses Pluecker coordinates for the intersection. Due to the shift, + * the Pluecker coordinate calculation simplifies and the tests get + * numerically stable. The edge equations are watertight along the + * edge for neighboring triangles. 
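+ *
+ *  [editor's addition] Reading the single-ray test below in these terms: with the
+ *  ray origin shifted to 0 and direction D, each edge contributes a value such as
+ *      U = dot(cross(v2-v0, v2+v0), D) = 2 * det(v2, v0, D),
+ *  i.e. (up to a constant factor) the signed volume of the parallelepiped spanned
+ *  by D and the two shifted edge vertices. The ray hits the triangle when U, V and
+ *  W agree in sign (within an epsilon for watertightness), and the barycentric hit
+ *  coordinates are recovered as u = U/UVW and v = V/UVW with UVW = U+V+W.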
*/ + +namespace embree +{ + namespace isa + { + template<int M, typename UVMapper> + struct PlueckerHitM + { + __forceinline PlueckerHitM(const UVMapper& mapUV) : mapUV(mapUV) {} + + __forceinline PlueckerHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW, const vfloat<M>& t, const Vec3vf<M>& Ng, const UVMapper& mapUV) + : U(U), V(V), UVW(UVW), mapUV(mapUV), valid(valid), vt(t), vNg(Ng) {} + + __forceinline void finalize() + { + const vbool<M> invalid = abs(UVW) < min_rcp_input; + const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); + vu = min(U * rcpUVW,1.0f); + vv = min(V * rcpUVW,1.0f); + mapUV(vu,vv,vNg); + } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> UVW; + const UVMapper& mapUV; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M, bool early_out = true> + struct PlueckerIntersector1 + { + __forceinline PlueckerIntersector1() {} + + __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {} + + template<typename UVMapper> + __forceinline bool intersect(const vbool<M>& valid0, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + vbool<M> valid = valid0; + + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org); + const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(early_out && none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); + valid &= den != vfloat<M>(zero); + if (unlikely(early_out && none(valid))) return false; + + /* update hit information */ + new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV); + return true; + } + + template<typename UVMapper> + __forceinline bool intersectEdge(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + return intersect(valid,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& 
tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + vbool<M> valid = true; + return intersect(valid,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + return intersectEdge(ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& e1, + const Vec3vf<M>& e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,v0,e1,e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + auto mapUV = UVIdentity<M>(); + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(valid,ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + }; + + template<int K, typename UVMapper> + struct PlueckerHitK + { + __forceinline PlueckerHitK(const UVMapper& mapUV) : mapUV(mapUV) {} + + __forceinline PlueckerHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW, const vfloat<K>& t, const Vec3vf<K>& Ng, const UVMapper& mapUV) + : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vbool<K> invalid = abs(UVW) < min_rcp_input; + const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); + vfloat<K> u = min(U * rcpUVW,1.0f); + vfloat<K> v = min(V * rcpUVW,1.0f); + Vec3vf<K> vNg = Ng; + mapUV(u,v,vNg); + return std::make_tuple(u,v,t,vNg); + } + vfloat<K> U; + vfloat<K> V; + const vfloat<K> UVW; + const vfloat<K> t; + const Vec3vf<K> Ng; + const UVMapper& mapUV; + }; + + template<int M, int K> + struct PlueckerIntersectorK + { + __forceinline PlueckerIntersectorK() {} + __forceinline PlueckerIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. 
*/ + template<typename UVMapper> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + PlueckerHitK<K,UVMapper> &hit) const + { + /* calculate vertices relative to ray origin */ + vbool<K> valid = valid0; + const Vec3vf<K> O = ray.org; + const Vec3vf<K> D = ray.dir; + const Vec3vf<K> v0 = tri_v0-O; + const Vec3vf<K> v1 = tri_v1-O; + const Vec3vf<K> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<K> e0 = v2-v0; + const Vec3vf<K> e1 = v0-v1; + const Vec3vf<K> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D); + const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D); + const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D); + const vfloat<K> UVW = U+V+W; + const vfloat<K> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return valid; + + /* calculate geometry normal and denominator */ + const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D)); + + /* perform depth test */ + const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng))); + const vfloat<K> t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return valid; + + /* calculate hit information */ + new (&hit) PlueckerHitK<K,UVMapper>(U,V,UVW,t,Ng,mapUV); + return valid; + } + + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + PlueckerHitK<K,UVIdentity<K>> hit(mapUV); + const vbool<K> valid = intersectK(valid0,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + return epilog(valid,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitK<K,UVMapper> hit(mapUV); + const vbool<K> valid = intersectK(valid0,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + return epilog(valid,hit); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + template<typename UVMapper> + __forceinline bool intersect(RayK<K>& ray, size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper> &hit) const + { + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = max(U,V,W) <= eps; +#else + vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat<M>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV); + return true; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(RayK<K>& ray, size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (intersect(ray,k,tri_v0,tri_v1,tri_v2,mapUV,hit)) + return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const Epilog& epilog) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + if (intersect(ray,k,tri_v0,tri_v1,tri_v2,mapUV,hit)) + return epilog(hit.valid,hit); + return false; + } + + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h new file mode 100644 index 0000000000..f05dcc4537 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h @@ -0,0 +1,418 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +/*! 
This intersector implements a modified version of the Woop's ray-triangle intersection test */ + +namespace embree +{ + namespace isa + { + template<int M> + struct WoopHitM + { + __forceinline WoopHitM() {} + + __forceinline WoopHitM(const vbool<M>& valid, + const vfloat<M>& U, + const vfloat<M>& V, + const vfloat<M>& T, + const vfloat<M>& inv_det, + const Vec3vf<M>& Ng) + : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {} + + __forceinline void finalize() + { + vt = T; + vu = U*inv_det; + vv = V*inv_det; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + const vfloat<M> U; + const vfloat<M> V; + const vfloat<M> T; + const vfloat<M> inv_det; + + public: + const vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct WoopPrecalculations1 + { + unsigned int kx,ky,kz; + Vec3vf<M> org; + Vec3fa S; + __forceinline WoopPrecalculations1() {} + + __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr) + { + kz = maxDim(abs(ray.dir)); + kx = (kz+1) % 3; + ky = (kx+1) % 3; + const float inv_dir_kz = rcp(ray.dir[kz]); + if (ray.dir[kz]) std::swap(kx,ky); + S.x = ray.dir[kx] * inv_dir_kz; + S.y = ray.dir[ky] * inv_dir_kz; + S.z = inv_dir_kz; + org = Vec3vf<M>(ray.org[kx],ray.org[ky],ray.org[kz]); + } + }; + + + template<int M> + struct WoopIntersector1 + { + + typedef WoopPrecalculations1<M> Precalculations; + + __forceinline WoopIntersector1() {} + + __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {} + + static __forceinline bool intersect(const vbool<M>& valid0, + Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + WoopHitM<M>& hit) + { + vbool<M> valid = valid0; + + /* vertices relative to ray origin */ + const Vec3vf<M> org = Vec3vf<M>(pre.org.x,pre.org.y,pre.org.z); + const Vec3vf<M> A = Vec3vf<M>(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org; + const Vec3vf<M> B = Vec3vf<M>(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org; + const Vec3vf<M> C = Vec3vf<M>(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org; + + /* shear and scale vertices */ + const vfloat<M> Ax = nmadd(A.z,pre.S.x,A.x); + const vfloat<M> Ay = nmadd(A.z,pre.S.y,A.y); + const vfloat<M> Bx = nmadd(B.z,pre.S.x,B.x); + const vfloat<M> By = nmadd(B.z,pre.S.y,B.y); + const vfloat<M> Cx = nmadd(C.z,pre.S.x,C.x); + const vfloat<M> Cy = nmadd(C.z,pre.S.y,C.y); + + /* scaled barycentric */ + const vfloat<M> U0 = Cx*By; + const vfloat<M> U1 = Cy*Bx; + const vfloat<M> V0 = Ax*Cy; + const vfloat<M> V1 = Ay*Cx; + const vfloat<M> W0 = Bx*Ay; + const vfloat<M> W1 = By*Ax; +#if !defined(__AVX512F__) + valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) | + (U0 <= U1) & (V0 <= V1) & (W0 <= W1); +#else + valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1); +#endif + + if (likely(none(valid))) return false; + const vfloat<M> U = U0-U1; + const vfloat<M> V = V0-V1; + const vfloat<M> W = W0-W1; + + const vfloat<M> det = U+V+W; + + valid &= det != 0.0f; + const vfloat<M> inv_det = rcp(det); + + const vfloat<M> Az = pre.S.z * A.z; + const vfloat<M> Bz = pre.S.z * B.z; + const vfloat<M> Cz = pre.S.z * C.z; + const vfloat<M> T = madd(U,Az,madd(V,Bz,W*Cz)); + const vfloat<M> t = T * inv_det; + /* perform depth test */ + valid &= (vfloat<M>(ray.tnear()) < t) & (t 
<= vfloat<M>(ray.tfar)); + if (likely(none(valid))) return false; + + const Vec3vf<M> tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1); + + /* update hit information */ + new (&hit) WoopHitM<M>(valid,U,V,t,inv_det,tri_Ng); + return true; + } + + static __forceinline bool intersect(Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + WoopHitM<M>& hit) + { + vbool<M> valid = true; + return intersect(valid,ray,pre,v0,v1,v2,hit); + } + + + template<typename Epilog> + static __forceinline bool intersect(Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) + { + WoopHitM<M> hit; + if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) + { + WoopHitM<M> hit; + if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + }; + +#if 0 + template<int K> + struct WoopHitK + { + __forceinline WoopHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng) + : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vfloat<K> rcpAbsDen = rcp(absDen); + const vfloat<K> t = T * rcpAbsDen; + const vfloat<K> u = U * rcpAbsDen; + const vfloat<K> v = V * rcpAbsDen; + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat<K> U; + const vfloat<K> V; + const vfloat<K> T; + const vfloat<K> absDen; + const Vec3vf<K> Ng; + }; + + template<int M, int K> + struct WoopIntersectorK + { + __forceinline WoopIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. 
*/ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + //RayK<K>& ray, + const Vec3vf<K>& ray_org, + const Vec3vf<K>& ray_dir, + const vfloat<K>& ray_tnear, + const vfloat<K>& ray_tfar, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Vec3vf<K>& tri_Ng, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool<K> valid = valid0; + const Vec3vf<K> C = tri_v0 - ray_org; + const Vec3vf<K> R = cross(C,ray_dir); + const vfloat<K> den = dot(tri_Ng,ray_dir); + const vfloat<K> absDen = abs(den); + const vfloat<K> sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat<K> U = dot(tri_e2,R) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat<K> V = dot(tri_e1,R) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat<K> W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + WoopHitK<K> hit(U,V,T,absDen,tri_Ng); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const Epilog& epilog) const + { + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog); + } + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Epilog& epilog) const + { + const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + WoopHitM<M>& hit) const + { + /* calculate denominator */ + typedef Vec3vf<M> Vec3vfM; + const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1); + + const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vfM C = Vec3vfM(tri_v0) - O; + const Vec3vfM R = cross(C,D); + const vfloat<M> den = dot(Vec3vfM(tri_Ng),D); + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen; + const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) WoopHitM<M>(valid,U,V,T,absDen,tri_Ng); + return true; + } + + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + WoopHitM<M>& hit) const + { + if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) + { + hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); + hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper; + return any(hit.valid); + } + return false; + } + + template<typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Epilog& epilog) const + { + WoopHitM<M> hit; + if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Epilog& epilog) const + { + WoopHitM<M> hit; + if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,epilog); + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,time_range,v0,e1,e2,epilog); + } + }; +#endif + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h new file mode 100644 index 0000000000..50106bcc16 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h @@ -0,0 +1,132 @@ +// Copyright 2009-2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#include "primitive.h" + +namespace embree +{ + namespace isa + { + struct TriangleTriangleIntersector + { + __forceinline static float T(float pa0, float pa1, float da0, float da1) { + return pa0 + (pa1-pa0)*da0/(da0-da1); + } + + __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) { + return det(p-a0,a0-a1) >= 0.0f; + } + + __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) + { + const bool pab = point_line_side(p,a,b); + const bool pbc = point_line_side(p,b,c); + const bool pca = point_line_side(p,c,a); + return pab == pbc && pab == pca; + } + + __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1) + { + const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1); + const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1); + return different_sides0 && different_sides1; + } + + __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, + const Vec2f& b0, const Vec2f& b1, const Vec2f& b2) + { + const bool a01_b01 = intersect_line_line(a0,a1,b0,b1); + if (a01_b01) return true; + const bool a01_b12 = intersect_line_line(a0,a1,b1,b2); + if (a01_b12) return true; + const bool a01_b20 = intersect_line_line(a0,a1,b2,b0); + if (a01_b20) return true; + const bool a12_b01 = intersect_line_line(a1,a2,b0,b1); + if (a12_b01) return true; + const bool a12_b12 = intersect_line_line(a1,a2,b1,b2); + if (a12_b12) return true; + const bool a12_b20 = intersect_line_line(a1,a2,b2,b0); + if (a12_b20) return true; + const bool a20_b01 = intersect_line_line(a2,a0,b0,b1); + if (a20_b01) return true; + const bool a20_b12 = intersect_line_line(a2,a0,b1,b2); + if (a20_b12) return true; + const bool a20_b20 = intersect_line_line(a2,a0,b2,b0); + if (a20_b20) return true; + + bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2); + if (a_in_b) return true; + + bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2); + if (b_in_a) return true; + + return false; + } + + static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2, + const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2) + { + const float eps = 1E-5f; + + /* calculate triangle planes */ + const Vec3fa Na = cross(a1-a0,a2-a0); + const float Ca = dot(Na,a0); + const Vec3fa Nb = cross(b1-b0,b2-b0); + const float Cb = dot(Nb,b0); + + /* project triangle A onto plane B */ + const float da0 = dot(Nb,a0)-Cb; + const float da1 = dot(Nb,a1)-Cb; + const float da2 = dot(Nb,a2)-Cb; + if (max(da0,da1,da2) < -eps) return false; + if (min(da0,da1,da2) > +eps) return false; + //CSTAT(bvh_collide_prim_intersections4++); + + /* project triangle B onto plane A */ + const float db0 = dot(Na,b0)-Ca; + const float db1 = dot(Na,b1)-Ca; + const float db2 = dot(Na,b2)-Ca; + if (max(db0,db1,db2) < -eps) return false; + if (min(db0,db1,db2) > +eps) return false; + //CSTAT(bvh_collide_prim_intersections5++); + + if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) || + (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps))) + { + const size_t dz = maxDim(Na); + const size_t dx = (dz+1)%3; + const size_t dy = (dx+1)%3; + const Vec2f A0(a0[dx],a0[dy]); + const Vec2f 
A1(a1[dx],a1[dy]); + const Vec2f A2(a2[dx],a2[dy]); + const Vec2f B0(b0[dx],b0[dy]); + const Vec2f B1(b1[dx],b1[dy]); + const Vec2f B2(b2[dx],b2[dy]); + return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2); + } + + const Vec3fa D = cross(Na,Nb); + const float pa0 = dot(D,a0); + const float pa1 = dot(D,a1); + const float pa2 = dot(D,a2); + const float pb0 = dot(D,b0); + const float pb1 = dot(D,b1); + const float pb2 = dot(D,b2); + + BBox1f ba = empty; + if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1)); + if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2)); + if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0)); + + BBox1f bb = empty; + if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1)); + if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2)); + if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0)); + + return conjoint(ba,bb); + } + }; + } +} + + diff --git a/thirdparty/embree/kernels/geometry/trianglei.h b/thirdparty/embree/kernels/geometry/trianglei.h new file mode 100644 index 0000000000..6aad48a5ef --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglei.h @@ -0,0 +1,442 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene.h" + +namespace embree +{ + /* Stores M triangles from an indexed face set */ + template <int M> + struct TriangleMi + { + /* Virtual interface to query information about the triangle type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMi() { } + + /* Construction from vertices and IDs */ + __forceinline TriangleMi(const vuint<M>& v0, + const vuint<M>& v1, + const vuint<M>& v2, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) +#if defined(EMBREE_COMPACT_POLYS) + : geomIDs(geomIDs), primIDs(primIDs) {} +#else + : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {} +#endif + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); } + + /* Returns if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M> geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M> primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return 
primIDs[i]; } + + /* Calculate the bounds of the triangles */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i)); + bounds.extend(mesh->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Non-temporal store */ + __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src) + { +#if !defined(EMBREE_COMPACT_POLYS) + vuint<M>::store_nt(&dst->v0_,src.v0_); + vuint<M>::store_nt(&dst->v1_,src.v1_); + vuint<M>::store_nt(&dst->v2_,src.v2_); +#endif + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> v0 = zero, v1 = zero, v2 = zero; + vuint<M> geomID = -1, primID = -1; + const PrimRefT* prim = &prims[begin]; + + for (size_t i=0; i<M; i++) + { + if (begin<end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); +#if !defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get<TriangleMesh>(prim->geomID()); + const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID()); + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; +#endif + begin++; + } else { + assert(i); + if (likely(i > 0)) { + geomID[i] = geomID[0]; + primID[i] = -1; + v0[i] = v0[0]; + v1[i] = v0[0]; + v2[i] = v0[0]; + } + } + if (begin<end) prim = &prims[begin]; + } + new (this) TriangleMi(v0,v1,v2,geomID,primID); // FIXME: use non temporal store + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + for (size_t i=0; i<M; i++) + { + if (primID(i) == -1) break; + const unsigned int primId = primID(i); + const TriangleMesh::Triangle& tri = mesh->triangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + } + return bounds; + } + + 
protected: +#if !defined(EMBREE_COMPACT_POLYS) + vuint<M> v0_; // 4 byte offset of 1st vertex + vuint<M> v1_; // 4 byte offset of 2nd vertex + vuint<M> v2_; // 4 byte offset of 3rd vertex +#endif + vuint<M> geomIDs; // geometry ID of mesh + vuint<M> primIDs; // primitive ID of primitive inside mesh + }; + + namespace isa + { + + template<int M> + struct TriangleMi : public embree::TriangleMi<M> + { +#if !defined(EMBREE_COMPACT_POLYS) + using embree::TriangleMi<M>::v0_; + using embree::TriangleMi<M>::v1_; + using embree::TriangleMi<M>::v2_; +#endif + using embree::TriangleMi<M>::geomIDs; + using embree::TriangleMi<M>::primIDs; + using embree::TriangleMi<M>::geomID; + using embree::TriangleMi<M>::primID; + using embree::TriangleMi<M>::valid; + + /* loads a single vertex */ + template<int vid> + __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + return (Vec3f) mesh->vertices[0][tri.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices = scene->vertices[geomID(index)]; + return (Vec3f&) vertices[v[index]]; +#endif + } + + template<int vid, typename T> + __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + const Vec3<T> p0(v0.x,v0.y,v0.z); + const Vec3<T> p1(v1.x,v1.y,v1.z); + return lerp(p0,p1,ftime); + } + + template<int vid, int K, typename T> + __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const + { + Vec3<T> p0, p1; + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + + for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; + p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; + } + return (T(one)-ftime)*p0 + ftime*p1; + } + + struct Triangle { + vfloat4 v0,v1,v2; + }; + +#if defined(EMBREE_COMPACT_POLYS) + + __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) 
return { zero, zero, zero }; + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]]; + return { v0, v1, v2 }; + } + + __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const + { + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero }; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]]; + return { v0, v1, v2 }; + } + +#else + + __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const + { + const float* vertices = scene->vertices[geomID(i)]; + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + return { v0, v1, v2 }; + } + + __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const + { + const float* vertices = (const float*) mesh->vertexPtr(0,itime); + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + return { v0, v1, v2 }; + } + +#endif + + /* Gather the triangles */ + __forceinline void gather(Vec3vf<M>& p0, Vec3vf<M>& p1, Vec3vf<M>& p2, const Scene* const scene) const; + + template<int K> +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 + __noinline +#else + __forceinline +#endif + void gather(const vbool<K>& valid, + Vec3vf<K>& p0, + Vec3vf<K>& p1, + Vec3vf<K>& p2, + const size_t index, + const Scene* const scene, + const vfloat<K>& time) const + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(time, ftime); + + const size_t first = bsf(movemask(valid)); + if (likely(all(valid,itime[first] == itime))) + { + p0 = getVertex<0>(index, scene, itime[first], ftime); + p1 = getVertex<1>(index, scene, itime[first], ftime); + p2 = getVertex<2>(index, scene, itime[first], ftime); + } else { + p0 = getVertex<0,K>(valid, index, scene, itime, ftime); + p1 = getVertex<1,K>(valid, index, scene, itime, ftime); + p2 = getVertex<2,K>(valid, index, scene, itime, ftime); + } + } + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + const TriangleMesh* mesh, + const Scene *const scene, + const int itime) const; + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + const Scene *const scene, + const float time) const; + + +#if !defined(EMBREE_COMPACT_POLYS) + template<int N> const vuint<M>& getVertexOffset() const; +#endif + }; + +#if !defined(EMBREE_COMPACT_POLYS) + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; } + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; } + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; } +#endif + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + 
Vec3vf4& p2, + const Scene* const scene) const + { + const Triangle tri0 = loadTriangle(0,scene); + const Triangle tri1 = loadTriangle(1,scene); + const Triangle tri2 = loadTriangle(2,scene); + const Triangle tri3 = loadTriangle(3,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + } + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const TriangleMesh* mesh, + const Scene *const scene, + const int itime) const + { + const Triangle tri0 = loadTriangle(0,itime,mesh); + const Triangle tri1 = loadTriangle(1,itime,mesh); + const Triangle tri2 = loadTriangle(2,itime,mesh); + const Triangle tri3 = loadTriangle(3,itime,mesh); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + } + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const Scene *const scene, + const float time) const + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(0)); // in mblur mode all geometries are identical + + float ftime; + const int itime = mesh->timeSegment(time, ftime); + + Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime); + Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + p2 = lerp(a2,b2,vfloat4(ftime)); + } + } + + template<int M> + typename TriangleMi<M>::Type TriangleMi<M>::type; + + typedef TriangleMi<4> Triangle4i; +} diff --git a/thirdparty/embree/kernels/geometry/trianglei_intersector.h b/thirdparty/embree/kernels/geometry/trianglei_intersector.h new file mode 100644 index 0000000000..f7deb9e72d --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglei_intersector.h @@ -0,0 +1,336 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "trianglei.h" +#include "triangle_intersector_moeller.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiIntersector1Moeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! 
Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMiIntersectorKMoeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + const Scene* scene = context->scene; + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + const Scene* scene = context->scene; + + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiIntersector1Pluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! 
Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMiIntersectorKPluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + const Scene* scene = context->scene; + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + const Scene* scene = context->scene; + + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiMBIntersector1Moeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. 
*/ + template<int M, int K, bool filter> + struct TriangleMiMBIntersectorKMoeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiMBIntersector1Pluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. 
*/ + template<int M, int K, bool filter> + struct TriangleMiMBIntersectorKPluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/trianglev.h b/thirdparty/embree/kernels/geometry/trianglev.h new file mode 100644 index 0000000000..cd94756b9e --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev.h @@ -0,0 +1,157 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M triangles in struct of array layout */ + template <int M> + struct TriangleMv + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMv() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns true if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangles */ + __forceinline BBox3fa bounds() const + { + Vec3vf<M> lower = min(v0,v1,v2); + Vec3vf<M> upper = max(v0,v1,v2); + vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src) + { + vfloat<M>::store_nt(&dst->v0.x,src.v0.x); + vfloat<M>::store_nt(&dst->v0.y,src.v0.y); + vfloat<M>::store_nt(&dst->v0.z,src.v0.z); + 
vfloat<M>::store_nt(&dst->v1.x,src.v1.x); + vfloat<M>::store_nt(&dst->v1.y,src.v1.y); + vfloat<M>::store_nt(&dst->v1.z,src.v1.z); + vfloat<M>::store_nt(&dst->v2.x,src.v2.x); + vfloat<M>::store_nt(&dst->v2.y,src.v2.y); + vfloat<M>::store_nt(&dst->v2.z,src.v2.z); + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M; i++) + { + if (primID(i) == -1) break; + const unsigned geomId = geomID(i); + const unsigned primId = primID(i); + const TriangleMesh::Triangle& tri = mesh->triangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID); + return bounds; + } + + public: + Vec3vf<M> v0; // 1st vertex of the triangles + Vec3vf<M> v1; // 2nd vertex of the triangles + Vec3vf<M> v2; // 3rd vertex of the triangles + private: + vuint<M> geomIDs; // geometry ID + vuint<M> primIDs; // primitive ID + }; + + template<int M> + typename TriangleMv<M>::Type TriangleMv<M>::type; + + typedef TriangleMv<4> Triangle4v; +} diff --git a/thirdparty/embree/kernels/geometry/trianglev_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_intersector.h new file mode 100644 index 0000000000..3abb7f8e32 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev_intersector.h @@ -0,0 +1,206 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "triangle_intersector_pluecker.h" +#include "triangle_intersector_moeller.h" +#include "triangle_intersector_woop.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvIntersector1Moeller + { + typedef TriangleMv<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. 
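Editorial note: these single-ray kernels read the M triangles straight out of the struct-of-arrays TriangleMv block declared in trianglev.h above, so one SIMD instruction can test the same vertex component of all M triangles against the ray. A simplified illustration of that layout and of the blocks() helper, with hypothetical stand-in names rather than the real class:

```
#include <cstddef>

// Four triangles per block; component x of vertex v0 of triangle j lives in v0x[j], and so on.
struct Triangle4vSketch {
  float v0x[4], v0y[4], v0z[4];
  float v1x[4], v1y[4], v1z[4];
  float v2x[4], v2y[4], v2z[4];
  unsigned geomID[4], primID[4];

  // Same convention as the real primitive: a slot is unused when its geomID is -1.
  bool valid(std::size_t i) const { return geomID[i] != unsigned(-1); }
};

// blocks(): number of 4-wide blocks needed to hold N triangles, i.e. ceil(N / 4).
inline std::size_t blocks4(std::size_t N) { return (N + 3) / 4; }
```

Unused slots are flagged by a geometry ID of -1, which is what valid(i) checks before a lane contributes to the result.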
*/ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + + template<int M, bool filter> + struct TriangleMvIntersector1Woop + { + typedef TriangleMv<M> Primitive; + typedef WoopIntersector1<M> intersec; + typedef WoopPrecalculations1<M> Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + + /*! Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMvIntersectorKMoeller + { + typedef TriangleMv<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! 
Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M + } + }; + + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvIntersector1Pluecker + { + typedef TriangleMv<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMvIntersectorKPluecker + { + typedef TriangleMv<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. 
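Editorial note: the Pluecker variants differ from the Moeller-Trumbore ones in how they decide containment: they evaluate one scalar triple product per triangle edge and accept the hit only when all three agree in sign, which helps make the test watertight along shared edges. A standalone scalar sketch of that idea follows; the types and helper names are simplified assumptions, not the kernel's exact formulation.

```
#include <optional>

struct V3 { float x, y, z; };
inline V3    operator-(V3 a, V3 b) { return {a.x - b.x, a.y - b.y, a.z - b.z}; }
inline V3    cross3(V3 a, V3 b)    { return {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x}; }
inline float dot3(V3 a, V3 b)      { return a.x*b.x + a.y*b.y + a.z*b.z; }

// Returns the hit distance t, or nothing on a miss.
inline std::optional<float> intersectEdgeSigns(V3 O, V3 D, V3 A, V3 B, V3 C)
{
  const V3 a = A - O, b = B - O, c = C - O;
  const float U = dot3(D, cross3(c, b));   // edge test opposite vertex A
  const float V = dot3(D, cross3(a, c));   // edge test opposite vertex B
  const float W = dot3(D, cross3(b, a));   // edge test opposite vertex C
  const bool inside = (U >= 0.0f && V >= 0.0f && W >= 0.0f) ||
                      (U <= 0.0f && V <= 0.0f && W <= 0.0f);
  if (!inside) return std::nullopt;
  const V3 N = cross3(B - A, C - A);       // geometric normal
  const float den = dot3(N, D);
  if (den == 0.0f) return std::nullopt;    // ray parallel to the triangle plane
  const float t = dot3(N, a) / den;        // plane equation solved for the ray parameter
  return (t > 0.0f) ? std::optional<float>(t) : std::nullopt;
}
```

The normalized edge terms U, V, W also yield the barycentric coordinates once divided by their sum, which is how the full intersectors report u and v to the epilog.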
*/ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb.h b/thirdparty/embree/kernels/geometry/trianglev_mb.h new file mode 100644 index 0000000000..b550a29fd5 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev_mb.h @@ -0,0 +1,201 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M triangles in struct of array layout */ + template<int M> + struct TriangleMvMB + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + + static Type type; + + public: + + /* primitive supports single time segments */ + static const bool singleTimeSegment = true; + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMvMB() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleMvMB(const Vec3vf<M>& a0, const Vec3vf<M>& a1, + const Vec3vf<M>& b0, const Vec3vf<M>& b1, + const Vec3vf<M>& c0, const Vec3vf<M>& c1, + const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangles at t0 */ + __forceinline BBox3fa bounds0() const + { + Vec3vf<M> lower = min(v0,v1,v2); + Vec3vf<M> upper = max(v0,v1,v2); + const vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = 
select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Calculate the bounds of the triangles at t1 */ + __forceinline BBox3fa bounds1() const + { + const Vec3vf<M> p0 = v0+dv0; + const Vec3vf<M> p1 = v1+dv1; + const Vec3vf<M> p2 = v2+dv2; + Vec3vf<M> lower = min(p0,p1,p2); + Vec3vf<M> upper = max(p0,p1,p2); + const vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds() const { + return LBBox3fa(bounds0(),bounds1()); + } + + /* Fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero; + Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero; + + BBox3fa bounds0 = empty; + BBox3fa bounds1 = empty; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0); + const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1); + const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0); + const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1); + const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0); + const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1); + vgeomID [i] = geomID; + vprimID [i] = primID; + va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z; + va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z; + vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z; + vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z; + vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z; + vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z; + } + new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); + return LBBox3fa(bounds0,bounds1); + } + + /* Fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero; + Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero; + + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRefMB& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* const mesh = scene->get<TriangleMesh>(geomID); + const range<int> itime_range = mesh->timeSegmentRange(time_range); + assert(itime_range.size() == 1); + const int ilower = itime_range.begin(); + const 
TriangleMesh::Triangle& tri = mesh->triangle(primID); + allBounds.extend(mesh->linearBounds(primID, time_range)); + const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0); + const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1); + const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0); + const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1); + const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0); + const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1); + const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1)); + auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v); + auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v); + auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v); + vgeomID [i] = geomID; + vprimID [i] = primID; + va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z; + va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z; + vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z; + vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z; + vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z; + vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z; + } + new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); + return allBounds; + } + + public: + Vec3vf<M> v0; // 1st vertex of the triangles + Vec3vf<M> v1; // 2nd vertex of the triangles + Vec3vf<M> v2; // 3rd vertex of the triangles + Vec3vf<M> dv0; // difference vector between time steps t0 and t1 for first vertex + Vec3vf<M> dv1; // difference vector between time steps t0 and t1 for second vertex + Vec3vf<M> dv2; // difference vector between time steps t0 and t1 for third vertex + private: + vuint<M> geomIDs; // geometry ID + vuint<M> primIDs; // primitive ID + }; + + template<int M> + typename TriangleMvMB<M>::Type TriangleMvMB<M>::type; + + typedef TriangleMvMB<4> Triangle4vMB; +} diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h new file mode 100644 index 0000000000..38cd52e85d --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h @@ -0,0 +1,211 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvMBIntersector1Moeller + { + typedef TriangleMvMB<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. 
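Editorial note: both the occluded() that follows and the intersect() above first rebuild the vertices at the ray's time. TriangleMvMB stores each vertex as its position at t=0 plus the difference to t=1 (the dv0/dv1/dv2 members), so evaluating a vertex at a time t in [0,1] is one fused multiply-add per component. A scalar sketch of that interpolation, with simplified types:

```
#include <cmath>

struct Vec3f { float x, y, z; };

// v(t) = v0 + t * dv, evaluated with one fused multiply-add per component.
inline Vec3f vertexAtTime(const Vec3f& v0, const Vec3f& dv, float t)
{
  return { std::fma(t, dv.x, v0.x),
           std::fma(t, dv.y, v0.y),
           std::fma(t, dv.z, v0.z) };
}
```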
*/ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMvMBIntersectorKMoeller + { + typedef TriangleMvMB<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. 
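Editorial note: in the K-ray entry points of this struct every ray may carry its own time value, so the interpolation weight is a whole SIMD vector (Vec3vf<K> time(ray.time())), whereas the single-ray-of-a-packet entry points such as the occluded() below use the scalar ray.time()[k]. A scalar loop that mirrors the per-lane madd, with a hypothetical helper name and purely for illustration:

```
#include <array>
#include <cmath>

// One stored component (e.g. v0.x of triangle i) evaluated at each ray's own time.
template<int K>
std::array<float, K> componentAtLaneTimes(float v0, float dv, const std::array<float, K>& time)
{
  std::array<float, K> out{};
  for (int lane = 0; lane < K; ++lane)
    out[lane] = std::fma(time[lane], dv, v0);   // the kernels do this with a single SIMD madd
  return out;
}
```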
*/ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvMBIntersector1Pluecker + { + typedef TriangleMvMB<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,v0,v1,v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,v0,v1,v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMvMBIntersectorKPluecker + { + typedef TriangleMvMB<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. 
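Editorial note: the packet occlusion query only needs a yes/no answer per ray, so the loop that follows keeps a mask of still-unoccluded rays (valid0), clears a lane as soon as it hits any triangle, and exits early once no active lane remains; the final result is the complement of that mask. A scalar sketch of that control flow, with std::bitset standing in for the SIMD mask type and hitByTriangle standing in for the per-lane triangle test:

```
#include <bitset>
#include <cstddef>

template<std::size_t K, typename OccludedFn>
std::bitset<K> occludedPacket(std::bitset<K> active, std::size_t numTriangles, OccludedFn hitByTriangle)
{
  std::bitset<K> stillVisible = active;
  for (std::size_t tri = 0; tri < numTriangles && stillVisible.any(); ++tri)
    for (std::size_t lane = 0; lane < K; ++lane)
      if (stillVisible[lane] && hitByTriangle(tri, lane))
        stillVisible[lane] = false;            // this ray found an occluder, stop testing it
  return active & ~stillVisible;               // lanes that started active and got blocked
}
```

A caller would pass a lambda performing the actual per-lane triangle test; the kernels return the plain complement of the surviving mask instead, leaving it to the caller to restrict the result to the originally active rays.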
*/ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,k,v0,v1,v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,k,v0,v1,v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h new file mode 100644 index 0000000000..10f315cee7 --- /dev/null +++ b/thirdparty/embree/kernels/hash.h @@ -0,0 +1,5 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab" diff --git a/thirdparty/embree/kernels/subdiv/bezier_curve.h b/thirdparty/embree/kernels/subdiv/bezier_curve.h new file mode 100644 index 0000000000..a5adad5cc9 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bezier_curve.h @@ -0,0 +1,671 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +//#include "../common/scene_curves.h" +#include "../common/context.h" + +namespace embree +{ + class BezierBasis + { + public: + + template<typename T> + static __forceinline Vec4<T> eval(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = t0 * t0 * t0; + const T B1 = 3.0f * t1 * (t0 * t0); + const T B2 = 3.0f * (t1 * t1) * t0; + const T B3 = t1 * t1 * t1; + return Vec4<T>(B0,B1,B2,B3); + } + + template<typename T> + static __forceinline Vec4<T> derivative(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = -(t0*t0); + const T B1 = madd(-2.0f,t0*t1,t0*t0); + const T B2 = msub(+2.0f,t0*t1,t1*t1); + const T B3 = +(t1*t1); + return T(3.0f)*Vec4<T>(B0,B1,B2,B3); + } + + 
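Editorial note: eval() and derivative() above and derivative2() below are the cubic Bernstein basis and its first two derivatives, written with t0 = 1-u and t1 = u and with the constant factors 3 and 6 pulled out, exactly as the code returns them. Expanded for a curve with control points P0..P3:

```
B(u)   = t_0^3 P_0 + 3 t_1 t_0^2 P_1 + 3 t_1^2 t_0 P_2 + t_1^3 P_3,      t_0 = 1-u,  t_1 = u
B'(u)  = 3 [ -t_0^2 P_0 + (t_0^2 - 2 t_0 t_1) P_1 + (2 t_0 t_1 - t_1^2) P_2 + t_1^2 P_3 ]
B''(u) = 6 [  t_0 P_0 + (t_1 - 2 t_0) P_1 + (t_0 - 2 t_1) P_2 + t_1 P_3 ]
```

The Vec4 returned by each basis function holds the four bracketed weights, so curve evaluation further down (CubicBezierCurve::eval, eval_du, eval_dudu) is just a dot product of those weights with the control points v0..v3.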
template<typename T> + static __forceinline Vec4<T> derivative2(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = t0; + const T B1 = madd(-2.0f,t0,t1); + const T B2 = madd(-2.0f,t1,t0); + const T B3 = t1; + return T(6.0f)*Vec4<T>(B0,B1,B2,B3); + } + }; + + struct PrecomputedBezierBasis + { + enum { N = 16 }; + public: + PrecomputedBezierBasis() {} + PrecomputedBezierBasis(int shift); + + /* basis for bezier evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bezier derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedBezierBasis bezier_basis0; + extern PrecomputedBezierBasis bezier_basis1; + + + template<typename V> + struct LinearBezierCurve + { + V v0,v1; + + __forceinline LinearBezierCurve () {} + + __forceinline LinearBezierCurve (const LinearBezierCurve& other) + : v0(other.v0), v1(other.v1) {} + + __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& other) { + v0 = other.v0; v1 = other.v1; return *this; + } + + __forceinline LinearBezierCurve (const V& v0, const V& v1) + : v0(v0), v1(v1) {} + + __forceinline V begin() const { return v0; } + __forceinline V end () const { return v1; } + + bool hasRoot() const; + + friend embree_ostream operator<<(embree_ostream cout, const LinearBezierCurve& a) { + return cout << "LinearBezierCurve (" << a.v0 << ", " << a.v1 << ")"; + } + }; + + template<> __forceinline bool LinearBezierCurve<Interval1f>::hasRoot() const { + return numRoots(v0,v1); + } + + template<typename V> + struct QuadraticBezierCurve + { + V v0,v1,v2; + + __forceinline QuadraticBezierCurve () {} + + __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other) + : v0(other.v0), v1(other.v1), v2(other.v2) {} + + __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) { + v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this; + } + + __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2) + : v0(v0), v1(v1), v2(v2) {} + + __forceinline V begin() const { return v0; } + __forceinline V end () const { return v2; } + + __forceinline V interval() const { + return merge(v0,v1,v2); + } + + __forceinline BBox<V> bounds() const { + return merge(BBox<V>(v0),BBox<V>(v1),BBox<V>(v2)); + } + + friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) { + return cout << "QuadraticBezierCurve ( (" << a.u.lower << ", " << a.u.upper << "), " << a.v0 << ", " << a.v1 << ", " << a.v2 << ")"; + } + }; + + + typedef QuadraticBezierCurve<float> QuadraticBezierCurve1f; + typedef QuadraticBezierCurve<Vec2fa> QuadraticBezierCurve2fa; + typedef QuadraticBezierCurve<Vec3fa> QuadraticBezierCurve3fa; + + template<typename Vertex> + struct CubicBezierCurve + { + Vertex v0,v1,v2,v3; + + __forceinline CubicBezierCurve() {} + + template<typename T1> + __forceinline CubicBezierCurve (const CubicBezierCurve<T1>& other) + : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {} + + __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) { + v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this; + } + + __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return v0; + } + + __forceinline Vertex end() const { + return v3; + } + + __forceinline Vertex center() 
const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline Vertex begin_direction() const { + return v1-v0; + } + + __forceinline Vertex end_direction() const { + return v3-v2; + } + + __forceinline CubicBezierCurve<float> xfm(const Vertex& dx) const { + return CubicBezierCurve<float>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); + } + + __forceinline CubicBezierCurve<vfloatx> vxfm(const Vertex& dx) const { + return CubicBezierCurve<vfloatx>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); + } + + __forceinline CubicBezierCurve<float> xfm(const Vertex& dx, const Vertex& p) const { + return CubicBezierCurve<float>(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx)); + } + + __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space) const + { + const Vec3fa q0 = xfmVector(space,v0); + const Vec3fa q1 = xfmVector(space,v1); + const Vec3fa q2 = xfmVector(space,v2); + const Vec3fa q3 = xfmVector(space,v3); + return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3fa q0 = xfmVector(space,v0-p); + const Vec3fa q1 = xfmVector(space,v1-p); + const Vec3fa q2 = xfmVector(space,v2-p); + const Vec3fa q3 = xfmVector(space,v3-p); + return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); + const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); + const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); + const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); + return CubicBezierCurve<Vec3ff>(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const + { + const Vec3fa q0 = xfmVector(space,s*(v0-p)); + const Vec3fa q1 = xfmVector(space,s*(v1-p)); + const Vec3fa q2 = xfmVector(space,s*(v2-p)); + const Vec3fa q3 = xfmVector(space,s*(v3-p)); + return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3); + } + + __forceinline int maxRoots() const; + + __forceinline BBox<Vertex> bounds() const { + return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); + } + + __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3); + } + + __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3); + } + + __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) { + return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3); + } + + __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b, const CubicBezierCurve& c) { + return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3)); + } + + __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) { + return cmadd((Vertex(1.0f)-t),a,t*b); + } + + __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3)); + } + + __forceinline void 
split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + new (&left ) CubicBezierCurve(p00,p10,p20,p30); + new (&right) CubicBezierCurve(p30,p21,p12,p03); + } + + __forceinline CubicBezierCurve<Vec2vfx> split() const + { + const float u0 = 0.0f, u1 = 1.0f; + const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); + Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); + const Vec2vfx P3 = shift_right_1(P0); + const Vec2vfx dP3du = shift_right_1(dP0du); + const Vec2vfx P1 = P0 + dP0du; + const Vec2vfx P2 = P3 - dP3du; + return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3); + } + + __forceinline CubicBezierCurve<Vec2vfx> split(const BBox1f& u) const + { + const float u0 = u.lower, u1 = u.upper; + const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); + Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); + const Vec2vfx P3 = shift_right_1(P0); + const Vec2vfx dP3du = shift_right_1(dP0du); + const Vec2vfx P1 = P0 + dP0du; + const Vec2vfx P2 = P3 - dP3du; + return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3); + } + + __forceinline void eval(float t, Vertex& p, Vertex& dp) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + p = p30; + dp = Vertex(3.0f)*(p21-p20); + } + +#if 0 + __forceinline Vertex eval(float t) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + return p30; + } +#else + __forceinline Vertex eval(const float t) const + { + const Vec4<float> b = BezierBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } +#endif + + __forceinline Vertex eval_dt(float t) const + { + const Vertex p00 = v1-v0; + const Vertex p01 = v2-v1; + const Vertex p02 = v3-v2; + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p20 = lerp(p10,p11,t); + return Vertex(3.0f)*p20; + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4<float> b = BezierBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4<float> b = BezierBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const + { + const Vec2vfx p00 = v0; + const Vec2vfx p01 = v1; + const Vec2vfx p02 = v2; + const Vec2vfx p03 = v3; + + const Vec2vfx p10 = lerp(p00,p01,t); + const Vec2vfx p11 = lerp(p01,p02,t); + const Vec2vfx p12 = lerp(p02,p03,t); + + const Vec2vfx p20 = lerp(p10,p11,t); + const 
Vec2vfx p21 = lerp(p11,p12,t); + + const Vec2vfx p30 = lerp(p20,p21,t); + + p = p30; + dp = vfloatx(3.0f)*(p21-p20); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + p = p30; + dp = 3.0f*(p21-p20); + ddp = eval_dudu(t); + } + + __forceinline CubicBezierCurve clip(const Interval1f& u1) const + { + Vertex f0,df0; eval(u1.lower,f0,df0); + Vertex f1,df1; eval(u1.upper,f1,df1); + float s = u1.upper-u1.lower; + return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); + } + + __forceinline QuadraticBezierCurve<Vertex> derivative() const + { + const Vertex q0 = 3.0f*(v1-v0); + const Vertex q1 = 3.0f*(v2-v1); + const Vertex q2 = 3.0f*(v3-v2); + return QuadraticBezierCurve<Vertex>(q0,q1,q2); + } + + __forceinline BBox<Vertex> derivative_bounds(const Interval1f& u1) const + { + Vertex f0,df0; eval(u1.lower,f0,df0); + Vertex f3,df3; eval(u1.upper,f3,df3); + const float s = u1.upper-u1.lower; + const Vertex f1 = f0+s*(1.0f/3.0f)*df0; + const Vertex f2 = f3-s*(1.0f/3.0f)*df3; + const Vertex q0 = s*df0; + const Vertex q1 = 3.0f*(f2-f1); + const Vertex q2 = s*df3; + return merge(BBox<Vertex>(q0),BBox<Vertex>(q1),BBox<Vertex>(q2)); + } + + template<int M> + __forceinline Vec4vf<M> veval(const vfloat<M>& t) const + { + const Vec4vf<M> b = BezierBasis::eval(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const + { + const Vec4vf<M> b = BezierBasis::derivative(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const + { + const Vec4vf<M> b = BezierBasis::derivative2(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const + { + const Vec4vf<M> p00 = v0; + const Vec4vf<M> p01 = v1; + const Vec4vf<M> p02 = v2; + const Vec4vf<M> p03 = v3; + + const Vec4vf<M> p10 = lerp(p00,p01,t); + const Vec4vf<M> p11 = lerp(p01,p02,t); + const Vec4vf<M> p12 = lerp(p02,p03,t); + const Vec4vf<M> p20 = lerp(p10,p11,t); + const Vec4vf<M> p21 = lerp(p11,p12,t); + const Vec4vf<M> p30 = lerp(p20,p21,t); + + p = p30; + dp = vfloat<M>(3.0f)*(p21-p20); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3)))); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1), + 
madd(vfloat<M>::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3)))); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3)))); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3)))); + } + + /* calculates bounds of bezier curve geometry */ + __forceinline BBox3fa accurateBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec3vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec3vfx p = eval0<VSIZEX,Vec3vf<VSIZEX>>(i,N); + const Vec3vfx dp = derivative0<VSIZEX,Vec3vf<VSIZEX>>(i,N); + const Vec3vfx pm = p-Vec3vfx(scale)*select(vi!=vintx(0),dp,Vec3vfx(zero)); + const Vec3vfx pp = p+Vec3vfx(scale)*select(vi!=vintx(N),dp,Vec3vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + return BBox3fa(lower,upper); + } + + /* calculates bounds of bezier curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0<VSIZEX>(i,N); + const Vec4vfx dp = derivative0<VSIZEX>(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<N; i+=VSIZEX) + { + vboolx 
valid = vintx(i)+vintx(step) < vintx(N); + const Vec4vfx pi = eval0<VSIZEX>(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve) { + return cout << "CubicBezierCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + +#if defined(__AVX__) + template<> + __forceinline CubicBezierCurve<vfloat4> CubicBezierCurve<vfloat4>::clip(const Interval1f& u1) const + { + const vfloat8 p00 = vfloat8(v0); + const vfloat8 p01 = vfloat8(v1); + const vfloat8 p02 = vfloat8(v2); + const vfloat8 p03 = vfloat8(v3); + + const vfloat8 t(vfloat4(u1.lower),vfloat4(u1.upper)); + const vfloat8 p10 = lerp(p00,p01,t); + const vfloat8 p11 = lerp(p01,p02,t); + const vfloat8 p12 = lerp(p02,p03,t); + const vfloat8 p20 = lerp(p10,p11,t); + const vfloat8 p21 = lerp(p11,p12,t); + const vfloat8 p30 = lerp(p20,p21,t); + + const vfloat8 f01 = p30; + const vfloat8 df01 = vfloat8(3.0f)*(p21-p20); + + const vfloat4 f0 = extract4<0>(f01), f1 = extract4<1>(f01); + const vfloat4 df0 = extract4<0>(df01), df1 = extract4<1>(df01); + const float s = u1.upper-u1.lower; + return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); + } +#endif + + template<typename Vertex> using BezierCurveT = CubicBezierCurve<Vertex>; + + typedef CubicBezierCurve<float> CubicBezierCurve1f; + typedef CubicBezierCurve<Vec2fa> CubicBezierCurve2fa; + typedef CubicBezierCurve<Vec3fa> CubicBezierCurve3fa; + typedef CubicBezierCurve<Vec3fa> BezierCurve3fa; + + template<> __forceinline int CubicBezierCurve<float>::maxRoots() const + { + float eps = 1E-4f; + bool neg0 = v0 <= 0.0f; bool zero0 = fabs(v0) < eps; + bool neg1 = v1 <= 0.0f; bool zero1 = fabs(v1) < eps; + bool neg2 = v2 <= 0.0f; bool zero2 = fabs(v2) < eps; + bool neg3 = v3 <= 0.0f; bool zero3 = fabs(v3) < eps; + return (neg0 != neg1 || zero0) + (neg1 != neg2 || zero1) + (neg2 != neg3 || zero2 || zero3); + } + + template<> __forceinline int CubicBezierCurve<Interval1f>::maxRoots() const { + return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3); + } + + template<typename CurveGeometry> + __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve) + { + return CubicBezierCurve<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } +} diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h new file mode 100644 index 0000000000..2ff03902a7 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h @@ -0,0 +1,372 @@ +// Copyright 2009-2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_curve.h" + +namespace embree +{ + template<class T, class S> + static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) + { + const T v0_1 = lerp(v0,v1,uu); + const T v1_1 = lerp(v1,v2,uu); + const T v2_1 = lerp(v2,v3,uu); + const T v0_2 = lerp(v0_1,v1_1,uu); + const T v1_2 = lerp(v1_1,v2_1,uu); + const T v0_3 = lerp(v0_2,v1_2,uu); + return v0_3; + } + + template<class T, class S> + static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) + { + const T v0_1 = lerp(v0,v1,uu); + const T v1_1 = lerp(v1,v2,uu); + const T v2_1 = lerp(v2,v3,uu); + const T v0_2 = lerp(v0_1,v1_1,uu); + const T v1_2 = lerp(v1_1,v2_1,uu); + return S(3.0f)*(v1_2-v0_2); + } + + template<typename Vertex> + __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 36.0f * (16.0f * v[y][x] + 4.0f * (v[y-1][x] + v[y+1][x] + v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1])); + } + + template<typename Vertex> + __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y-1][x+1])); + } + + template<typename Vertex> + __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + v[y+1][x-1] + v[y+1][x+1]); + } + + template<typename Vertex> + __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x-1] + v[y+1][x-1]); + } + + template<typename Vertex> + __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x+1] + v[y+1][x+1]); + } + + template<typename Vertex> + __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x) + { + return 1.0f / 9.0f * (4.0f * v[y][x] + 2.0f * (v[y+delta_y][x] + v[y][x+delta_x]) + v[y+delta_y][x+delta_x]); + } + + template<typename Vertex, typename Vertex_t> + class __aligned(64) BezierPatchT + { + public: + Vertex matrix[4][4]; + + public: + + __forceinline BezierPatchT() {} + + __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride); + + __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch); + + __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, + const BezierCurveT<Vertex>* border0, + const BezierCurveT<Vertex>* border1, + const BezierCurveT<Vertex>* border2, + const BezierCurveT<Vertex>* border3); + + __forceinline BezierPatchT(const BSplinePatchT<Vertex,Vertex_t>& source) + { + /* compute inner bezier control points */ + matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1); + matrix[0][3] = computeInnerBezierControlPoint(source.v,1,2); + matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2); + matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1); + + /* compute top edge control points */ + matrix[0][1] 
= computeRightEdgeBezierControlPoint(source.v,1,1); + matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); + + /* compute buttom edge control points */ + matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1); + matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2); + + /* compute left edge control points */ + matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1); + matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1); + + /* compute right edge control points */ + matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2); + matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2); + + /* compute corner control points */ + matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1); + matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1); + matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1); + matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1); + } + + static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv) + { + const Vertex_t M0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); + const Vertex_t M1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3]))); + const Vertex_t M2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3]))); + const Vertex_t M3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3]))); + return madd(Bv.x,M0,madd(Bv.y,M1,madd(Bv.z,M2,Bv.w*M3))); + } + + static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::derivative(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative2(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::derivative2(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative(uu); + const Vec4f Bv = BezierBasis::derivative(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vertex_t dPdu = eval_du(matrix,uu,vv); + const Vertex_t dPdv = eval_dv(matrix,uu,vv); + return cross(dPdu,dPdv); + } + + __forceinline Vertex_t normal(const float uu, const float vv) + { + const Vertex_t dPdu = eval_du(matrix,uu,vv); + const Vertex_t dPdv = eval_dv(matrix,uu,vv); + return cross(dPdu,dPdv); + } + + __forceinline Vertex_t eval(const float uu, const float vv) const { + return eval(matrix,uu,vv); + } + + __forceinline 
Vertex_t eval_du(const float uu, const float vv) const { + return eval_du(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dv(const float uu, const float vv) const { + return eval_dv(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dudu(const float uu, const float vv) const { + return eval_dudu(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { + return eval_dvdv(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { + return eval_dudv(matrix,uu,vv); + } + + __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const + { + const vfloat curve0_x = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]); + const vfloat curve1_x = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]); + const vfloat curve2_x = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]); + const vfloat curve3_x = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]); + return u_n[0] * curve0_x + u_n[1] * curve1_x + u_n[2] * curve2_x + u_n[3] * curve3_x; + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n)); + } + if (dPdu) + { + { + assert(dPdu); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + { + assert(dPdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + } + if (ddPdudu) + { + { + assert(ddPdudu); + const Vec4<vfloat> u_n = BezierBasis::derivative2(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdvdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdudv); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = 
BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + } + } + + template<typename T> + static __forceinline Vec3<T> eval(const Vertex matrix[4][4], const T& uu, const T& vv) + { + const T one_minus_uu = 1.0f - uu; + const T one_minus_vv = 1.0f - vv; + + const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; + const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; + const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); + const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); + const T B2_u = 3.0f * (uu * one_minus_uu * uu); + const T B2_v = 3.0f * (vv * one_minus_vv * vv); + const T B3_u = uu * uu * uu; + const T B3_v = vv * vv * vv; + + const T x = + madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), + madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))), + madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))), + B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); + + const T y = + madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), + madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))), + madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))), + B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); + + const T z = + madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), + madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))), + madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))), + B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); + + return Vec3<T>(x,y,z); + } + + template<typename vfloat> + __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const { + return eval(matrix,uu,vv); + } + + template<class T> + static __forceinline Vec3<T> normal(const Vertex matrix[4][4], const T& uu, const T& vv) + { + + const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); + const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); + const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); + const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); + + const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); + const Vec3<T> matrix_11 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3<T> matrix_12 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); + + const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); + const Vec3<T> matrix_21 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + const Vec3<T> matrix_22 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); + + const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); + const Vec3<T> matrix_31 = 
Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); + const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); + const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); + + /* tangentU */ + const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); + const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); + const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); + const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); + + const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); + const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); + const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); + const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); + + const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vec3<T> n = cross(tangentU,tangentV); + return n; + } + + template<typename vfloat> + __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const { + return normal(matrix,uu,vv); + } + }; + + typedef BezierPatchT<Vec3fa,Vec3fa_t> BezierPatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/bilinear_patch.h b/thirdparty/embree/kernels/subdiv/bilinear_patch.h new file mode 100644 index 0000000000..cade104a6c --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bilinear_patch.h @@ -0,0 +1,191 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_curve.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) BilinearPatchT + { + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + public: + Vertex v[4]; + + public: + + __forceinline BilinearPatchT () {} + + __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView<Vertex>& vertices) { + init(edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(edge,vertices,stride); + } + + __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride) + { + v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + } + + __forceinline BilinearPatchT (const CatmullClarkPatch& patch) + { + v[0] = patch.ring[0].getLimitVertex(); + v[1] = patch.ring[1].getLimitVertex(); + v[2] = patch.ring[2].getLimitVertex(); + v[3] = patch.ring[3].getLimitVertex(); + } + + __forceinline BBox<Vertex> bounds() const + { + + BBox<Vertex> bounds (v[0]); + bounds.extend(v[1]); + bounds.extend(v[2]); + bounds.extend(v[3]); + return bounds; + } + + __forceinline Vertex eval(const float uu, const float vv) const { + return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv); + } + + __forceinline Vertex eval_du(const float uu, const float vv) const { + return lerp(v[1]-v[0],v[2]-v[3],vv); + } + + __forceinline 
Vertex eval_dv(const float uu, const float vv) const { + return lerp(v[3]-v[0],v[2]-v[1],uu); + } + + __forceinline Vertex eval_dudu(const float uu, const float vv) const { + return Vertex(zero); + } + + __forceinline Vertex eval_dvdv(const float uu, const float vv) const { + return Vertex(zero); + } + + __forceinline Vertex eval_dudv(const float uu, const float vv) const { + return (v[2]-v[3]) - (v[1]-v[0]); + } + + __forceinline Vertex normal(const float uu, const float vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + __forceinline void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv); + const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv); + const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv); + return Vec3<vfloat>(x,y,z); + } + + template<class vfloat> + __forceinline Vec3<vfloat> eval_du(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv); + const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv); + const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv); + return Vec3<vfloat>(x,y,z); + } + + template<class vfloat> + __forceinline Vec3<vfloat> eval_dv(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu); + const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu); + const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu); + return Vec3<vfloat>(x,y,z); + } + + template<typename vfloat> + __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + template<class vfloat> + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv); + } + + template<class vfloat> + __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv); + } + + template<class vfloat> + __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu); + } + + template<class vfloat> + __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const { + return vfloat(zero); + } + + template<class vfloat> + __forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return vfloat(zero); + } + + template<class vfloat> + __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]); + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + for (size_t i=0; i<N; i++) 
vfloat::store(valid,P+i*dstride,eval(i,uu,vv)); + } + if (dPdu) { + for (size_t i=0; i<N; i++) { + assert(dPdu); vfloat::store(valid,dPdu+i*dstride,eval_du(i,uu,vv)*dscale); + assert(dPdv); vfloat::store(valid,dPdv+i*dstride,eval_dv(i,uu,vv)*dscale); + } + } + if (ddPdudu) { + for (size_t i=0; i<N; i++) { + assert(ddPdudu); vfloat::store(valid,ddPdudu+i*dstride,eval_dudu(i,uu,vv)*sqr(dscale)); + assert(ddPdvdv); vfloat::store(valid,ddPdvdv+i*dstride,eval_dvdv(i,uu,vv)*sqr(dscale)); + assert(ddPdudv); vfloat::store(valid,ddPdudv+i*dstride,eval_dudv(i,uu,vv)*sqr(dscale)); + } + } + } + }; + + typedef BilinearPatchT<Vec3fa,Vec3fa_t> BilinearPatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/bspline_curve.h b/thirdparty/embree/kernels/subdiv/bspline_curve.h new file mode 100644 index 0000000000..51489ef37c --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bspline_curve.h @@ -0,0 +1,320 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "bezier_curve.h" + +namespace embree +{ + class BSplineBasis + { + public: + + template<typename T> + static __forceinline Vec4<T> eval(const T& u) + { + const T t = u; + const T s = T(1.0f) - u; + const T n0 = s*s*s; + const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t)); + const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s)); + const T n3 = t*t*t; + return T(1.0f/6.0f)*Vec4<T>(n0,n1,n2,n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = -s*s; + const T n1 = -t*t - 4.0f*(t*s); + const T n2 = s*s + 4.0f*(s*t); + const T n3 = t*t; + return T(0.5f)*Vec4<T>(n0,n1,n2,n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative2(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = s; + const T n1 = t - 2.0f*s; + const T n2 = s - 2.0f*t; + const T n3 = t; + return Vec4<T>(n0,n1,n2,n3); + } + }; + + struct PrecomputedBSplineBasis + { + enum { N = 16 }; + public: + PrecomputedBSplineBasis() {} + PrecomputedBSplineBasis(int shift); + + /* basis for bspline evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bspline derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedBSplineBasis bspline_basis0; + extern PrecomputedBSplineBasis bspline_basis1; + + template<typename Vertex> + struct BSplineCurveT + { + Vertex v0,v1,v2,v3; + + __forceinline BSplineCurveT() {} + + __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + } + + __forceinline Vertex end() const { + return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline BBox<Vertex> bounds() const { + return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); + } + + __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) { + return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline BSplineCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); + const Vec3ff 
q1(xfmVector(space,(Vec3fa)v1-p), v1.w); + const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); + const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); + return BSplineCurveT<Vec3ff>(q0,q1,q2,q3); + } + + __forceinline Vertex eval(const float t) const + { + const Vec4<float> b = BSplineBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4<float> b = BSplineBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4<float> b = BSplineBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + p = eval(t); + dp = eval_du(t); + ddp = eval_dudu(t); + } + + template<int M> + __forceinline Vec4vf<M> veval(const vfloat<M>& t) const + { + const Vec4vf<M> b = BSplineBasis::eval(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const + { + const Vec4vf<M> b = BSplineBasis::derivative(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const + { + const Vec4vf<M> b = BSplineBasis::derivative2(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const + { + p = veval<M>(t); + dp = veval_du<M>(t); + } + + template<int M> + __forceinline Vec4vf<M> eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf<M>(v2), + 
vfloat<M>::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + /* calculates bounds of bspline curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0<VSIZEX>(i,N); + const Vec4vfx dp = derivative0<VSIZEX>(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + const Vec3ff pe = end(); + return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<=N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) <= vintx(N); + const Vec4vfx pi = eval0<VSIZEX>(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(lower,upper),upper_r); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) { + return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + + template<typename Vertex> + __forceinline void convert(const BezierCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) { + ocurve = icurve; + } + + template<typename Vertex> + __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) { + ocurve = icurve; + } + + template<typename Vertex> + __forceinline void convert(const BezierCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) + { + const Vertex v0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2)); + const Vertex v1 = msub(2.0f,icurve.v1,icurve.v2); + const Vertex v2 = msub(2.0f,icurve.v2,icurve.v1); + const Vertex v3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3)); + ocurve = BSplineCurveT<Vertex>(v0,v1,v2,v3); + } + + template<typename Vertex> + __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) + { + const Vertex 
v0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2)); + const Vertex v1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2); + const Vertex v2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2); + const Vertex v3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3)); + ocurve = BezierCurveT<Vertex>(v0,v1,v2,v3); + } + + template<typename CurveGeometry> + __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve) + { + return BSplineCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } + + typedef BSplineCurveT<Vec3fa> BSplineCurve3fa; +} + diff --git a/thirdparty/embree/kernels/subdiv/bspline_patch.h b/thirdparty/embree/kernels/subdiv/bspline_patch.h new file mode 100644 index 0000000000..ff47f01c7a --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bspline_patch.h @@ -0,0 +1,449 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bspline_curve.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) BSplinePatchT + { + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + public: + + __forceinline BSplinePatchT () {} + + __forceinline BSplinePatchT (const CatmullClarkPatch& patch) { + init(patch); + } + + __forceinline BSplinePatchT(const CatmullClarkPatch& patch, + const BezierCurveT<Vertex>* border0, + const BezierCurveT<Vertex>* border1, + const BezierCurveT<Vertex>* border2, + const BezierCurveT<Vertex>* border3) + { + init(patch); + } + + __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(edge,vertices,stride); + } + + __forceinline Vertex hard_corner(const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + return 4.0f*v11 - 2.0f*(v12+v21) + v22; + } + + __forceinline Vertex soft_convex_corner( const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + return -8.0f*v11 + 4.0f*(v12+v21) + v22; + } + + __forceinline Vertex convex_corner(const float vertex_crease_weight, + const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + if (std::isinf(vertex_crease_weight)) return hard_corner(v01,v02,v10,v11,v12,v20,v21,v22); + else return soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22); + } + + __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) { + return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride); + } + + __forceinline void init_border(const CatmullClarkRing& edge0, + Vertex& v01, Vertex& v02, + const Vertex& v11, const Vertex& v12, + const Vertex& v21, const Vertex& v22) + { + if (likely(edge0.has_opposite_back(0))) + { + v01 = edge0.back(2); + v02 = edge0.back(1); + } else { + v01 = 2.0f*v11-v21; + v02 = 2.0f*v12-v22; + } + } + + __forceinline void init_corner(const CatmullClarkRing& edge0, + Vertex& v00, const Vertex& 
v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1); + const bool has_back0 = edge0.has_opposite_back(0); + const bool has_front1 = edge0.has_opposite_front(1); + const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2); + + if (likely(has_back0)) { + if (likely(has_front1)) { assert(has_back1 && has_front2); v00 = edge0.back(3); } + else { assert(!has_back1); v00 = 2.0f*v01-v02; } + } + else { + if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; } + else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); + } + } + + void init(const CatmullClarkPatch& patch) + { + /* fill inner vertices */ + const Vertex v11 = v[1][1] = patch.ring[0].vtx; + const Vertex v12 = v[1][2] = patch.ring[1].vtx; + const Vertex v22 = v[2][2] = patch.ring[2].vtx; + const Vertex v21 = v[2][1] = patch.ring[3].vtx; + + /* fill border vertices */ + init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22); + init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21); + init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11); + init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12); + + /* fill corner vertices */ + init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); + init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); + init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); + init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); + } + + void init_border(const HalfEdge* edge0, const char* vertices, size_t stride, + Vertex& v01, Vertex& v02, + const Vertex& v11, const Vertex& v12, + const Vertex& v21, const Vertex& v22) + { + if (likely(edge0->hasOpposite())) + { + const HalfEdge* e = edge0->opposite()->next()->next(); + v01 = load(e,vertices,stride); + v02 = load(e->next(),vertices,stride); + } else { + v01 = 2.0f*v11-v21; + v02 = 2.0f*v12-v22; + } + } + + void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride, + Vertex& v00, const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + const bool has_back0 = edge0->hasOpposite(); + const bool has_front1 = edge0->prev()->hasOpposite(); + + if (likely(has_back0)) + { + const HalfEdge* e = edge0->opposite()->next(); + if (likely(has_front1)) + { + assert(e->hasOpposite()); + assert(edge0->prev()->opposite()->prev()->hasOpposite()); + v00 = load(e->opposite()->prev(),vertices,stride); + } + else { + assert(!e->hasOpposite()); + v00 = 2.0f*v01-v02; + } + } + else + { + if (likely(has_front1)) { + assert(!edge0->prev()->opposite()->prev()->hasOpposite()); + v00 = 2.0f*v10-v20; + } + else { + assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight)); + v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); + } + } + } + + void init(const HalfEdge* edge0, const char* vertices, size_t stride) + { + assert( edge0->isRegularFace() ); + + /* fill inner vertices */ + const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next(); + const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next(); + const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next(); + const Vertex 
v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0 == edge3->next()); + + /* fill border vertices */ + init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22); + init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21); + init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11); + init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12); + + /* fill corner vertices */ + init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); + init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); + init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); + init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); + } + + __forceinline BBox<Vertex> bounds() const + { + const Vertex* const cv = &v[0][0]; + BBox<Vertex> bounds (cv[0]); + for (size_t i=1; i<16 ; i++) + bounds.extend( cv[i] ); + return bounds; + } + + __forceinline Vertex eval(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_du(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dudu(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = 
madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative2(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dvdv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative2(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dudv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex normal(const float uu, const float vv) const + { + const Vertex tu = eval_du(uu,vv); + const Vertex tv = eval_dv(uu,vv); + return cross(tu,tv); + } + + template<typename T> + __forceinline Vec3<T> eval(const T& uu, const T& vv, const Vec4<T>& u_n, const Vec4<T>& v_n) const + { + const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x)))); + const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x)))); + const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x)))); + const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x)))); + const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + + const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y)))); + const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y)))); + const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y)))); + const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y)))); + const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y))); + + const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z)))); + const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z)))); + const T curve2_z = madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z)))); + const T curve3_z = 
madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z)))); + const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z))); + + return Vec3<T>(x,y,z); + } + + template<typename T> + __forceinline Vec3<T> eval(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::eval(uu); + const Vec4<T> v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_du(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::derivative(uu); + const Vec4<T> v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dv(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::eval(uu); + const Vec4<T> v_n = BSplineBasis::derivative(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dudu(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::derivative2(uu); + const Vec4<T> v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dvdv(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::eval(uu); + const Vec4<T> v_n = BSplineBasis::derivative2(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dudv(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::derivative(uu); + const Vec4<T> v_n = BSplineBasis::derivative(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> normal(const T& uu, const T& vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const + { + const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); + const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i])))); + const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i])))); + const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); + return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + } + + template<typename vbool, typename vfloat> + void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + const Vec4<vfloat> u_n = BSplineBasis::eval(uu); + const Vec4<vfloat> v_n = BSplineBasis::eval(vv); + for (size_t i=0; i<N; i++) 
vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n)); + } + if (dPdu) + { + { + assert(dPdu); + const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); + const Vec4<vfloat> v_n = BSplineBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + { + assert(dPdv); + const Vec4<vfloat> u_n = BSplineBasis::eval(uu); + const Vec4<vfloat> v_n = BSplineBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + } + if (ddPdudu) + { + { + assert(ddPdudu); + const Vec4<vfloat> u_n = BSplineBasis::derivative2(uu); + const Vec4<vfloat> v_n = BSplineBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdvdv); + const Vec4<vfloat> u_n = BSplineBasis::eval(uu); + const Vec4<vfloat> v_n = BSplineBasis::derivative2(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdudv); + const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); + const Vec4<vfloat> v_n = BSplineBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const BSplinePatchT& p) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + o << "[" << y << "][" << x << "] " << p.v[y][x] << embree_endl; + return o; + } + + public: + Vertex v[4][4]; + }; + + typedef BSplinePatchT<Vec3fa,Vec3fa_t> BSplinePatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h new file mode 100644 index 0000000000..46959797bf --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" + +namespace embree +{ + static const size_t MAX_PATCH_VALENCE = 16; //!< maximum number of vertices of a patch + static const size_t MAX_RING_FACE_VALENCE = 64; //!< maximum number of faces per ring + static const size_t MAX_RING_EDGE_VALENCE = 2*64; //!< maximum number of edges per ring + + class CatmullClarkPrecomputedCoefficients + { + private: + + float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1]; + + float* table_limittangent_a[MAX_RING_FACE_VALENCE+1]; + float* table_limittangent_b[MAX_RING_FACE_VALENCE+1]; + float table_limittangent_c[MAX_RING_FACE_VALENCE+1]; + + __forceinline float set_cos_2PI_div_n(const size_t n) { + if (unlikely(n == 0)) return 1.0f; + return cosf(2.0f*float(pi)/(float)n); + } + + __forceinline float set_limittangent_a(const size_t i, const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); + const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0); + return cosf(2.0f*float(pi)*(float)i/(float)n) * c1; + } + + __forceinline float set_limittangent_b(const size_t i, const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); + return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0; + } + + __forceinline float set_limittangent_c(const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + 
cosf(float(pi)/(float)n) * sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n))); + } + + public: + + __forceinline float cos_2PI_div_n(const size_t n) + { + if (likely(n <= MAX_RING_FACE_VALENCE)) + return table_cos_2PI_div_n[n]; + else + return set_cos_2PI_div_n(n); + } + + __forceinline float limittangent_a(const size_t i, const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + assert(i < n); + return table_limittangent_a[n][i]; + } + + __forceinline float limittangent_b(const size_t i, const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + assert(i < n); + return table_limittangent_b[n][i]; + } + + __forceinline float limittangent_c(const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + return table_limittangent_c[n]; + } + + static CatmullClarkPrecomputedCoefficients table; + + CatmullClarkPrecomputedCoefficients(); + ~CatmullClarkPrecomputedCoefficients(); + }; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree/kernels/subdiv/catmullclark_patch.h new file mode 100644 index 0000000000..91772d94ed --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullclark_patch.h @@ -0,0 +1,562 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_ring.h" +#include "bezier_curve.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) CatmullClarkPatchT + { + public: + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + typedef typename CatmullClark1Ring::Type Type; + + array_t<CatmullClark1RingT<Vertex,Vertex_t>,4> ring; + + public: + __forceinline CatmullClarkPatchT () {} + + __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) { + init(first_half_edge,vertices,stride); + } + + __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) { + init(first_half_edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride) + { + for (unsigned i=0; i<4; i++) + ring[i].init(first_half_edge+i,vertices,stride); + + assert(verify()); + } + + __forceinline size_t bytes() const { + return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes(); + } + + __forceinline void serialize(void* ptr, size_t& ofs) const + { + for (size_t i=0; i<4; i++) + ring[i].serialize((char*)ptr,ofs); + } + + __forceinline void deserialize(void* ptr) + { + size_t ofs = 0; + for (size_t i=0; i<4; i++) + ring[i].deserialize((char*)ptr,ofs); + } + + __forceinline BBox3fa bounds() const + { + BBox3fa bounds (ring[0].bounds()); + for (size_t i=1; i<4; i++) + bounds.extend(ring[i].bounds()); + return bounds; + } + + __forceinline Type type() const + { + const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES; + return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES); + } + + __forceinline bool isFinalResolution(float res) const { + return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res); + } + + static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0, + const CatmullClark1RingT<Vertex,Vertex_t>& p1, + 
CatmullClark1RingT<Vertex,Vertex_t>& dest0, + CatmullClark1RingT<Vertex,Vertex_t>& dest1) + { + assert(p1.face_valence > 2); + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 4; + dest1.edge_valence = dest0.edge_valence = 8; + dest1.border_index = dest0.border_index = -1; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; + dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; + dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; + dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 3; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0, + const CatmullClark1RingT<Vertex,Vertex_t> &p1, + CatmullClark1RingT<Vertex,Vertex_t> &dest0, + CatmullClark1RingT<Vertex,Vertex_t> &dest1) + { + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 3; + dest1.edge_valence = dest0.edge_valence = 6; + dest0.border_index = 2; + dest1.border_index = 4; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 2; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_regular(const Vertex_t ¢er, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest) + { + dest.vertex_level = 0.0f; + dest.face_valence = 4; + dest.edge_valence = 8; + dest.border_index = -1; + dest.vtx = (Vertex_t)center; + dest.vertex_crease_weight = 0.0f; + for (size_t i=0; 
i<8; i++) + dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8]; + for (size_t i=0; i<4; i++) + dest.crease_weight[i] = 0.0f; + + dest.eval_start_index = (8-offset)>>1; + if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; + assert( dest.eval_start_index < dest.face_valence ); + dest.eval_unique_identifier = 0; + } + + __noinline void subdivide(array_t<CatmullClarkPatchT,4>& patch) const + { + ring[0].subdivide(patch[0].ring[0]); + ring[1].subdivide(patch[1].ring[1]); + ring[2].subdivide(patch[2].ring[2]); + ring[3].subdivide(patch[3].ring[3]); + + patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level; + patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level; + + patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level; + patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level; + patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + + patch[2].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level; + patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level; + patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + + patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level; + patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level; + + const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2; + if (likely(regular0)) + init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); + else + init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); + + const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2; + if (likely(regular1)) + init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); + else + init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); + + const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2; + if (likely(regular2)) + init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); + else + init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); + + const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2; + if (likely(regular3)) + init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); + else + init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); + + Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f; + + Vertex_t center_ring[8]; + center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0]; + center_ring[7] = (Vertex_t)patch[3].ring[3].vtx; + center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0]; + center_ring[5] = (Vertex_t)patch[2].ring[2].vtx; + center_ring[4] = (Vertex_t)patch[1].ring[1].ring[0]; + center_ring[3] = (Vertex_t)patch[1].ring[1].vtx; + center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0]; + center_ring[1] = (Vertex_t)patch[0].ring[0].vtx; + + init_regular(center,center_ring,0,patch[0].ring[2]); + init_regular(center,center_ring,2,patch[1].ring[3]); + init_regular(center,center_ring,4,patch[2].ring[0]); + 
init_regular(center,center_ring,6,patch[3].ring[1]); + + assert(patch[0].verify()); + assert(patch[1].verify()); + assert(patch[2].verify()); + assert(patch[3].verify()); + } + + bool verify() const { + return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions(); + } + + __forceinline void init( FinalQuad& quad ) const + { + quad.vtx[0] = (Vertex_t)ring[0].vtx; + quad.vtx[1] = (Vertex_t)ring[1].vtx; + quad.vtx[2] = (Vertex_t)ring[2].vtx; + quad.vtx[3] = (Vertex_t)ring[3].vtx; + }; + + friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p) + { + o << "CatmullClarkPatch { " << embree_endl; + for (size_t i=0; i<4; i++) + o << "ring" << i << ": " << p.ring[i] << embree_endl; + o << "}" << embree_endl; + return o; + } + }; + + typedef CatmullClarkPatchT<Vec3fa,Vec3fa_t> CatmullClarkPatch3fa; + + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) GeneralCatmullClarkPatchT + { + public: + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + typedef BezierCurveT<Vertex> BezierCurve; + + static const unsigned SIZE = MAX_PATCH_VALENCE; + DynamicStackArray<GeneralCatmullClark1RingT<Vertex,Vertex_t>,8,SIZE> ring; + unsigned N; + + __forceinline GeneralCatmullClarkPatchT () + : N(0) {} + + GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) { + init(h,vertices,stride); + } + + __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) { + init(first_half_edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride) + { + unsigned int i = 0; + const HalfEdge* edge = h; + do { + ring[i].init(edge,vertices,stride); + edge = edge->next(); + i++; + } while ((edge != h) && (i < SIZE)); + N = i; + } + + __forceinline unsigned size() const { + return N; + } + + __forceinline bool isQuadPatch() const { + return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads; + } + + static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0, + const CatmullClark1RingT<Vertex,Vertex_t>& p1, + CatmullClark1RingT<Vertex,Vertex_t>& dest0, + CatmullClark1RingT<Vertex,Vertex_t>& dest1) + { + assert(p1.face_valence > 2); + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 4; + dest1.edge_valence = dest0.edge_valence = 8; + dest1.border_index = dest0.border_index = -1; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; + dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; + dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; + dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= 
p1.eval_unique_identifier) + { + dest0.eval_start_index = 3; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + + static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0, + const CatmullClark1RingT<Vertex,Vertex_t> &p1, + CatmullClark1RingT<Vertex,Vertex_t> &dest0, + CatmullClark1RingT<Vertex,Vertex_t> &dest1) + { + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 3; + dest1.edge_valence = dest0.edge_valence = 6; + dest0.border_index = 2; + dest1.border_index = 4; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 2; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_regular(const Vertex_t ¢er, const array_t<Vertex_t,2*SIZE>& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest) + { + assert(N<(MAX_RING_FACE_VALENCE)); + assert(2*N<(MAX_RING_EDGE_VALENCE)); + dest.vertex_level = vertex_level; + dest.face_valence = N; + dest.edge_valence = 2*N; + dest.border_index = -1; + dest.vtx = (Vertex_t)center; + dest.vertex_crease_weight = 0.0f; + for (unsigned i=0; i<2*N; i++) { + dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)]; + assert(isvalid(dest.ring[i])); + } + for (unsigned i=0; i<N; i++) + dest.crease_weight[i] = 0.0f; + + assert(offset <= 2*N); + dest.eval_start_index = (2*N-offset)>>1; + if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; + + assert( dest.eval_start_index < dest.face_valence ); + dest.eval_unique_identifier = 0; + } + + __noinline void subdivide(array_t<CatmullClarkPatch,SIZE>& patch, unsigned& N_o) const + { + N_o = N; + assert( N ); + for (unsigned i=0; i<N; i++) { + unsigned ip1 = (i+1)%N; // FIXME: % + ring[i].subdivide(patch[i].ring[0]); + patch[i] .ring[0].edge_level = 0.5f*ring[i].edge_level; + patch[ip1].ring[3].edge_level = 0.5f*ring[i].edge_level; + + assert( patch[i].ring[0].hasValidPositions() ); + + } + assert(N < 2*SIZE); + Vertex_t center = Vertex_t(0.0f); + array_t<Vertex_t,2*SIZE> center_ring; + float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads + + for 
(unsigned i=0; i<N; i++) + { + unsigned ip1 = (i+1)%N; // FIXME: % + unsigned im1 = (i+N-1)%N; // FIXME: % + bool regular = ring[i].has_last_face() && ring[ip1].face_valence > 2; + if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); + else init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); + + assert( patch[i].ring[1].hasValidPositions() ); + assert( patch[ip1].ring[3].hasValidPositions() ); + + float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level); + patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level; + center_vertex_level = max(center_vertex_level,level); + + center += ring[i].vtx; + center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx; + center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0]; + } + center /= float(N); + + for (unsigned int i=0; i<N; i++) { + init_regular(center,center_ring,center_vertex_level,N,2*i,patch[i].ring[2]); + + assert( patch[i].ring[2].hasValidPositions() ); + } + } + + void init(CatmullClarkPatch& patch) const + { + assert(size() == 4); + ring[0].convert(patch.ring[0]); + ring[1].convert(patch.ring[1]); + ring[2].convert(patch.ring[2]); + ring[3].convert(patch.ring[3]); + } + + static void fix_quad_ring_order (array_t<CatmullClarkPatch,GeneralCatmullClarkPatchT::SIZE>& patches) + { + CatmullClark1Ring patches1ring1 = patches[1].ring[1]; + patches[1].ring[1] = patches[1].ring[0]; // FIXME: optimize these assignments + patches[1].ring[0] = patches[1].ring[3]; + patches[1].ring[3] = patches[1].ring[2]; + patches[1].ring[2] = patches1ring1; + + CatmullClark1Ring patches2ring2 = patches[2].ring[2]; + patches[2].ring[2] = patches[2].ring[0]; + patches[2].ring[0] = patches2ring2; + CatmullClark1Ring patches2ring3 = patches[2].ring[3]; + patches[2].ring[3] = patches[2].ring[1]; + patches[2].ring[1] = patches2ring3; + + CatmullClark1Ring patches3ring3 = patches[3].ring[3]; + patches[3].ring[3] = patches[3].ring[0]; + patches[3].ring[0] = patches[3].ring[1]; + patches[3].ring[1] = patches[3].ring[2]; + patches[3].ring[2] = patches3ring3; + } + + __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const + { + Vertex P0 = ring[0].getLimitVertex(); + for (unsigned i=0; i<N; i++) + { + const unsigned i0 = i, i1 = i+1==N ? 0 : i+1; + const Vertex P1 = madd(1.0f/3.0f,ring[i0].getLimitTangent(),P0); + const Vertex P3 = ring[i1].getLimitVertex(); + const Vertex P2 = madd(1.0f/3.0f,ring[i1].getSecondLimitTangent(),P3); + new (&curves[i]) BezierCurve(P0,P1,P2,P3); + P0 = P3; + } + } + + __forceinline void getLimitBorder(BezierCurve curves[2], const unsigned subPatch) const + { + const unsigned i0 = subPatch; + const Vertex t0_p = ring[i0].getLimitTangent(); + const Vertex t0_m = ring[i0].getSecondLimitTangent(); + + const unsigned i1 = subPatch+1 == N ? 0 : subPatch+1; + const Vertex t1_p = ring[i1].getLimitTangent(); + const Vertex t1_m = ring[i1].getSecondLimitTangent(); + + const unsigned i2 = subPatch == 0 ? 
N-1 : subPatch-1; + const Vertex t2_p = ring[i2].getLimitTangent(); + const Vertex t2_m = ring[i2].getSecondLimitTangent(); + + const Vertex b00 = ring[i0].getLimitVertex(); + const Vertex b03 = ring[i1].getLimitVertex(); + const Vertex b33 = ring[i2].getLimitVertex(); + + const Vertex b01 = madd(1.0/3.0f,t0_p,b00); + const Vertex b11 = madd(1.0/3.0f,t0_m,b00); + + //const Vertex b13 = madd(1.0/3.0f,t1_p,b03); + const Vertex b02 = madd(1.0/3.0f,t1_m,b03); + + const Vertex b22 = madd(1.0/3.0f,t2_p,b33); + const Vertex b23 = madd(1.0/3.0f,t2_m,b33); + + new (&curves[0]) BezierCurve(b00,b01,b02,b03); + new (&curves[1]) BezierCurve(b33,b22,b11,b00); + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClarkPatchT &p) + { + o << "GeneralCatmullClarkPatch { " << embree_endl; + for (unsigned i=0; i<p.N; i++) + o << "ring" << i << ": " << p.ring[i] << embree_endl; + o << "}" << embree_endl; + return o; + } + }; + + typedef GeneralCatmullClarkPatchT<Vec3fa,Vec3fa_t> GeneralCatmullClarkPatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h new file mode 100644 index 0000000000..e5ad5dadfe --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h @@ -0,0 +1,826 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" +#include "../common/buffer.h" +#include "half_edge.h" +#include "catmullclark_coefficients.h" + +namespace embree +{ + struct __aligned(64) FinalQuad { + Vec3fa vtx[4]; + }; + + template<typename Vertex, typename Vertex_t = Vertex> + struct __aligned(64) CatmullClark1RingT + { + ALIGNED_STRUCT_(64); + + int border_index; //!< edge index where border starts + unsigned int face_valence; //!< number of adjacent quad faces + unsigned int edge_valence; //!< number of adjacent edges (2*face_valence) + float vertex_crease_weight; //!< weight of vertex crease (0 if no vertex crease) + DynamicStackArray<float,16,MAX_RING_FACE_VALENCE> crease_weight; //!< edge crease weights for each adjacent edge + float vertex_level; //!< maximum level of all adjacent edges + float edge_level; //!< level of first edge + unsigned int eval_start_index; //!< topology dependent index to start evaluation + unsigned int eval_unique_identifier; //!< topology dependent unique identifier for this ring + Vertex vtx; //!< center vertex + DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; //!< ring of neighboring vertices + + public: + CatmullClark1RingT () + : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty + + /*! calculates number of bytes required to serialize this structure */ + __forceinline size_t bytes() const + { + size_t ofs = 0; + ofs += sizeof(border_index); + ofs += sizeof(face_valence); + assert(2*face_valence == edge_valence); + ofs += sizeof(vertex_crease_weight); + ofs += face_valence*sizeof(float); + ofs += sizeof(vertex_level); + ofs += sizeof(edge_level); + ofs += sizeof(eval_start_index); + ofs += sizeof(eval_unique_identifier); + ofs += sizeof(vtx); + ofs += edge_valence*sizeof(Vertex); + return ofs; + } + + template<typename Ty> + static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) { + *(Ty*)&ptr[ofs] = v; ofs += sizeof(Ty); + } + + template<typename Ty> + static __forceinline void load(char* ptr, size_t& ofs, Ty& v) { + v = *(Ty*)&ptr[ofs]; ofs += sizeof(Ty); + } + + /*! 
serializes the ring to some memory location */ + __forceinline void serialize(char* ptr, size_t& ofs) const + { + store(ptr,ofs,border_index); + store(ptr,ofs,face_valence); + store(ptr,ofs,vertex_crease_weight); + for (size_t i=0; i<face_valence; i++) + store(ptr,ofs,crease_weight[i]); + store(ptr,ofs,vertex_level); + store(ptr,ofs,edge_level); + store(ptr,ofs,eval_start_index); + store(ptr,ofs,eval_unique_identifier); + Vertex_t::storeu(&ptr[ofs],vtx); ofs += sizeof(Vertex); + for (size_t i=0; i<edge_valence; i++) { + Vertex_t::storeu(&ptr[ofs],ring[i]); ofs += sizeof(Vertex); + } + } + + /*! deserializes the ring from some memory location */ + __forceinline void deserialize(char* ptr, size_t& ofs) + { + load(ptr,ofs,border_index); + load(ptr,ofs,face_valence); + edge_valence = 2*face_valence; + load(ptr,ofs,vertex_crease_weight); + for (size_t i=0; i<face_valence; i++) + load(ptr,ofs,crease_weight[i]); + load(ptr,ofs,vertex_level); + load(ptr,ofs,edge_level); + load(ptr,ofs,eval_start_index); + load(ptr,ofs,eval_unique_identifier); + vtx = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex); + for (size_t i=0; i<edge_valence; i++) { + ring[i] = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex); + } + } + + __forceinline bool hasBorder() const { + return border_index != -1; + } + + __forceinline const Vertex& front(size_t i) const { + assert(edge_valence>i); + return ring[i]; + } + + __forceinline const Vertex& back(size_t i) const { + assert(edge_valence>=i); + return ring[edge_valence-i]; + } + + __forceinline bool has_last_face() const { + return (size_t)border_index != (size_t)edge_valence-2; + } + + __forceinline bool has_opposite_front(size_t i) const { + return (size_t)border_index != 2*i; + } + + __forceinline bool has_opposite_back(size_t i) const { + return (size_t)border_index != ((size_t)edge_valence-2-2*i); + } + + __forceinline BBox3fa bounds() const + { + BBox3fa bounds ( vtx ); + for (size_t i = 0; i<edge_valence ; i++) + bounds.extend( ring[i] ); + return bounds; + } + + /*! initializes the ring from the half edge structure */ + __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) + { + border_index = -1; + vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride); + vertex_crease_weight = h->vertex_crease_weight; + + HalfEdge* p = (HalfEdge*) h; + + unsigned i=0; + unsigned min_vertex_index = (unsigned)-1; + unsigned min_vertex_index_face = (unsigned)-1; + edge_level = p->edge_level; + vertex_level = 0.0f; + + do + { + vertex_level = max(vertex_level,p->edge_level); + crease_weight[i/2] = p->edge_crease_weight; + assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); + + /* store first two vertices of face */ + p = p->next(); + const unsigned index0 = p->getStartVertexIndex(); + ring[i++] = Vertex_t::loadu(vertices+index0*stride); + if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } + p = p->next(); + + const unsigned index1 = p->getStartVertexIndex(); + ring[i++] = Vertex_t::loadu(vertices+index1*stride); + p = p->next(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + /* find minimum start vertex */ + const unsigned index0 = p->getStartVertexIndex(); + if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } + + /*! 
mark first border edge and store dummy vertex for face between the two border edges */ + border_index = i; + crease_weight[i/2] = inf; + ring[i++] = Vertex_t::loadu(vertices+index0*stride); + ring[i++] = vtx; // dummy vertex + + /*! goto other side of border */ + p = (HalfEdge*) h; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != h); + + edge_valence = i; + face_valence = i >> 1; + eval_unique_identifier = min_vertex_index; + eval_start_index = min_vertex_index_face; + + assert( hasValidPositions() ); + } + + __forceinline void subdivide(CatmullClark1RingT& dest) const + { + dest.edge_level = 0.5f*edge_level; + dest.vertex_level = 0.5f*vertex_level; + dest.face_valence = face_valence; + dest.edge_valence = edge_valence; + dest.border_index = border_index; + dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); + dest.eval_start_index = eval_start_index; + dest.eval_unique_identifier = eval_unique_identifier; + + /* calculate face points */ + Vertex_t S = Vertex_t(0.0f); + for (size_t i=0; i<face_valence; i++) + { + size_t face_index = i + eval_start_index; if (face_index >= face_valence) face_index -= face_valence; assert(face_index < face_valence); + size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence); + size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence); + size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence); + S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f; + } + + /* calculate new edge points */ + size_t num_creases = 0; + array_t<size_t,MAX_RING_FACE_VALENCE> crease_id; + + for (size_t i=0; i<face_valence; i++) + { + size_t face_index = i + eval_start_index; + if (face_index >= face_valence) face_index -= face_valence; + const float edge_crease = crease_weight[face_index]; + dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f); + + size_t index = 2*face_index; + size_t prev_index = face_index == 0 ? 
edge_valence-1 : 2*face_index-1; + size_t next_index = 2*face_index+1; + + const Vertex_t v = vtx + ring[index]; + const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index]; + S += ring[index]; + + /* fast path for regular edge points */ + if (likely(edge_crease <= 0.0f)) { + dest.ring[index] = (v+f) * 0.25f; + } + + /* slower path for hard edge rule */ + else { + crease_id[num_creases++] = face_index; + dest.ring[index] = v*0.5f; + + /* even slower path for blended edge rule */ + if (unlikely(edge_crease < 1.0f)) { + dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease); + } + } + } + + /* compute new vertex using smooth rule */ + const float inv_face_valence = 1.0f / (float)face_valence; + const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence; + dest.vtx = v_smooth; + + /* compute new vertex using vertex_crease_weight rule */ + if (unlikely(vertex_crease_weight > 0.0f)) + { + if (vertex_crease_weight >= 1.0f) { + dest.vtx = vtx; + } else { + dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight); + } + return; + } + + /* no edge crease rule and dart rule */ + if (likely(num_creases <= 1)) + return; + + /* compute new vertex using crease rule */ + if (likely(num_creases == 2)) + { + /* update vertex using crease rule */ + const size_t crease0 = crease_id[0], crease1 = crease_id[1]; + const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f); + dest.vtx = v_sharp; + + /* update crease_weights using chaikin rule */ + const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1]; + dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); + dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); + + /* interpolate between sharp and smooth rule */ + const float v_blend = 0.5f*(crease_weight0+crease_weight1); + if (unlikely(v_blend < 1.0f)) { + dest.vtx = lerp(v_smooth,v_sharp,v_blend); + } + } + + /* compute new vertex using corner rule */ + else { + dest.vtx = vtx; + } + } + + __forceinline bool isRegular1() const + { + if (border_index == -1) { + if (face_valence == 4) return true; + } else { + if (face_valence < 4) return true; + } + return false; + } + + __forceinline size_t numEdgeCreases() const + { + ssize_t numCreases = 0; + for (size_t i=0; i<face_valence; i++) { + numCreases += crease_weight[i] > 0.0f; + } + return numCreases; + } + + enum Type { + TYPE_NONE = 0, //!< invalid type + TYPE_REGULAR = 1, //!< regular patch when ignoring creases + TYPE_REGULAR_CREASES = 2, //!< regular patch when considering creases + TYPE_GREGORY = 4, //!< gregory patch when ignoring creases + TYPE_GREGORY_CREASES = 8, //!< gregory patch when considering creases + TYPE_CREASES = 16 //!< patch has crease features + }; + + __forceinline Type type() const + { + /* check if there is an edge crease anywhere */ + const size_t numCreases = numEdgeCreases(); + const bool noInnerCreases = hasBorder() ? 
numCreases == 2 : numCreases == 0; + + Type crease_mask = (Type) (TYPE_REGULAR | TYPE_GREGORY); + if (noInnerCreases ) crease_mask = (Type) (crease_mask | TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES); + if (numCreases != 0) crease_mask = (Type) (crease_mask | TYPE_CREASES); + + /* calculate if this vertex is regular */ + bool hasBorder = border_index != -1; + if (face_valence == 2 && hasBorder) { + if (vertex_crease_weight == 0.0f ) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else if (vertex_crease_weight == float(inf)) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else return TYPE_CREASES; + } + else if (vertex_crease_weight != 0.0f) return TYPE_CREASES; + else if (face_valence == 3 && hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else if (face_valence == 4 && !hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else return (Type) (crease_mask & (TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + } + + __forceinline bool isFinalResolution(float res) const { + return vertex_level <= res; + } + + /* computes the limit vertex */ + __forceinline Vertex getLimitVertex() const + { + /* return hard corner */ + if (unlikely(std::isinf(vertex_crease_weight))) + return vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f; + } + + Vertex_t F( 0.0f ); + Vertex_t E( 0.0f ); + + assert(eval_start_index < face_valence); + + for (size_t i=0; i<face_valence; i++) { + size_t index = i+eval_start_index; + if (index >= face_valence) index -= face_valence; + F += ring[2*index+1]; + E += ring[2*index]; + } + + const float n = (float)face_valence; + return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n); + } + + /* gets limit tangent in the direction of egde vtx -> ring[0] */ + __forceinline Vertex getLimitTangent() const + { + if (unlikely(std::isinf(vertex_crease_weight))) + return ring[0] - vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + if (border_index != (int)edge_valence-2 ) { + return ring[0] - vtx; + } + else + { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 
0 : border_index+2; + return (ring[second_border_index] - ring[border_index]) * 0.5f; + } + } + + Vertex_t alpha( 0.0f ); + Vertex_t beta ( 0.0f ); + + const size_t n = face_valence; + + assert(eval_start_index < face_valence); + + Vertex_t q( 0.0f ); + for (size_t i=0; i<face_valence; i++) + { + size_t index = i+eval_start_index; + if (index >= face_valence) index -= face_valence; + const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n); + const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n); + alpha += a * ring[2*index]; + beta += b * ring[2*index+1]; + } + + const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); + return sigma * (alpha + beta); + } + + /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + __forceinline Vertex getSecondLimitTangent() const + { + if (unlikely(std::isinf(vertex_crease_weight))) + return ring[2] - vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + if (border_index != 2) { + return ring[2] - vtx; + } + else { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (ring[border_index] - ring[second_border_index]) * 0.5f; + } + } + + Vertex_t alpha( 0.0f ); + Vertex_t beta ( 0.0f ); + + const size_t n = face_valence; + + assert(eval_start_index < face_valence); + + for (size_t i=0; i<face_valence; i++) + { + size_t index = i+eval_start_index; + if (index >= face_valence) index -= face_valence; + + size_t prev_index = index == 0 ? face_valence-1 : index-1; // need to be bit-wise exact in cosf eval + const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n); + const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n); + alpha += a * ring[2*index]; + beta += b * ring[2*index+1]; + } + + const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); + return sigma* (alpha + beta); + } + + /* gets surface normal */ + const Vertex getNormal() const { + return cross(getLimitTangent(),getSecondLimitTangent()); + } + + /* returns center of the n-th quad in the 1-ring */ + __forceinline Vertex getQuadCenter(const size_t index) const + { + const Vertex_t &p0 = vtx; + const Vertex_t &p1 = ring[2*index+0]; + const Vertex_t &p2 = ring[2*index+1]; + const Vertex_t &p3 = index == face_valence-1 ? 
ring[0] : ring[2*index+2]; + const Vertex p = (p0+p1+p2+p3) * 0.25f; + return p; + } + + /* returns center of the n-th edge in the 1-ring */ + __forceinline Vertex getEdgeCenter(const size_t index) const { + return (vtx + ring[index*2]) * 0.5f; + } + + bool hasValidPositions() const + { + for (size_t i=0; i<edge_valence; i++) { + if (!isvalid(ring[i])) + return false; + } + return true; + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClark1RingT &c) + { + o << "vtx " << c.vtx << " size = " << c.edge_valence << ", " << + "hard_edge = " << c.border_index << ", face_valence " << c.face_valence << + ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", eval_start_index: " << c.eval_start_index << ", ring: " << embree_endl; + + for (unsigned int i=0; i<min(c.edge_valence,(unsigned int)MAX_RING_FACE_VALENCE); i++) { + o << i << " -> " << c.ring[i]; + if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2]; + o << embree_endl; + } + return o; + } + }; + + typedef CatmullClark1RingT<Vec3fa,Vec3fa_t> CatmullClark1Ring3fa; + + template<typename Vertex, typename Vertex_t = Vertex> + struct __aligned(64) GeneralCatmullClark1RingT + { + ALIGNED_STRUCT_(64); + + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + + struct Face + { + __forceinline Face() {} + __forceinline Face (int size, float crease_weight) + : size(size), crease_weight(crease_weight) {} + + // FIXME: add member that returns total number of vertices + + int size; // number of vertices-2 of nth face in ring + float crease_weight; + }; + + Vertex vtx; + DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; + DynamicStackArray<Face,16,MAX_RING_FACE_VALENCE> faces; + unsigned int face_valence; + unsigned int edge_valence; + int border_face; + float vertex_crease_weight; + float vertex_level; //!< maximum level of adjacent edges + float edge_level; // level of first edge + bool only_quads; // true if all faces are quads + unsigned int eval_start_face_index; + unsigned int eval_start_vertex_index; + unsigned int eval_unique_identifier; + + public: + GeneralCatmullClark1RingT() + : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {} + + __forceinline bool isRegular() const + { + if (border_face == -1 && face_valence == 4) return true; + return false; + } + + __forceinline bool has_last_face() const { + return border_face != (int)face_valence-1; + } + + __forceinline bool has_second_face() const { + return (border_face == -1) || (border_face >= 2); + } + + bool hasValidPositions() const + { + for (size_t i=0; i<edge_valence; i++) { + if (!isvalid(ring[i])) + return false; + } + return true; + } + + __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) + { + only_quads = true; + border_face = -1; + vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride); + vertex_crease_weight = h->vertex_crease_weight; + HalfEdge* p = (HalfEdge*) h; + + unsigned int e=0, f=0; + unsigned min_vertex_index = (unsigned)-1; + unsigned min_vertex_index_face = (unsigned)-1; + unsigned min_vertex_index_vertex = (unsigned)-1; + edge_level = p->edge_level; + vertex_level = 0.0f; + do + { + HalfEdge* p_prev = p->prev(); + HalfEdge* p_next = p->next(); + const float crease_weight = p->edge_crease_weight; + assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); + vertex_level = max(vertex_level,p->edge_level); + + /* find minimum start vertex */ + unsigned vertex_index = p_next->getStartVertexIndex(); + if 
(vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } + + /* store first N-2 vertices of face */ + unsigned int vn = 0; + for (p = p_next; p!=p_prev; p=p->next()) { + ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); + vn++; + } + faces[f++] = Face(vn,crease_weight); + only_quads &= (vn == 2); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + /* find minimum start vertex */ + unsigned vertex_index = p->getStartVertexIndex(); + if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } + + /*! mark first border edge and store dummy vertex for face between the two border edges */ + border_face = f; + faces[f++] = Face(2,inf); + ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); + ring[e++] = vtx; // dummy vertex + + /*! goto other side of border */ + p = (HalfEdge*) h; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != h); + + edge_valence = e; + face_valence = f; + eval_unique_identifier = min_vertex_index; + eval_start_face_index = min_vertex_index_face; + eval_start_vertex_index = min_vertex_index_vertex; + + assert( hasValidPositions() ); + } + + __forceinline void subdivide(CatmullClark1Ring& dest) const + { + dest.edge_level = 0.5f*edge_level; + dest.vertex_level = 0.5f*vertex_level; + dest.face_valence = face_valence; + dest.edge_valence = 2*face_valence; + dest.border_index = border_face == -1 ? -1 : 2*border_face; // FIXME: + dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); + dest.eval_start_index = eval_start_face_index; + dest.eval_unique_identifier = eval_unique_identifier; + assert(dest.face_valence <= MAX_RING_FACE_VALENCE); + + /* calculate face points */ + Vertex_t S = Vertex_t(0.0f); + for (size_t face=0, v=eval_start_vertex_index; face<face_valence; face++) { + size_t f = (face + eval_start_face_index)%face_valence; + + Vertex_t F = vtx; + for (size_t k=v; k<=v+faces[f].size; k++) F += ring[k%edge_valence]; // FIXME: optimize + S += dest.ring[2*f+1] = F/float(faces[f].size+2); + v+=faces[f].size; + v%=edge_valence; + } + + /* calculate new edge points */ + size_t num_creases = 0; + array_t<size_t,MAX_RING_FACE_VALENCE> crease_id; + Vertex_t C = Vertex_t(0.0f); + for (size_t face=0, j=eval_start_vertex_index; face<face_valence; face++) + { + size_t i = (face + eval_start_face_index)%face_valence; + + const Vertex_t v = vtx + ring[j]; + Vertex_t f = dest.ring[2*i+1]; + if (i == 0) f += dest.ring[dest.edge_valence-1]; + else f += dest.ring[2*i-1]; + S += ring[j]; + dest.crease_weight[i] = max(faces[i].crease_weight-1.0f,0.0f); + + /* fast path for regular edge points */ + if (likely(faces[i].crease_weight <= 0.0f)) { + dest.ring[2*i] = (v+f) * 0.25f; + } + + /* slower path for hard edge rule */ + else { + C += ring[j]; crease_id[num_creases++] = i; + dest.ring[2*i] = v*0.5f; + + /* even slower path for blended edge rule */ + if (unlikely(faces[i].crease_weight < 1.0f)) { + dest.ring[2*i] = lerp((v+f)*0.25f,v*0.5f,faces[i].crease_weight); + } + } + j+=faces[i].size; + j%=edge_valence; + } + + /* compute new vertex using smooth rule */ + const float inv_face_valence = 1.0f / (float)face_valence; + const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence; + dest.vtx = 
v_smooth; + + /* compute new vertex using vertex_crease_weight rule */ + if (unlikely(vertex_crease_weight > 0.0f)) + { + if (vertex_crease_weight >= 1.0f) { + dest.vtx = vtx; + } else { + dest.vtx = lerp(vtx,v_smooth,vertex_crease_weight); + } + return; + } + + if (likely(num_creases <= 1)) + return; + + /* compute new vertex using crease rule */ + if (likely(num_creases == 2)) { + const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f); + const float crease_weight0 = faces[crease_id[0]].crease_weight; + const float crease_weight1 = faces[crease_id[1]].crease_weight; + dest.vtx = v_sharp; + dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); + dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); + const float v_blend = 0.5f*(crease_weight0+crease_weight1); + if (unlikely(v_blend < 1.0f)) { + dest.vtx = lerp(v_sharp,v_smooth,v_blend); + } + } + + /* compute new vertex using corner rule */ + else { + dest.vtx = vtx; + } + } + + void convert(CatmullClark1Ring& dst) const + { + dst.edge_level = edge_level; + dst.vertex_level = vertex_level; + dst.vtx = vtx; + dst.face_valence = face_valence; + dst.edge_valence = 2*face_valence; + dst.border_index = border_face == -1 ? -1 : 2*border_face; + for (size_t i=0; i<face_valence; i++) + dst.crease_weight[i] = faces[i].crease_weight; + dst.vertex_crease_weight = vertex_crease_weight; + for (size_t i=0; i<edge_valence; i++) dst.ring[i] = ring[i]; + + dst.eval_start_index = eval_start_face_index; + dst.eval_unique_identifier = eval_unique_identifier; + + assert( dst.hasValidPositions() ); + } + + + /* gets limit tangent in the direction of egde vtx -> ring[0] */ + __forceinline Vertex getLimitTangent() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + { + convert(cc_vtx); + return cc_vtx.getLimitTangent(); + } + + subdivide(cc_vtx); + return 2.0f * cc_vtx.getLimitTangent(); + } + + /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + __forceinline Vertex getSecondLimitTangent() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + { + convert(cc_vtx); + return cc_vtx.getSecondLimitTangent(); + } + + subdivide(cc_vtx); + return 2.0f * cc_vtx.getSecondLimitTangent(); + } + + + /* gets limit vertex */ + __forceinline Vertex getLimitVertex() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + convert(cc_vtx); + else + subdivide(cc_vtx); + return cc_vtx.getLimitVertex(); + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c) + { + o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence << + ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl; + for (size_t v=0, f=0; f<c.face_valence; v+=c.faces[f++].size) { + for (size_t i=v; i<v+c.faces[f].size; i++) { + o << i << " -> " << c.ring[i]; + if (i == v) o << " crease = " << c.faces[f].crease_weight; + o << embree_endl; + } + } + return o; + } + }; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h new file mode 100644 index 0000000000..74fc4c1230 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h @@ -0,0 +1,297 @@ +// Copyright 2009-2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene_curves.h" + +/* + + Implements Catmul Rom curves with control points p0, p1, p2, p3. At + t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1 + the curve goes through p2 with tangent (p3-p2)/2. + + */ + +namespace embree +{ + class CatmullRomBasis + { + public: + + template<typename T> + static __forceinline Vec4<T> eval(const T& u) + { + const T t = u; + const T s = T(1.0f) - u; + const T n0 = - t * s * s; + const T n1 = 2.0f + t * t * (3.0f * t - 5.0f); + const T n2 = 2.0f + s * s * (3.0f * s - 5.0f); + const T n3 = - s * t * t; + return T(0.5f) * Vec4<T>(n0, n1, n2, n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = - s * s + 2.0f * s * t; + const T n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t; + const T n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s; + const T n3 = -2.0f * s * t + t * t; + return T(0.5f) * Vec4<T>(n0, n1, n2, n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative2(const T& u) + { + const T t = u; + const T n0 = -3.0f * t + 2.0f; + const T n1 = 9.0f * t - 5.0f; + const T n2 = -9.0f * t + 4.0f; + const T n3 = 3.0f * t - 1.0f; + return Vec4<T>(n0, n1, n2, n3); + } + }; + + struct PrecomputedCatmullRomBasis + { + enum { N = 16 }; + public: + PrecomputedCatmullRomBasis() {} + PrecomputedCatmullRomBasis(int shift); + + /* basis for bspline evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bspline derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedCatmullRomBasis catmullrom_basis0; + extern PrecomputedCatmullRomBasis catmullrom_basis1; + + template<typename Vertex> + struct CatmullRomCurveT + { + Vertex v0,v1,v2,v3; + + __forceinline CatmullRomCurveT() {} + + __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + } + + __forceinline Vertex end() const { + return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline BBox<Vertex> bounds() const { + return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); + } + + __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) { + return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline CatmullRomCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,v0-p), v0.w); + const Vec3ff q1(xfmVector(space,v1-p), v1.w); + const Vec3ff q2(xfmVector(space,v2-p), v2.w); + const Vec3ff q3(xfmVector(space,v3-p), v3.w); + return CatmullRomCurveT<Vec3ff>(q0,q1,q2,q3); + } + + __forceinline Vertex eval(const float t) const + { + const Vec4<float> b = CatmullRomBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4<float> b = CatmullRomBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4<float> b = 
CatmullRomBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + p = eval(t); + dp = eval_du(t); + ddp = eval_dudu(t); + } + + template<int M> + __forceinline Vec4vf<M> veval(const vfloat<M>& t) const + { + const Vec4vf<M> b = CatmullRomBasis::eval(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const + { + const Vec4vf<M> b = CatmullRomBasis::derivative(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const + { + const Vec4vf<M> b = CatmullRomBasis::derivative2(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const + { + p = veval<M>(t); + dp = veval_du<M>(t); + } + + template<int M> + __forceinline Vec4vf<M> eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + /* calculates bounds of catmull-rom curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0<VSIZEX>(i,N); + const Vec4vfx dp = derivative0<VSIZEX>(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = 
p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + const Vec3ff pe = end(); + return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<=N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) <= vintx(N); + const Vec4vfx pi = eval0<VSIZEX>(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(lower,upper),upper_r); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) { + return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + + template<typename CurveGeometry> + __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve) + { + return CatmullRomCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } + + typedef CatmullRomCurveT<Vec3fa> CatmullRomCurve3fa; +} + diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h new file mode 100644 index 0000000000..58c0b63e62 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h @@ -0,0 +1,226 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" + +namespace embree +{ + namespace isa + { + template<typename Vertex, typename Vertex_t = Vertex> + struct FeatureAdaptiveEval + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch; + typedef 
BezierPatchT<Vertex,Vertex_t> BezierPatch; + typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch; + typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch; + typedef BezierCurveT<Vertex> BezierCurve; + + public: + + FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; + case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + eval(patch,Vec2f(u,v),0); + break; + } + } + } + + FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + eval(patch,Vec2f(u,v),dscale,depth); + } + + void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE>& patches, const Vec2f& uv, size_t depth) + { + float u = uv.x, v = uv.y; + if (v < 0.5f) { + if (u < 0.5f) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,0); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = dpdx; *dPdv = dpdy; + } + } + else { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,1); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = -dpdy; *dPdv = dpdx; + } + } + } else { + if (u > 0.5f) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,2); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = -dpdx; *dPdv = -dpdy; + } + } + else { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,3); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, 
nullptr, &border2r); +#else + eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = dpdy; *dPdv = -dpdx; + } + } + } + } + + __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const int max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=(size_t)max_eval_depth; +//#else + return depth>=(size_t)max_eval_depth; +//#endif + } + + void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, + BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) + { + while (true) + { + typename CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(234423,c,c,-1); + return; + } else { + IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(34534,c,-1,c); + return; + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); + RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(43524,c,c,-1); + return; + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); + GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(23498,c,-1,c); + return; + } +#endif + else + { + array_t<CatmullClarkPatch,4> patches; + patch.subdivide(patches); // FIXME: only have to generate one of the patches + + const float u = uv.x, v = uv.y; + if (v < 0.5f) { + if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; } + else { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; } + } else { + if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; } + else { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); dscale *= 2.0f; } + } + depth++; + } + } + } + + void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch qpatch; patch.init(qpatch); + return eval(qpatch,uv,1.0f,depth); + } + + /* subdivide patch */ + unsigned N; + array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; + patch.subdivide(patches,N); // FIXME: only have to generate one of the patches + + /* parametrization for quads */ + if (N == 4) + eval_general_quad(patch,patches,uv,depth); + + /* parametrization for arbitrary polygons */ + else + { + const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; + const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; + const unsigned i = 4*h+l; assert(i<N); + if (i >= N) return; + +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,i); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; 
borders[1].subdivide(border2l,border2r); + eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[i],Vec2f(u,v),1.0f,depth+1); +#endif + } + } + + private: + Vertex* const P; + Vertex* const dPdu; + Vertex* const dPdv; + Vertex* const ddPdudu; + Vertex* const ddPdvdv; + Vertex* const ddPdudv; + }; + } +} diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h new file mode 100644 index 0000000000..4755aba28d --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h @@ -0,0 +1,359 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "catmullclark_patch.h" +#include "bspline_patch.h" +#include "gregory_patch.h" +#include "tessellation.h" + +namespace embree +{ + namespace isa + { + struct FeatureAdaptiveEvalGrid + { + typedef CatmullClark1Ring3fa CatmullClarkRing; + typedef CatmullClarkPatch3fa CatmullClarkPatch; + typedef BilinearPatch3fa BilinearPatch; + typedef BSplinePatch3fa BSplinePatch; + typedef BezierPatch3fa BezierPatch; + typedef GregoryPatch3fa GregoryPatch; + + private: + const unsigned x0,x1; + const unsigned y0,y1; + const unsigned swidth,sheight; + const float rcp_swidth, rcp_sheight; + float* const Px; + float* const Py; + float* const Pz; + float* const U; + float* const V; + float* const Nx; + float* const Ny; + float* const Nz; + const unsigned dwidth; + //const unsigned dheight; + unsigned count; + + + public: + FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) + { + assert(swidth < (2<<20) && sheight < (2<<20)); + const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); + const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1)); + + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch3fa qpatch; patch.init(qpatch); + eval(qpatch, srange, erange, 0); + assert(count == (x1-x0+1)*(y1-y0+1)); + return; + } + + /* subdivide patch */ + unsigned N; + array_t<CatmullClarkPatch3fa,GeneralCatmullClarkPatch3fa::SIZE> patches; + patch.subdivide(patches,N); + + if (N == 4) + { + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + +#if PATCH_USE_GREGORY == 2 + BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders); + BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r); + GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); 
+ eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r); + eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr); + eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr); + eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l); +#else + GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); + eval(patches[0],srange0,intersect(srange0,erange),1); + eval(patches[1],srange1,intersect(srange1,erange),1); + eval(patches[2],srange2,intersect(srange2,erange),1); + eval(patches[3],srange3,intersect(srange3,erange),1); +#endif + } + else + { + assert(subPatch < N); + +#if PATCH_USE_GREGORY == 2 + BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch); + BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[subPatch], srange, erange, 1); +#endif + + } + assert(count == (x1-x0+1)*(y1-y0+1)); + } + + FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch, + const BBox2f& srange, const BBox2f& erange, const unsigned depth, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) + { + eval(patch,srange,erange,depth); + } + + template<typename Patch> + void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) + { + const float scale_x = rcp(srange.upper.x-srange.lower.x); + const float scale_y = rcp(srange.upper.y-srange.lower.y); + count += (lx1-lx0)*(ly1-ly0); + +#if 0 + for (unsigned iy=ly0; iy<ly1; iy++) { + for (unsigned ix=lx0; ix<lx1; ix++) { + const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x); + const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y); + const Vec3fa p = patch.eval(lu,lv); + const float u = float(ix)*rcp_swidth; + const float v = float(iy)*rcp_sheight; + const int ofs = (iy-y0)*dwidth+(ix-x0); + Px[ofs] = p.x; + Py[ofs] = p.y; + Pz[ofs] = p.z; + U[ofs] = u; + V[ofs] = v; + } + } +#else + foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const Vec3vfx p = patch.eval(lu,lv); + Vec3vfx n = zero; + if (unlikely(Nx != nullptr)) n = normalize_safe(patch.normal(lu,lv)); + const vfloatx u = vfloatx(ix)*rcp_swidth; + const vfloatx v = vfloatx(iy)*rcp_sheight; + const vintx ofs = (iy-y0)*dwidth+(ix-x0); + if (likely(all(valid)) && all(iy==iy[0])) { + const unsigned ofs2 = ofs[0]; + vfloatx::storeu(Px+ofs2,p.x); + vfloatx::storeu(Py+ofs2,p.y); + vfloatx::storeu(Pz+ofs2,p.z); + vfloatx::storeu(U+ofs2,u); + vfloatx::storeu(V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(Nx+ofs2,n.x); + 
vfloatx::storeu(Ny+ofs2,n.y); + vfloatx::storeu(Nz+ofs2,n.z); + } + } else { + foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { + const unsigned ofs2 = ofs[j]-j; + vfloatx::storeu(valid,Px+ofs2,p.x); + vfloatx::storeu(valid,Py+ofs2,p.y); + vfloatx::storeu(valid,Pz+ofs2,p.z); + vfloatx::storeu(valid,U+ofs2,u); + vfloatx::storeu(valid,V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(valid,Nx+ofs2,n.x); + vfloatx::storeu(valid,Ny+ofs2,n.y); + vfloatx::storeu(valid,Nz+ofs2,n.z); + } + }); + } + }); +#endif + } + + __forceinline bool final(const CatmullClarkPatch3fa& patch, const CatmullClarkRing::Type type, unsigned depth) + { + const unsigned max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, + const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr) + { + if (erange.empty()) + return; + + int lx0 = (int) ceilf(erange.lower.x); + int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); + int ly0 = (int) ceilf(erange.lower.y); + int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); + if (lx0 >= lx1 || ly0 >= ly1) return; + + CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch rpatch(patch,border0,border1,border2,border3); + evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); + return; + } else { + IrregularFillPatch ipatch(patch,border0,border1,border2,border3); + evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1); + return; + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); + RegularPatch rpatch(patch,border0,border1,border2,border3); + evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); + return; + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); + GregoryPatch gpatch(patch,border0,border1,border2,border3); + evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1); + } +#endif + else + { + array_t<CatmullClarkPatch3fa,4> patches; + patch.subdivide(patches); + + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + + eval(patches[0],srange0,intersect(srange0,erange),depth+1); + eval(patches[1],srange1,intersect(srange1,erange),depth+1); + eval(patches[2],srange2,intersect(srange2,erange),depth+1); + eval(patches[3],srange3,intersect(srange3,erange),depth+1); + } + } + }; + + template<typename Eval, typename Patch> + bool stitch_col(const Patch& patch, int subPatch, + const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight) + { + assert(coarse_y <= fine_y); + if (likely(fine_y == coarse_y)) + return 
false; + + const unsigned y0s = stitch(y0,fine_y,coarse_y); + const unsigned y1s = stitch(y1,fine_y,coarse_y); + const unsigned M = y1s-y0s+1 + VSIZEX; + + dynamic_large_stack_array(float,px,M,64*sizeof(float)); + dynamic_large_stack_array(float,py,M,64*sizeof(float)); + dynamic_large_stack_array(float,pz,M,64*sizeof(float)); + dynamic_large_stack_array(float,u,M,64*sizeof(float)); + dynamic_large_stack_array(float,v,M,64*sizeof(float)); + dynamic_large_stack_array(float,nx,M,64*sizeof(float)); + dynamic_large_stack_array(float,ny,M,64*sizeof(float)); + dynamic_large_stack_array(float,nz,M,64*sizeof(float)); + const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); + Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, + has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097); + + for (unsigned y=y0; y<=y1; y++) + { + const unsigned ys = stitch(y,fine_y,coarse_y)-y0s; + Px[(y-y0)*dwidth+dx0] = px[ys]; + Py[(y-y0)*dwidth+dx0] = py[ys]; + Pz[(y-y0)*dwidth+dx0] = pz[ys]; + U [(y-y0)*dwidth+dx0] = u[ys]; + V [(y-y0)*dwidth+dx0] = v[ys]; + if (unlikely(has_Nxyz)) { + Nx[(y-y0)*dwidth+dx0] = nx[ys]; + Ny[(y-y0)*dwidth+dx0] = ny[ys]; + Nz[(y-y0)*dwidth+dx0] = nz[ys]; + } + } + return true; + } + + template<typename Eval, typename Patch> + bool stitch_row(const Patch& patch, int subPatch, + const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight) + { + assert(coarse_x <= fine_x); + if (likely(fine_x == coarse_x)) + return false; + + const unsigned x0s = stitch(x0,fine_x,coarse_x); + const unsigned x1s = stitch(x1,fine_x,coarse_x); + const unsigned M = x1s-x0s+1 + VSIZEX; + + dynamic_large_stack_array(float,px,M,32*sizeof(float)); + dynamic_large_stack_array(float,py,M,32*sizeof(float)); + dynamic_large_stack_array(float,pz,M,32*sizeof(float)); + dynamic_large_stack_array(float,u,M,32*sizeof(float)); + dynamic_large_stack_array(float,v,M,32*sizeof(float)); + dynamic_large_stack_array(float,nx,M,32*sizeof(float)); + dynamic_large_stack_array(float,ny,M,32*sizeof(float)); + dynamic_large_stack_array(float,nz,M,32*sizeof(float)); + const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); + Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, + has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? 
(float*)nz : nullptr, 4097,1); + + for (unsigned x=x0; x<=x1; x++) + { + const unsigned xs = stitch(x,fine_x,coarse_x)-x0s; + Px[dy0*dwidth+x-x0] = px[xs]; + Py[dy0*dwidth+x-x0] = py[xs]; + Pz[dy0*dwidth+x-x0] = pz[xs]; + U [dy0*dwidth+x-x0] = u[xs]; + V [dy0*dwidth+x-x0] = v[xs]; + if (unlikely(has_Nxyz)) { + Nx[dy0*dwidth+x-x0] = nx[xs]; + Ny[dy0*dwidth+x-x0] = ny[xs]; + Nz[dy0*dwidth+x-x0] = nz[xs]; + } + } + return true; + } + + template<typename Eval, typename Patch> + void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4], + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight) + { + bool sl = false, sr = false, st = false, sb = false; + if (levels) { + sl = x0 == 0 && stitch_col<Eval,Patch>(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); + sr = x1 == swidth-1 && stitch_col<Eval,Patch>(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight); + st = y0 == 0 && stitch_row<Eval,Patch>(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); + sb = y1 == sheight-1 && stitch_row<Eval,Patch>(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight); + } + const unsigned ofs = st*dwidth+sl; + Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight); + } + } +} + diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h new file mode 100644 index 0000000000..edab0db12f --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" + +namespace embree +{ + namespace isa + { + template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex> + struct FeatureAdaptiveEvalSimd + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch; + typedef BezierPatchT<Vertex,Vertex_t> BezierPatch; + typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch; + typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch; + typedef BezierCurveT<Vertex> BezierCurve; + + FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; + case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; +#if 
PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT<Vertex,Vertex_t>(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + eval_direct(valid,patch,Vec2<vfloat>(u,v),0); + break; + } + } + } + + FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + eval_direct(valid,patch,Vec2<vfloat>(u,v),dscale,depth); + } + + template<size_t N> + __forceinline void eval_quad_direct(const vbool& valid, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) + { + const vfloat u = uv.x, v = uv.y; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); + } + + template<size_t N> + __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) + { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); +#endif + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + const vfloat u = uv.x, v = uv.y; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; +#if PATCH_USE_GREGORY == 2 + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l); +#else + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1); + if 
(any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); +#endif + } + + __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2<vfloat>& uv, float dscale, size_t depth, + BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) + { + typename CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } else { + IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } +#endif + else + { + array_t<CatmullClarkPatch,4> patches; + patch.subdivide(patches); // FIXME: only have to generate one of the patches + eval_quad_direct(valid,patches,uv,dscale,depth); + } + } + + void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2<vfloat>& uv, const size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) { + CatmullClarkPatch qpatch; patch.init(qpatch); + return eval_direct(valid,qpatch,uv,1.0f,depth); + } + + /* subdivide patch */ + unsigned Nc; + array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; + patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches + + /* parametrization for quads */ + if (Nc == 4) + eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth); + + /* parametrization for arbitrary polygons */ + else + { + const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; + const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; + const vint i = (h<<2)+l; assert(all(valid,i<Nc)); + foreach_unique(valid,i,[&](const vbool& valid, const int i) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,i); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1); +#endif + }); + } + } + + private: + float* const P; + float* const 
dPdu; + float* const dPdv; + float* const ddPdudu; + float* const ddPdvdv; + float* const ddPdudv; + const size_t dstride; + const size_t N; + }; + } +} diff --git a/thirdparty/embree/kernels/subdiv/gregory_patch.h b/thirdparty/embree/kernels/subdiv/gregory_patch.h new file mode 100644 index 0000000000..9026d5c407 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/gregory_patch.h @@ -0,0 +1,893 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_patch.h" +#include "bezier_curve.h" +#include "catmullclark_coefficients.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) GregoryPatchT + { + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + typedef BezierCurveT<Vertex> BezierCurve; + + public: + Vertex v[4][4]; + Vertex f[2][2]; + + __forceinline GregoryPatchT() {} + + __forceinline GregoryPatchT(const CatmullClarkPatch& patch) { + init(patch); + } + + __forceinline GregoryPatchT(const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + { + init_crackfix(patch,border0,border1,border2,border3); + } + + __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(CatmullClarkPatch(edge,vertices,stride)); + } + + __forceinline Vertex& p0() { return v[0][0]; } + __forceinline Vertex& p1() { return v[0][3]; } + __forceinline Vertex& p2() { return v[3][3]; } + __forceinline Vertex& p3() { return v[3][0]; } + + __forceinline Vertex& e0_p() { return v[0][1]; } + __forceinline Vertex& e0_m() { return v[1][0]; } + __forceinline Vertex& e1_p() { return v[1][3]; } + __forceinline Vertex& e1_m() { return v[0][2]; } + __forceinline Vertex& e2_p() { return v[3][2]; } + __forceinline Vertex& e2_m() { return v[2][3]; } + __forceinline Vertex& e3_p() { return v[2][0]; } + __forceinline Vertex& e3_m() { return v[3][1]; } + + __forceinline Vertex& f0_p() { return v[1][1]; } + __forceinline Vertex& f1_p() { return v[1][2]; } + __forceinline Vertex& f2_p() { return v[2][2]; } + __forceinline Vertex& f3_p() { return v[2][1]; } + __forceinline Vertex& f0_m() { return f[0][0]; } + __forceinline Vertex& f1_m() { return f[0][1]; } + __forceinline Vertex& f2_m() { return f[1][1]; } + __forceinline Vertex& f3_m() { return f[1][0]; } + + __forceinline const Vertex& p0() const { return v[0][0]; } + __forceinline const Vertex& p1() const { return v[0][3]; } + __forceinline const Vertex& p2() const { return v[3][3]; } + __forceinline const Vertex& p3() const { return v[3][0]; } + + __forceinline const Vertex& e0_p() const { return v[0][1]; } + __forceinline const Vertex& e0_m() const { return v[1][0]; } + __forceinline const Vertex& e1_p() const { return v[1][3]; } + __forceinline const Vertex& e1_m() const { return v[0][2]; } + __forceinline const Vertex& e2_p() const { return v[3][2]; } + __forceinline const Vertex& e2_m() const { return v[2][3]; } + __forceinline const Vertex& e3_p() const { return v[2][0]; } + __forceinline const Vertex& e3_m() const { return v[3][1]; } + + __forceinline const Vertex& f0_p() const { return v[1][1]; } + __forceinline const Vertex& f1_p() const { return v[1][2]; } + __forceinline const Vertex& f2_p() const { return v[2][2]; } + __forceinline const Vertex& 
f3_p() const { return v[2][1]; } + __forceinline const Vertex& f0_m() const { return f[0][0]; } + __forceinline const Vertex& f1_m() const { return f[0][1]; } + __forceinline const Vertex& f2_m() const { return f[1][1]; } + __forceinline const Vertex& f3_m() const { return f[1][0]; } + + __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) { + return irreg_patch.ring[index].getLimitVertex(); + } + + __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { + return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx); + } + + __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { + return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx); + } + + __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) + { + CatmullClark1Ring3fa r0,r1,r2; + irreg_patch.ring[index].subdivide(r0); + r0.subdivide(r1); + r1.subdivide(r2); + return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx); + } + + __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) + { + CatmullClark1Ring3fa r0,r1,r2; + irreg_patch.ring[index].subdivide(r0); + r0.subdivide(r1); + r1.subdivide(r2); + return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx); + } + + void initFaceVertex(const CatmullClarkPatch& irreg_patch, + const size_t index, + const Vertex& p_vtx, + const Vertex& e0_p_vtx, + const Vertex& e1_m_vtx, + const unsigned int face_valence_p1, + const Vertex& e0_m_vtx, + const Vertex& e3_p_vtx, + const unsigned int face_valence_p3, + Vertex& f_p_vtx, + Vertex& f_m_vtx) + { + const unsigned int face_valence = irreg_patch.ring[index].face_valence; + const unsigned int edge_valence = irreg_patch.ring[index].edge_valence; + const unsigned int border_index = irreg_patch.ring[index].border_index; + + const Vertex& vtx = irreg_patch.ring[index].vtx; + const Vertex e_i = irreg_patch.ring[index].getEdgeCenter(0); + const Vertex c_i_m_1 = irreg_patch.ring[index].getQuadCenter(0); + const Vertex e_i_m_1 = irreg_patch.ring[index].getEdgeCenter(1); + + Vertex c_i, e_i_p_1; + const bool hasHardEdge0 = + std::isinf(irreg_patch.ring[index].vertex_crease_weight) && + std::isinf(irreg_patch.ring[index].crease_weight[0]); + + if (unlikely((border_index == edge_valence-2) || hasHardEdge0)) + { + /* mirror quad center and edge mid-point */ + c_i = madd(2.0f, e_i - c_i_m_1, c_i_m_1); + e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1); + } + else + { + c_i = irreg_patch.ring[index].getQuadCenter( face_valence-1 ); + e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 ); + } + + Vertex c_i_m_2, e_i_m_2; + const bool hasHardEdge1 = + std::isinf(irreg_patch.ring[index].vertex_crease_weight) && + std::isinf(irreg_patch.ring[index].crease_weight[1]); + + if (unlikely(border_index == 2 || hasHardEdge1)) + { + /* mirror quad center and edge mid-point */ + c_i_m_2 = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1); + e_i_m_2 = madd(2.0f, vtx - e_i, + e_i); + } + else + { + c_i_m_2 = irreg_patch.ring[index].getQuadCenter( 1 ); + e_i_m_2 = irreg_patch.ring[index].getEdgeCenter( 2 ); + } + + const float d = 3.0f; + //const float c = cosf(2.0f*M_PI/(float)face_valence); + //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1); + //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3); + + const float c = 
CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); + const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); + const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); + + const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i); + const Vertex r_e_m = 1.0f/3.0f * (e_i - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2); + + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + } + + __noinline void init(const CatmullClarkPatch& patch) + { + assert( patch.ring[0].hasValidPositions() ); + assert( patch.ring[1].hasValidPositions() ); + assert( patch.ring[2].hasValidPositions() ); + assert( patch.ring[3].hasValidPositions() ); + + p0() = initCornerVertex(patch,0); + p1() = initCornerVertex(patch,1); + p2() = initCornerVertex(patch,2); + p3() = initCornerVertex(patch,3); + + e0_p() = initPositiveEdgeVertex(patch,0, p0()); + e1_p() = initPositiveEdgeVertex(patch,1, p1()); + e2_p() = initPositiveEdgeVertex(patch,2, p2()); + e3_p() = initPositiveEdgeVertex(patch,3, p3()); + + e0_m() = initNegativeEdgeVertex(patch,0, p0()); + e1_m() = initNegativeEdgeVertex(patch,1, p1()); + e2_m() = initNegativeEdgeVertex(patch,2, p2()); + e3_m() = initNegativeEdgeVertex(patch,3, p3()); + + const unsigned int face_valence_p0 = patch.ring[0].face_valence; + const unsigned int face_valence_p1 = patch.ring[1].face_valence; + const unsigned int face_valence_p2 = patch.ring[2].face_valence; + const unsigned int face_valence_p3 = patch.ring[3].face_valence; + + initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); + initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); + initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); + initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); + + } + + __noinline void init_crackfix(const CatmullClarkPatch& patch, + const BezierCurve* border0, + const BezierCurve* border1, + const BezierCurve* border2, + const BezierCurve* border3) + { + assert( patch.ring[0].hasValidPositions() ); + assert( patch.ring[1].hasValidPositions() ); + assert( patch.ring[2].hasValidPositions() ); + assert( patch.ring[3].hasValidPositions() ); + + p0() = initCornerVertex(patch,0); + p1() = initCornerVertex(patch,1); + p2() = initCornerVertex(patch,2); + p3() = initCornerVertex(patch,3); + + e0_p() = initPositiveEdgeVertex(patch,0, p0()); + e1_p() = initPositiveEdgeVertex(patch,1, p1()); + e2_p() = initPositiveEdgeVertex(patch,2, p2()); + e3_p() = initPositiveEdgeVertex(patch,3, p3()); + + e0_m() = initNegativeEdgeVertex(patch,0, p0()); + e1_m() = initNegativeEdgeVertex(patch,1, p1()); + e2_m() = initNegativeEdgeVertex(patch,2, p2()); + e3_m() = initNegativeEdgeVertex(patch,3, p3()); + + if (unlikely(border0 != nullptr)) + { + p0() = border0->v0; + e0_p() = border0->v1; + e1_m() = border0->v2; + p1() = border0->v3; + } + + if (unlikely(border1 != nullptr)) + { + p1() = border1->v0; + e1_p() = border1->v1; + e2_m() = border1->v2; + p2() = border1->v3; + } + + if (unlikely(border2 != nullptr)) + { + p2() = border2->v0; + e2_p() = border2->v1; + e3_m() = border2->v2; + p3() = border2->v3; + } + + if (unlikely(border3 != nullptr)) + { + p3() = 
border3->v0; + e3_p() = border3->v1; + e0_m() = border3->v2; + p0() = border3->v3; + } + + const unsigned int face_valence_p0 = patch.ring[0].face_valence; + const unsigned int face_valence_p1 = patch.ring[1].face_valence; + const unsigned int face_valence_p2 = patch.ring[2].face_valence; + const unsigned int face_valence_p3 = patch.ring[3].face_valence; + + initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); + initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); + initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); + initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); + } + + + void computeGregoryPatchFacePoints(const unsigned int face_valence, + const Vertex& r_e_p, + const Vertex& r_e_m, + const Vertex& p_vtx, + const Vertex& e0_p_vtx, + const Vertex& e1_m_vtx, + const unsigned int face_valence_p1, + const Vertex& e0_m_vtx, + const Vertex& e3_p_vtx, + const unsigned int face_valence_p3, + Vertex& f_p_vtx, + Vertex& f_m_vtx, + const float d = 3.0f) + { + //const float c = cosf(2.0*M_PI/(float)face_valence); + //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1); + //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3); + + const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); + const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); + const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); + + + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + } + + __noinline void init(const GeneralCatmullClarkPatch& patch) + { + assert(patch.size() == 4); +#if 0 + CatmullClarkPatch qpatch; patch.init(qpatch); + init(qpatch); +#else + const float face_valence_p0 = patch.ring[0].face_valence; + const float face_valence_p1 = patch.ring[1].face_valence; + const float face_valence_p2 = patch.ring[2].face_valence; + const float face_valence_p3 = patch.ring[3].face_valence; + + Vertex p0_r_p, p0_r_m; + patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m ); + + Vertex p1_r_p, p1_r_m; + patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m ); + + Vertex p2_r_p, p2_r_m; + patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), p2_r_p, p2_r_m ); + + Vertex p3_r_p, p3_r_m; + patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m ); + + computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() ); + computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() ); + computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() ); + computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p3, f3_p(), f3_m() ); + 
+#endif + } + + + __forceinline void convert_to_bezier() + { + f0_p() = (f0_p() + f0_m()) * 0.5f; + f1_p() = (f1_p() + f1_m()) * 0.5f; + f2_p() = (f2_p() + f2_m()) * 0.5f; + f3_p() = (f3_p() + f3_m()) * 0.5f; + f0_m() = Vertex( zero ); + f1_m() = Vertex( zero ); + f2_m() = Vertex( zero ); + f3_m() = Vertex( zero ); + } + + static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv, + Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21) + { + if (unlikely(uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f)) + { + matrix_11 = matrix[1][1]; + matrix_12 = matrix[1][2]; + matrix_22 = matrix[2][2]; + matrix_21 = matrix[2][1]; + } + else + { + const Vertex_t f0_p = matrix[1][1]; + const Vertex_t f1_p = matrix[1][2]; + const Vertex_t f2_p = matrix[2][2]; + const Vertex_t f3_p = matrix[2][1]; + + const Vertex_t f0_m = f_m[0][0]; + const Vertex_t f1_m = f_m[0][1]; + const Vertex_t f2_m = f_m[1][1]; + const Vertex_t f3_m = f_m[1][0]; + + matrix_11 = ( uu * f0_p + vv * f0_m)*rcp(uu+vv); + matrix_12 = ((1.0f-uu) * f1_m + vv * f1_p)*rcp(1.0f-uu+vv); + matrix_22 = ((1.0f-uu) * f2_p + (1.0f-vv) * f2_m)*rcp(2.0f-uu-vv); + matrix_21 = ( uu * f3_m + (1.0f-vv) * f3_p)*rcp(1.0f+uu-vv); + } + } + + template<typename vfloat> + static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], + size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) + { + const auto m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + + const vfloat f0_p = v[1][1][i]; + const vfloat f1_p = v[1][2][i]; + const vfloat f2_p = v[2][2][i]; + const vfloat f3_p = v[2][1][i]; + + const vfloat f0_m = f[0][0][i]; + const vfloat f1_m = f[0][1][i]; + const vfloat f2_m = f[1][1][i]; + const vfloat f3_m = f[1][0][i]; + + const vfloat one_minus_uu = vfloat(1.0f) - uu; + const vfloat one_minus_vv = vfloat(1.0f) - vv; + + const vfloat f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const vfloat f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const vfloat f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const vfloat f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + + matrix_11 = select(m_border,f0_p,f0_i); + matrix_12 = select(m_border,f1_p,f1_i); + matrix_22 = select(m_border,f2_p,f2_i); + matrix_21 = select(m_border,f3_p,f3_i); + } + + static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::eval(uu); + const Vec4<float> Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_du(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::derivative(uu); + const Vec4<float> 
Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::eval(uu); + const Vec4<float> Bv = BezierBasis::derivative(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::derivative2(uu); + const Vec4<float> Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::eval(uu); + const Vec4<float> Bv = BezierBasis::derivative2(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::derivative(uu); + const Vec4<float> Bv = BezierBasis::derivative(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + __forceinline Vertex eval(const float uu, const float vv) const { + 
return eval(v,f,uu,vv); + } + + __forceinline Vertex eval_du( const float uu, const float vv) const { + return eval_du(v,f,uu,vv); + } + + __forceinline Vertex eval_dv( const float uu, const float vv) const { + return eval_dv(v,f,uu,vv); + } + + __forceinline Vertex eval_dudu( const float uu, const float vv) const { + return eval_dudu(v,f,uu,vv); + } + + __forceinline Vertex eval_dvdv( const float uu, const float vv) const { + return eval_dvdv(v,f,uu,vv); + } + + __forceinline Vertex eval_dudv( const float uu, const float vv) const { + return eval_dudv(v,f,uu,vv); + } + + static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv) // FIXME: why not using basis functions + { + /* interpolate inner vertices */ + Vertex_t matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(matrix,f_m,uu,vv,matrix_11, matrix_12, matrix_22, matrix_21); + + /* tangentU */ + const Vertex_t col0 = deCasteljau(vv, (Vertex_t)matrix[0][0], (Vertex_t)matrix[1][0], (Vertex_t)matrix[2][0], (Vertex_t)matrix[3][0]); + const Vertex_t col1 = deCasteljau(vv, (Vertex_t)matrix[0][1], (Vertex_t)matrix_11 , (Vertex_t)matrix_21 , (Vertex_t)matrix[3][1]); + const Vertex_t col2 = deCasteljau(vv, (Vertex_t)matrix[0][2], (Vertex_t)matrix_12 , (Vertex_t)matrix_22 , (Vertex_t)matrix[3][2]); + const Vertex_t col3 = deCasteljau(vv, (Vertex_t)matrix[0][3], (Vertex_t)matrix[1][3], (Vertex_t)matrix[2][3], (Vertex_t)matrix[3][3]); + + const Vertex_t tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vertex_t row0 = deCasteljau(uu, (Vertex_t)matrix[0][0], (Vertex_t)matrix[0][1], (Vertex_t)matrix[0][2], (Vertex_t)matrix[0][3]); + const Vertex_t row1 = deCasteljau(uu, (Vertex_t)matrix[1][0], (Vertex_t)matrix_11 , (Vertex_t)matrix_12 , (Vertex_t)matrix[1][3]); + const Vertex_t row2 = deCasteljau(uu, (Vertex_t)matrix[2][0], (Vertex_t)matrix_21 , (Vertex_t)matrix_22 , (Vertex_t)matrix[2][3]); + const Vertex_t row3 = deCasteljau(uu, (Vertex_t)matrix[3][0], (Vertex_t)matrix[3][1], (Vertex_t)matrix[3][2], (Vertex_t)matrix[3][3]); + + const Vertex_t tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vertex_t n = cross(tangentU,tangentV); + + return n; + } + + __forceinline Vertex normal( const float uu, const float vv) const { + return normal(v,f,uu,vv); + } + + __forceinline void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, + Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], + const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n, + vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) + { + const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); + const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i])))); + const vfloat curve2_x = 
madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i])))); + const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); + return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + } + + template<typename vbool, typename vfloat> + static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], + const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) + { + if (P) { + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,P+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)); + } + } + if (dPdu) + { + { + assert(dPdu); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,dPdu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale); + } + } + { + assert(dPdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,dPdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale); + } + } + } + if (ddPdudu) + { + { + assert(ddPdudu); + const Vec4<vfloat> u_n = BezierBasis::derivative2(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,ddPdudu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale)); + } + } + { + assert(ddPdvdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,ddPdvdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale)); + } + } + { + assert(ddPdudv); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,ddPdudv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale)); + } + } + } + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+ float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const { + eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } + + template<class T> + static __forceinline Vec3<T> eval_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) + { + typedef typename T::Bool M; + const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + + const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + + const Vec3<T> f0_m = f[0][0]; + const Vec3<T> f1_m = f[0][1]; + const Vec3<T> f2_m = f[1][1]; + const Vec3<T> f3_m = f[1][0]; + + const T one_minus_uu = T(1.0f) - uu; + const T one_minus_vv = T(1.0f) - vv; + + const Vec3<T> f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const Vec3<T> f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const Vec3<T> f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + + const Vec3<T> F0( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); + const Vec3<T> F1( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); + const Vec3<T> F2( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); + const Vec3<T> F3( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); + + const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; + const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; + const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); + const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); + const T B2_u = 3.0f * (uu * one_minus_uu * uu); + const T B2_v = 3.0f * (vv * one_minus_vv * vv); + const T B3_u = uu * uu * uu; + const T B3_v = vv * vv * vv; + + const T x = madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u * matrix[0][3].x))), + madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,F0.x ,madd(B2_u,F1.x ,B3_u * matrix[1][3].x))), + madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,F3.x ,madd(B2_u,F2.x ,B3_u * matrix[2][3].x))), + B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u * matrix[3][3].x)))))); + + const T y = madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u * matrix[0][3].y))), + madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,F0.y ,madd(B2_u,F1.y ,B3_u * matrix[1][3].y))), + madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,F3.y ,madd(B2_u,F2.y ,B3_u * matrix[2][3].y))), + B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u * matrix[3][3].y)))))); + + const T z = madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u * matrix[0][3].z))), + madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,F0.z ,madd(B2_u,F1.z ,B3_u * matrix[1][3].z))), + madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,F3.z ,madd(B2_u,F2.z ,B3_u * matrix[2][3].z))), + B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u * matrix[3][3].z)))))); + + return Vec3<T>(x,y,z); + } + + template<class T> 
+ __forceinline Vec3<T> eval(const T& uu, const T& vv) const + { + Vec3<T> ff[2][2]; + ff[0][0] = Vec3<T>(f[0][0]); + ff[0][1] = Vec3<T>(f[0][1]); + ff[1][1] = Vec3<T>(f[1][1]); + ff[1][0] = Vec3<T>(f[1][0]); + return eval_t(v,ff,uu,vv); + } + + template<class T> + static __forceinline Vec3<T> normal_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) + { + typedef typename T::Bool M; + + const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + + const Vec3<T> f0_m = f[0][0]; + const Vec3<T> f1_m = f[0][1]; + const Vec3<T> f2_m = f[1][1]; + const Vec3<T> f3_m = f[1][0]; + + const T one_minus_uu = T(1.0f) - uu; + const T one_minus_vv = T(1.0f) - vv; + + const Vec3<T> f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const Vec3<T> f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const Vec3<T> f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + +#if 1 + const M m_corner0 = (uu == 0.0f) & (vv == 0.0f); + const M m_corner1 = (uu == 1.0f) & (vv == 0.0f); + const M m_corner2 = (uu == 1.0f) & (vv == 1.0f); + const M m_corner3 = (uu == 0.0f) & (vv == 1.0f); + const Vec3<T> matrix_11( select(m_corner0,f0_p.x,f0_i.x), select(m_corner0,f0_p.y,f0_i.y), select(m_corner0,f0_p.z,f0_i.z) ); + const Vec3<T> matrix_12( select(m_corner1,f1_p.x,f1_i.x), select(m_corner1,f1_p.y,f1_i.y), select(m_corner1,f1_p.z,f1_i.z) ); + const Vec3<T> matrix_22( select(m_corner2,f2_p.x,f2_i.x), select(m_corner2,f2_p.y,f2_i.y), select(m_corner2,f2_p.z,f2_i.z) ); + const Vec3<T> matrix_21( select(m_corner3,f3_p.x,f3_i.x), select(m_corner3,f3_p.y,f3_i.y), select(m_corner3,f3_p.z,f3_i.z) ); +#else + const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + const Vec3<T> matrix_11( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); + const Vec3<T> matrix_12( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); + const Vec3<T> matrix_22( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); + const Vec3<T> matrix_21( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); +#endif + + const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); + const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); + const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); + const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); + + const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); + const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); + const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); + + const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); + const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); + const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); + + const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); + const Vec3<T> matrix_23 = 
Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); + + /* tangentU */ + const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); + const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); + const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); + const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); + + const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); + const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); + const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); + const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); + + const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vec3<T> n = cross(tangentU,tangentV); + return n; + } + + template<class T> + __forceinline Vec3<T> normal(const T& uu, const T& vv) const + { + Vec3<T> ff[2][2]; + ff[0][0] = Vec3<T>(f[0][0]); + ff[0][1] = Vec3<T>(f[0][1]); + ff[1][1] = Vec3<T>(f[1][1]); + ff[1][0] = Vec3<T>(f[1][0]); + return normal_t(v,ff,uu,vv); + } + + __forceinline BBox<Vertex> bounds() const + { + const Vertex *const cv = &v[0][0]; + BBox<Vertex> bounds (cv[0]); + for (size_t i=1; i<16; i++) + bounds.extend( cv[i] ); + bounds.extend(f[0][0]); + bounds.extend(f[1][0]); + bounds.extend(f[1][1]); + bounds.extend(f[1][1]); + return bounds; + } + + friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + o << "v[" << y << "][" << x << "] " << p.v[y][x] << embree_endl; + + for (size_t y=0; y<2; y++) + for (size_t x=0; x<2; x++) + o << "f[" << y << "][" << x << "] " << p.f[y][x] << embree_endl; + return o; + } + }; + + typedef GregoryPatchT<Vec3fa,Vec3fa_t> GregoryPatch3fa; + + template<typename Vertex, typename Vertex_t> + __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) + { + CatmullClarkPatchT<Vertex,Vertex_t> patch(edge,vertices,stride); + GregoryPatchT<Vertex,Vertex_t> gpatch(patch); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } + + template<typename Vertex, typename Vertex_t> + __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch) + { + GregoryPatchT<Vertex,Vertex_t> gpatch(patch); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } + + template<typename Vertex, typename Vertex_t> + __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, + const BezierCurveT<Vertex>* border0, + const BezierCurveT<Vertex>* border1, + const BezierCurveT<Vertex>* border2, + const BezierCurveT<Vertex>* border3) + { + GregoryPatchT<Vertex,Vertex_t> gpatch(patch,border0,border1,border2,border3); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } +} diff --git a/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h new file mode 100644 index 0000000000..4cf9a7e98f --- /dev/null +++ 
b/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h @@ -0,0 +1,113 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gregory_patch.h" + +namespace embree +{ + class __aligned(64) DenseGregoryPatch3fa + { + typedef Vec3fa Vec3fa_4x4[4][4]; + public: + + __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f); + + matrix[0][0].w = patch.f[0][0].x; + matrix[0][1].w = patch.f[0][0].y; + matrix[0][2].w = patch.f[0][0].z; + matrix[0][3].w = 0.0f; + + matrix[1][0].w = patch.f[0][1].x; + matrix[1][1].w = patch.f[0][1].y; + matrix[1][2].w = patch.f[0][1].z; + matrix[1][3].w = 0.0f; + + matrix[2][0].w = patch.f[1][1].x; + matrix[2][1].w = patch.f[1][1].y; + matrix[2][2].w = patch.f[1][1].z; + matrix[2][3].w = 0.0f; + + matrix[3][0].w = patch.f[1][0].x; + matrix[3][1].w = patch.f[1][0].y; + matrix[3][2].w = patch.f[1][0].z; + matrix[3][3].w = 0.0f; + } + + __forceinline void extract_f_m(Vec3fa f_m[2][2]) const + { + f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3fa( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + } + + __forceinline Vec3fa eval(const float uu, const float vv) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + __forceinline Vec3fa normal(const float uu, const float vv) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + template<class T> + __forceinline Vec3<T> eval(const T &uu, const T &vv) const + { + Vec3<T> f_m[2][2]; + f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + template<class T> + __forceinline Vec3<T> normal(const T &uu, const T &vv) const + { + Vec3<T> f_m[2][2]; + f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + __forceinline void eval(const float u, const float v, + Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv, + const float dscale = 1.0f) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + if (P) { + *P = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; + assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = 
GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + } + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N); + } + + private: + Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component + }; +} diff --git a/thirdparty/embree/kernels/subdiv/gridrange.h b/thirdparty/embree/kernels/subdiv/gridrange.h new file mode 100644 index 0000000000..4f2b90d7bd --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/gridrange.h @@ -0,0 +1,96 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" + +namespace embree +{ + struct __aligned(16) GridRange + { + unsigned int u_start; + unsigned int u_end; + unsigned int v_start; + unsigned int v_end; + + __forceinline GridRange() {} + + __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) + : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {} + + __forceinline unsigned int width() const { + return u_end-u_start+1; + } + + __forceinline unsigned int height() const { + return v_end-v_start+1; + } + + __forceinline bool hasLeafSize() const + { + const unsigned int u_size = u_end-u_start+1; + const unsigned int v_size = v_end-v_start+1; + assert(u_size >= 1); + assert(v_size >= 1); + return u_size <= 3 && v_size <= 3; + } + + static __forceinline unsigned int split(unsigned int start,unsigned int end) + { + const unsigned int center = (start+end)/2; + assert (center > start); + assert (center < end); + return center; + } + + __forceinline void split(GridRange& r0, GridRange& r1) const + { + assert( hasLeafSize() == false ); + const unsigned int u_size = u_end-u_start+1; + const unsigned int v_size = v_end-v_start+1; + r0 = *this; + r1 = *this; + + if (u_size >= v_size) + { + const unsigned int u_mid = split(u_start,u_end); + r0.u_end = u_mid; + r1.u_start = u_mid; + } + else + { + const unsigned int v_mid = split(v_start,v_end); + r0.v_end = v_mid; + r1.v_start = v_mid; + } + } + + __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const + { + assert( !hasLeafSize() ); + unsigned int children = 0; + GridRange first,second; + split(first,second); + + if (first.hasLeafSize()) { + r[0] = first; + children++; + } + else { + first.split(r[0],r[1]); + children += 2; + } + + if (second.hasLeafSize()) { + r[children] = second; + children++; + } + else { + second.split(r[children+0],r[children+1]); + children += 2; + } + return children; + } + }; +} diff --git a/thirdparty/embree/kernels/subdiv/half_edge.h b/thirdparty/embree/kernels/subdiv/half_edge.h new file mode 100644 index 0000000000..baf019cd79 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/half_edge.h @@ -0,0 +1,371 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_coefficients.h" + +namespace embree +{ + class __aligned(32) HalfEdge + { + friend class SubdivMesh; + public: + + enum PatchType : char { + BILINEAR_PATCH = 0, //!< a bilinear patch + REGULAR_QUAD_PATCH = 1, //!< a regular quad patch can be represented as a B-Spline + IRREGULAR_QUAD_PATCH = 2, //!< an irregular quad patch can be represented as a Gregory patch + COMPLEX_PATCH = 3 //!< 
these patches need subdivision and cannot be processed by the above fast code paths + }; + + enum VertexType : char { + REGULAR_VERTEX = 0, //!< regular vertex + NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a non-manifold edge + }; + + __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) { + return (PatchType) max((int)ty0,(int)ty1); + } + + struct Edge + { + /*! edge constructor */ + __forceinline Edge(const uint32_t v0, const uint32_t v1) + : v0(v0), v1(v1) {} + + /*! create an 64 bit identifier that is unique for the not oriented edge */ + __forceinline operator uint64_t() const + { + uint32_t p0 = v0, p1 = v1; + if (p0<p1) std::swap(p0,p1); + return (((uint64_t)p0) << 32) | (uint64_t)p1; + } + + public: + uint32_t v0,v1; //!< start and end vertex of the edge + }; + + HalfEdge () + : vtx_index(-1), next_half_edge_ofs(0), prev_half_edge_ofs(0), opposite_half_edge_ofs(0), edge_crease_weight(0), + vertex_crease_weight(0), edge_level(0), patch_type(COMPLEX_PATCH), vertex_type(REGULAR_VERTEX) + { + static_assert(sizeof(HalfEdge) == 32, "invalid half edge size"); + } + + __forceinline bool hasOpposite() const { return opposite_half_edge_ofs != 0; } + __forceinline void setOpposite(HalfEdge* opposite) { opposite_half_edge_ofs = int(opposite-this); } + + __forceinline HalfEdge* next() { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; } + __forceinline const HalfEdge* next() const { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; } + + __forceinline HalfEdge* prev() { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; } + __forceinline const HalfEdge* prev() const { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; } + + __forceinline HalfEdge* opposite() { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; } + __forceinline const HalfEdge* opposite() const { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; } + + __forceinline HalfEdge* rotate() { return opposite()->next(); } + __forceinline const HalfEdge* rotate() const { return opposite()->next(); } + + __forceinline unsigned int getStartVertexIndex() const { return vtx_index; } + __forceinline unsigned int getEndVertexIndex () const { return next()->vtx_index; } + __forceinline Edge getEdge () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); } + + + /*! 
tests if the start vertex of the edge is regular */ + __forceinline PatchType vertexType() const + { + const HalfEdge* p = this; + size_t face_valence = 0; + bool hasBorder = false; + + do + { + /* we need subdivision to handle edge creases */ + if (p->hasOpposite() && p->edge_crease_weight > 0.0f) + return COMPLEX_PATCH; + + face_valence++; + + /* test for quad */ + const HalfEdge* pp = p; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp != p) return COMPLEX_PATCH; + + /* continue with next face */ + p = p->prev(); + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + face_valence++; + hasBorder = true; + p = this; + while (p->hasOpposite()) + p = p->rotate(); + } + } while (p != this); + + /* calculate vertex type */ + if (face_valence == 2 && hasBorder) { + if (vertex_crease_weight == 0.0f ) return REGULAR_QUAD_PATCH; + else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH; + else return COMPLEX_PATCH; + } + else if (vertex_crease_weight != 0.0f) return COMPLEX_PATCH; + else if (face_valence == 3 && hasBorder) return REGULAR_QUAD_PATCH; + else if (face_valence == 4 && !hasBorder) return REGULAR_QUAD_PATCH; + else return IRREGULAR_QUAD_PATCH; + } + + /*! tests if this edge is part of a bilinear patch */ + __forceinline bool bilinearVertex() const { + return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf); + } + + /*! calculates the type of the patch */ + __forceinline PatchType patchType() const + { + const HalfEdge* p = this; + PatchType ret = REGULAR_QUAD_PATCH; + bool bilinear = true; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) != this) return COMPLEX_PATCH; + + if (bilinear) return BILINEAR_PATCH; + return ret; + } + + /*! tests if the face is a regular b-spline face */ + __forceinline bool isRegularFace() const { + return patch_type == REGULAR_QUAD_PATCH; + } + + /*! tests if the face can be diced (using bspline or gregory patch) */ + __forceinline bool isGregoryFace() const { + return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH; + } + + /*! tests if the base vertex of this half edge is a corner vertex */ + __forceinline bool isCorner() const { + return !hasOpposite() && !prev()->hasOpposite(); + } + + /*! tests if the vertex is attached to any border */ + __forceinline bool vertexHasBorder() const + { + const HalfEdge* p = this; + do { + if (!p->hasOpposite()) return true; + p = p->rotate(); + } while (p != this); + return false; + } + + /*! tests if the face this half edge belongs to has some border */ + __forceinline bool faceHasBorder() const + { + const HalfEdge* p = this; + do { + if (p->vertexHasBorder() && (p->vertex_type != HalfEdge::NON_MANIFOLD_EDGE_VERTEX)) return true; + p = p->next(); + } while (p != this); + return false; + } + + /*! 
calculates conservative bounds of a catmull clark subdivision face */ + __forceinline BBox3fa bounds(const BufferView<Vec3fa>& vertices) const + { + BBox3fa bounds = this->get1RingBounds(vertices); + for (const HalfEdge* p=this->next(); p!=this; p=p->next()) + bounds.extend(p->get1RingBounds(vertices)); + return bounds; + } + + /*! tests if this is a valid patch */ + __forceinline bool valid(const BufferView<Vec3fa>& vertices) const + { + size_t N = 1; + if (!this->validRing(vertices)) return false; + for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) { + if (!p->validRing(vertices)) return false; + } + return N >= 3 && N <= MAX_PATCH_VALENCE; + } + + /*! counts number of polygon edges */ + __forceinline unsigned int numEdges() const + { + unsigned int N = 1; + for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++); + return N; + } + + /*! calculates face and edge valence */ + __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const + { + faceValence = 0; + edgeValence = 0; + + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + unsigned int numEdges = p->numEdges(); + assert(numEdges >= 3); + edgeValence += numEdges-2; + + faceValence++; + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + faceValence++; + edgeValence++; + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + } + + /*! stream output */ + friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h) + { + return o << "{ " << + "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << + "prev = " << h.prev_half_edge_ofs << ", " << + "next = " << h.next_half_edge_ofs << ", " << + "opposite = " << h.opposite_half_edge_ofs << ", " << + "edge_crease = " << h.edge_crease_weight << ", " << + "vertex_crease = " << h.vertex_crease_weight << ", " << + //"edge_level = " << h.edge_level << + " }"; + } + + private: + + /*! calculates the bounds of the face associated with the half-edge */ + __forceinline BBox3fa getFaceBounds(const BufferView<Vec3fa>& vertices) const + { + BBox3fa b = vertices[getStartVertexIndex()]; + for (const HalfEdge* p = next(); p!=this; p=p->next()) { + b.extend(vertices[p->getStartVertexIndex()]); + } + return b; + } + + /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */ + __forceinline BBox3fa get1RingBounds(const BufferView<Vec3fa>& vertices) const + { + BBox3fa bounds = empty; + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + bounds.extend(p->getFaceBounds(vertices)); + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + + return bounds; + } + + /*! tests if this is a valid face */ + __forceinline bool validFace(const BufferView<Vec3fa>& vertices, size_t& N) const + { + const Vec3fa v = vertices[getStartVertexIndex()]; + if (!isvalid(v)) return false; + size_t n = 1; + for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) { + const Vec3fa v = vertices[p->getStartVertexIndex()]; + if (!isvalid(v)) return false; + } + N += n-2; + return n >= 3 && n <= MAX_PATCH_VALENCE; + } + + /*! 
tests if this is a valid ring */ + __forceinline bool validRing(const BufferView<Vec3fa>& vertices) const + { + size_t faceValence = 0; + size_t edgeValence = 0; + + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + if (!p->validFace(vertices,edgeValence)) + return false; + + faceValence++; + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + faceValence++; + edgeValence++; + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + + return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE; + } + + private: + unsigned int vtx_index; //!< index of edge start vertex + int next_half_edge_ofs; //!< relative offset to next half edge of face + int prev_half_edge_ofs; //!< relative offset to previous half edge of face + int opposite_half_edge_ofs; //!< relative offset to opposite half edge + + public: + float edge_crease_weight; //!< crease weight attached to edge + float vertex_crease_weight; //!< crease weight attached to start vertex + float edge_level; //!< subdivision factor for edge + PatchType patch_type; //!< stores type of subdiv patch + VertexType vertex_type; //!< stores type of the start vertex + char align[2]; + }; +} diff --git a/thirdparty/embree/kernels/subdiv/hermite_curve.h b/thirdparty/embree/kernels/subdiv/hermite_curve.h new file mode 100644 index 0000000000..ffef5a4315 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/hermite_curve.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "bezier_curve.h" + +namespace embree +{ + template<typename Vertex> + struct HermiteCurveT : BezierCurveT<Vertex> + { + __forceinline HermiteCurveT() {} + + __forceinline HermiteCurveT(const BezierCurveT<Vertex>& curve) + : BezierCurveT<Vertex>(curve) {} + + __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1) + : BezierCurveT<Vertex>(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {} + + __forceinline HermiteCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w); + const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w); + const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w); + const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w); + return BezierCurveT<Vec3ff>(q0,q1,q2,q3); + } + }; + + template<typename CurveGeometry> + __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) { + return HermiteCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT<Vec3ff>(curve))); + } + + typedef HermiteCurveT<Vec3fa> HermiteCurve3fa; +} + diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h new file mode 100644 index 0000000000..f8e8a25f35 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h @@ -0,0 +1,403 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bezier_curve.h" + +namespace embree +{ + namespace isa + { + template<typename V> + struct TensorLinearQuadraticBezierSurface + { + QuadraticBezierCurve<V> L; + 
QuadraticBezierCurve<V> R; + + __forceinline TensorLinearQuadraticBezierSurface() {} + + __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<V>& curve) + : L(curve.L), R(curve.R) {} + + __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { + L = other.L; R = other.R; return *this; + } + + __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<V>& L, const QuadraticBezierCurve<V>& R) + : L(L), R(R) {} + + __forceinline BBox<V> bounds() const { + return merge(L.bounds(),R.bounds()); + } + }; + + template<> + struct TensorLinearQuadraticBezierSurface<Vec2fa> + { + QuadraticBezierCurve<vfloat4> LR; + + __forceinline TensorLinearQuadraticBezierSurface() {} + + __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<Vec2fa>& curve) + : LR(curve.LR) {} + + __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { + LR = other.LR; return *this; + } + + __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<vfloat4>& LR) + : LR(LR) {} + + __forceinline BBox<Vec2fa> bounds() const + { + const BBox<vfloat4> b = LR.bounds(); + const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper)); + const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + return merge(bl,br); + } + }; + + template<typename V> + struct TensorLinearCubicBezierSurface + { + CubicBezierCurve<V> L; + CubicBezierCurve<V> R; + + __forceinline TensorLinearCubicBezierSurface() {} + + __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) + : L(curve.L), R(curve.R) {} + + __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { + L = other.L; R = other.R; return *this; + } + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<V>& L, const CubicBezierCurve<V>& R) + : L(L), R(R) {} + + template<template<typename T> class SourceCurve> + __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve<Vec3ff>& center, const SourceCurve<Vec3fa>& normal) + { + SourceCurve<Vec3ff> vcurve = center; + SourceCurve<Vec3fa> ncurve = normal; + + /* here we construct a patch which follows the curve l(t) = + * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */ + + const Vec3ff p0 = vcurve.eval(0.0f); + const Vec3ff dp0 = vcurve.eval_du(0.0f); + const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); + + const Vec3fa n0 = ncurve.eval(0.0f); + const Vec3fa dn0 = ncurve.eval_du(0.0f); + + const Vec3ff p1 = vcurve.eval(1.0f); + const Vec3ff dp1 = vcurve.eval_du(1.0f); + const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); + + const Vec3fa n1 = ncurve.eval(1.0f); + const Vec3fa dn1 = ncurve.eval_du(1.0f); + + const Vec3fa bt0 = cross(n0,dp0); + const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0); + + const Vec3fa bt1 = cross(n1,dp1); + const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1); + + const Vec3fa k0 = normalize(bt0); + const Vec3fa dk0 = dnormalize(bt0,dbt0); + + const Vec3fa k1 = normalize(bt1); + const Vec3fa dk1 = dnormalize(bt1,dbt1); + + const Vec3fa l0 = p0 - p0.w*k0; + const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0); + + const Vec3fa r0 = p0 + p0.w*k0; + const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0); + + const Vec3fa l1 = p1 - p1.w*k1; + const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1); + + const Vec3fa r1 = p1 + p1.w*k1; + const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1); + + const float 
scale = 1.0f/3.0f; + CubicBezierCurve<V> L(l0,l0+scale*dl0,l1-scale*dl1,l1); + CubicBezierCurve<V> R(r0,r0+scale*dr0,r1-scale*dr1,r1); + return TensorLinearCubicBezierSurface(L,R); + } + + __forceinline BBox<V> bounds() const { + return merge(L.bounds(),R.bounds()); + } + + __forceinline BBox3fa accurateBounds() const { + return merge(L.accurateBounds(),R.accurateBounds()); + } + + __forceinline CubicBezierCurve<Interval1f> reduce_v() const { + return merge(CubicBezierCurve<Interval<V>>(L),CubicBezierCurve<Interval<V>>(R)); + } + + __forceinline LinearBezierCurve<Interval1f> reduce_u() const { + return LinearBezierCurve<Interval1f>(L.bounds(),R.bounds()); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx) const { + return TensorLinearCubicBezierSurface<float>(L.xfm(dx),R.xfm(dx)); + } + + __forceinline TensorLinearCubicBezierSurface<vfloatx> vxfm(const V& dx) const { + return TensorLinearCubicBezierSurface<vfloatx>(L.vxfm(dx),R.vxfm(dx)); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx, const V& p) const { + return TensorLinearCubicBezierSurface<float>(L.xfm(dx,p),R.xfm(dx,p)); + } + + __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space) const { + return TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space)); + } + + __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const { + return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p)); + } + + __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const { + return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s)); + } + + __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { + return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u)); + } + + __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const { + return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper))); + } + + __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { + return clip_v(v).clip_u(u); + } + + __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const + { + CubicBezierCurve<V> L0,L1; L.split(L0,L1,u); + CubicBezierCurve<V> R0,R1; R.split(R0,R1,u); + new (&left ) TensorLinearCubicBezierSurface(L0,R0); + new (&right) TensorLinearCubicBezierSurface(L1,R1); + } + + __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const { + valid = true; clear(valid,VSIZEX-1); + return TensorLinearCubicBezierSurface<Vec2vfx>(L.split(u),R.split(u)); + } + + __forceinline V eval(const float u, const float v) const { + return clerp(L,R,V(v)).eval(u); + } + + __forceinline V eval_du(const float u, const float v) const { + return clerp(L,R,V(v)).eval_dt(u); + } + + __forceinline V eval_dv(const float u, const float v) const { + return (R-L).eval(u); + } + + __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const + { + V p0, dp0du; L.eval(u,p0,dp0du); + V p1, dp1du; R.eval(u,p1,dp1du); + p = lerp(p0,p1,v); + dpdu = lerp(dp0du,dp1du,v); + dpdv = p1-p0; + } + + __forceinline TensorLinearQuadraticBezierSurface<V> derivative_u() const { + return TensorLinearQuadraticBezierSurface<V>(L.derivative(),R.derivative()); + } + + __forceinline CubicBezierCurve<V> derivative_v() const { + return R-L; + } + + __forceinline V 
axis_u() const { + return (L.end()-L.begin())+(R.end()-R.begin()); + } + + __forceinline V axis_v() const { + return (R.begin()-L.begin())+(R.end()-L.end()); + } + + friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) + { + return cout << "TensorLinearCubicBezierSurface" << embree_endl + << "{" << embree_endl + << " L = " << a.L << ", " << embree_endl + << " R = " << a.R << embree_endl + << "}"; + } + + friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) { + return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t))); + } + }; + + template<> + struct TensorLinearCubicBezierSurface<Vec2fa> + { + CubicBezierCurve<vfloat4> LR; + + __forceinline TensorLinearCubicBezierSurface() {} + + __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) + : LR(curve.LR) {} + + __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { + LR = other.LR; return *this; + } + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<vfloat4>& LR) + : LR(LR) {} + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<Vec2fa>& L, const CubicBezierCurve<Vec2fa>& R) + : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {} + + __forceinline CubicBezierCurve<Vec2fa> getL() const { + return CubicBezierCurve<Vec2fa>(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3)); + } + + __forceinline CubicBezierCurve<Vec2fa> getR() const { + return CubicBezierCurve<Vec2fa>(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3))); + } + + __forceinline BBox<Vec2fa> bounds() const + { + const BBox<vfloat4> b = LR.bounds(); + const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper)); + const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + return merge(bl,br); + } + + __forceinline BBox1f bounds(const Vec2fa& axis) const + { + const CubicBezierCurve<vfloat4> LRx = LR; + const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy); + const BBox<vfloat4> Lb = LRa.bounds(); + const BBox<vfloat4> Rb(shuffle<3>(Lb.lower),shuffle<3>(Lb.upper)); + const BBox<vfloat4> b = merge(Lb,Rb); + return BBox1f(b.lower[0],b.upper[0]); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx) const + { + const CubicBezierCurve<vfloat4> LRx = LR; + const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); + return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), + CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx, const Vec2fa& p) const + { + const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p)); + const CubicBezierCurve<vfloat4> LRx = LR-pxyxy; + const CubicBezierCurve<vfloat4> 
LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); + return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), + CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); + } + + __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { + return TensorLinearCubicBezierSurface(LR.clip(u)); + } + + __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const + { + const CubicBezierCurve<vfloat4> LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3)); + const CubicBezierCurve<vfloat4> RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3)); + return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper))); + } + + __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { + return clip_v(v).clip_u(u); + } + + __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const + { + CubicBezierCurve<vfloat4> LR0,LR1; LR.split(LR0,LR1,u); + new (&left ) TensorLinearCubicBezierSurface(LR0); + new (&right) TensorLinearCubicBezierSurface(LR1); + } + + __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const { + valid = true; clear(valid,VSIZEX-1); + return TensorLinearCubicBezierSurface<Vec2vfx>(getL().split(u),getR().split(u)); + } + + __forceinline Vec2fa eval(const float u, const float v) const + { + const vfloat4 p = LR.eval(u); + return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v)); + } + + __forceinline Vec2fa eval_du(const float u, const float v) const + { + const vfloat4 dpdu = LR.eval_dt(u); + return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v)); + } + + __forceinline Vec2fa eval_dv(const float u, const float v) const + { + const vfloat4 p = LR.eval(u); + return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)); + } + + __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const + { + vfloat4 p0, dp0du; LR.eval(u,p0,dp0du); + p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v)); + dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v)); + dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)); + } + + __forceinline TensorLinearQuadraticBezierSurface<Vec2fa> derivative_u() const { + return TensorLinearQuadraticBezierSurface<Vec2fa>(LR.derivative()); + } + + __forceinline CubicBezierCurve<Vec2fa> derivative_v() const { + return getR()-getL(); + } + + __forceinline Vec2fa axis_u() const + { + const CubicBezierCurve<Vec2fa> L = getL(); + const CubicBezierCurve<Vec2fa> R = getR(); + return (L.end()-L.begin())+(R.end()-R.begin()); + } + + __forceinline Vec2fa axis_v() const + { + const CubicBezierCurve<Vec2fa> L = getL(); + const CubicBezierCurve<Vec2fa> R = getR(); + return (R.begin()-L.begin())+(R.end()-L.end()); + } + + friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) + { + return cout << "TensorLinearCubicBezierSurface" << embree_endl + << "{" << embree_endl + << " L = " << a.getL() << ", " << embree_endl + << " R = " << a.getR() << embree_endl + << "}"; + } + }; + + typedef TensorLinearCubicBezierSurface<float> 
TensorLinearCubicBezierSurface1f; + typedef TensorLinearCubicBezierSurface<Vec2fa> TensorLinearCubicBezierSurface2fa; + typedef TensorLinearCubicBezierSurface<Vec3fa> TensorLinearCubicBezierSurface3fa; + } +} diff --git a/thirdparty/embree/kernels/subdiv/patch.h b/thirdparty/embree/kernels/subdiv/patch.h new file mode 100644 index 0000000000..c4340ea9b6 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch.h @@ -0,0 +1,371 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bilinear_patch.h" +#include "bspline_patch.h" +#include "bezier_patch.h" +#include "gregory_patch.h" +#include "tessellation_cache.h" + +#if 1 +#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) +#else +#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) \ + { \ + size_t hex = (size_t)ptr; \ + for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8); \ + const float c = (float)(((hex >> 0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \ + if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f); \ + } +#endif + +#define PATCH_MAX_CACHE_DEPTH 2 +//#define PATCH_MIN_RESOLUTION 1 // FIXME: not yet completely implemented +#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10 // maximum evaluation depth at irregular vertices (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) +#define PATCH_MAX_EVAL_DEPTH_CREASE 10 // maximum evaluation depth at crease features (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) +#define PATCH_USE_GREGORY 1 // 0 = no gregory, 1 = fill, 2 = as early as possible + +#if PATCH_USE_GREGORY==2 +#define PATCH_USE_BEZIER_PATCH 1 // enable use of bezier instead of b-spline patches +#else +#define PATCH_USE_BEZIER_PATCH 0 // enable use of bezier instead of b-spline patches +#endif + +#if PATCH_USE_BEZIER_PATCH +# define RegularPatch BezierPatch +# define RegularPatchT BezierPatchT<Vertex,Vertex_t> +#else +# define RegularPatch BSplinePatch +# define RegularPatchT BSplinePatchT<Vertex,Vertex_t> +#endif + +#if PATCH_USE_GREGORY +#define IrregularFillPatch GregoryPatch +#define IrregularFillPatchT GregoryPatchT<Vertex,Vertex_t> +#else +#define IrregularFillPatch BilinearPatch +#define IrregularFillPatchT BilinearPatchT<Vertex,Vertex_t> +#endif + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + struct __aligned(64) PatchT + { + public: + + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef BezierCurveT<Vertex> BezierCurve; + + enum Type { + INVALID_PATCH = 0, + BILINEAR_PATCH = 1, + BSPLINE_PATCH = 2, + BEZIER_PATCH = 3, + GREGORY_PATCH = 4, + SUBDIVIDED_GENERAL_PATCH = 7, + SUBDIVIDED_QUAD_PATCH = 8, + EVAL_PATCH = 9, + }; + + struct Ref + { + __forceinline Ref(void* p = nullptr) + : ptr((size_t)p) {} + + __forceinline operator bool() const { return ptr != 0; } + __forceinline operator size_t() const { return ptr; } + + __forceinline Ref (Type ty, void* in) + : ptr(((size_t)in)+ty) { assert((((size_t)in) & 0xF) == 0); } + + __forceinline Type type () const { return (Type)(ptr & 0xF); } + __forceinline void* object() const { return (void*) (ptr & ~0xF); } + + size_t ptr; + }; + + struct EvalPatch + { + /* creates EvalPatch from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) + { + size_t ofs = 0, bytes = patch.bytes(); + void* 
ptr = alloc(bytes); + patch.serialize(ptr,ofs); + assert(ofs == bytes); + return Ref(EVAL_PATCH, ptr); + } + }; + + struct BilinearPatch + { + /* creates BilinearPatch from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(patch)); + } + + __forceinline BilinearPatch (const CatmullClarkPatch& patch) + : patch(patch) {} + + /* creates BilinearPatch from 4 vertices */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(edge,vertices,stride)); + } + + __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + public: + BilinearPatchT<Vertex,Vertex_t> patch; + }; + + struct BSplinePatch + { + /* creates BSplinePatch from a half edge */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(edge,vertices,stride)); + } + + __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + /* creates BSplinePatch from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(patch,border0,border1,border2,border3)); + } + + __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + BSplinePatchT<Vertex,Vertex_t> patch; + }; + + struct BezierPatch + { + /* creates BezierPatch from a half edge */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(edge,vertices,stride)); + } + + __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + /* creates Bezier from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(patch,border0,border1,border2,border3)); + } + + __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + BezierPatchT<Vertex,Vertex_t> patch; + }; + + struct GregoryPatch + { + /* creates GregoryPatch from half edge */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const 
char* vertices, size_t stride) { + return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(edge,vertices,stride)); + } + + __forceinline GregoryPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(CatmullClarkPatch(edge,vertices,stride)) {} + + /* creates GregoryPatch from CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(patch,border0,border1,border2,border3)); + } + + __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + GregoryPatchT<Vertex,Vertex_t> patch; + }; + + struct SubdividedQuadPatch + { + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, Ref children[4]) { + return Ref(SUBDIVIDED_QUAD_PATCH, new (alloc(sizeof(SubdividedQuadPatch))) SubdividedQuadPatch(children)); + } + + __forceinline SubdividedQuadPatch(Ref children[4]) { + for (size_t i=0; i<4; i++) child[i] = children[i]; + } + + public: + Ref child[4]; + }; + + struct SubdividedGeneralPatch + { + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) { + return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N)); + } + + __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) { + for (unsigned i=0; i<N; i++) child[i] = children[i]; + } + + unsigned N; + Ref child[MAX_PATCH_VALENCE]; + }; + + /*! Default constructor. 
*/ + __forceinline PatchT () {} + + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) + { + if (PATCH_MAX_CACHE_DEPTH == 0) + return nullptr; + + Ref child(0); + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: child = BilinearPatch::create(alloc,edge,vertices,stride); break; + case HalfEdge::REGULAR_QUAD_PATCH: child = RegularPatch::create(alloc,edge,vertices,stride); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: child = GregoryPatch::create(alloc,edge,vertices,stride); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + child = PatchT::create(alloc,patch,edge,vertices,stride,0); + } + } + return child; + } + + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch qpatch; patch.init(qpatch); + return PatchT::create(alloc,qpatch,edge,vertices,stride,depth); + } + + /* do only cache up to some depth */ + if (depth >= PATCH_MAX_CACHE_DEPTH) + return nullptr; + + /* subdivide patch */ + unsigned N; + array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; + patch.subdivide(patches,N); + + if (N == 4) + { + Ref child[4]; +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r); + child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr); + child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr); + child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l); +#else + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + for (size_t i=0; i<4; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); +#endif + return SubdividedQuadPatch::create(alloc,child); + } + else + { + assert(N<MAX_PATCH_VALENCE); + Ref child[MAX_PATCH_VALENCE]; + +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; + patch.getLimitBorder(borders); + + for (size_t i0=0; i0<N; i0++) { + const size_t i2 = i0==0 ? 
N-1 : i0-1; + BezierCurve border0l,border0r; borders[i0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[i2].subdivide(border2l,border2r); + child[i0] = PatchT::create(alloc,patches[i0],edge,vertices,stride,depth+1, &border0l, nullptr, nullptr, &border2r); + } +#else + for (size_t i=0; i<N; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); +#endif + return SubdividedGeneralPatch::create(alloc,child,N); + } + + return nullptr; + } + + static __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth, + const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr) + { + const typename CatmullClarkPatch::Type ty = patch.type(); + if (unlikely(final(patch,ty,depth))) { + if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3); + else return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3); + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3); + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3); + } +#endif + else if (depth >= PATCH_MAX_CACHE_DEPTH) { + return EvalPatch::create(alloc,patch); + } + + else + { + Ref child[4]; + array_t<CatmullClarkPatch,4> patches; + patch.subdivide(patches); + + for (size_t i=0; i<4; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); + return SubdividedQuadPatch::create(alloc,child); + } + } + }; + + typedef PatchT<Vec3fa,Vec3fa_t> Patch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/patch_eval.h b/thirdparty/embree/kernels/subdiv/patch_eval.h new file mode 100644 index 0000000000..a3fafa72f4 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch_eval.h @@ -0,0 +1,129 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval.h" + +namespace embree +{ + namespace isa + { + template<typename Vertex, typename Vertex_t = Vertex> + struct PatchEval + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, + const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + /* conservative time for the very first allocation */ + auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + + Ref patch = 
SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { + auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; + return Patch::create(alloc,edge,vertices,stride); + },true); + + auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); + + if (patch && allAllocationsValid && eval(patch,u,v,1.0f,0)) { + SharedLazyTessellationCache::unlock(); + return; + } + SharedLazyTessellationCache::unlock(); + FeatureAdaptiveEval<Vertex,Vertex_t>(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); + PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1); + } + + __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth) + { + if (v < 0.5f) { + if (u < 0.5f) return eval(This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); + else return eval(This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); + } else { + if (u > 0.5f) return eval(This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); + else return eval(This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); + } + } + + bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth) + { + const unsigned l = (unsigned) floor(0.5f*U); const float u = 2.0f*frac(0.5f*U)-0.5f; + const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; + const unsigned i = 4*h+l; assert(i<This->N); + return eval(This->child[i],u,v,1.0f,depth+1); + } + + bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) + { + if (!This) return false; + //PRINT(depth); + //PRINT2(u,v); + + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + //PRINT("bilinear"); + ((typename Patch::BilinearPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,c); + return true; + } + case Patch::BSPLINE_PATCH: { + //PRINT("bspline"); + ((typename Patch::BSplinePatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); + return true; + } + case Patch::BEZIER_PATCH: { + //PRINT("bezier"); + ((typename Patch::BezierPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); + return true; + } + case Patch::GREGORY_PATCH: { + //PRINT("gregory"); + ((typename Patch::GregoryPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,-1,c); + return true; + } + case Patch::SUBDIVIDED_QUAD_PATCH: { + //PRINT("subdivided quad"); + return eval_quad(((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); + } + case Patch::SUBDIVIDED_GENERAL_PATCH: { + //PRINT("general_patch"); + assert(dscale == 1.0f); + return eval_general(((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); + } + case Patch::EVAL_PATCH: { + //PRINT("eval_patch"); + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEval<Vertex,Vertex_t>(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); + return true; + } + default: + assert(false); + return false; + } + } + + private: + Vertex* const P; + Vertex* const dPdu; + Vertex* const dPdv; + Vertex* const ddPdudu; + Vertex* const ddPdvdv; + Vertex* const ddPdudv; + }; + } +} + diff --git 
a/thirdparty/embree/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree/kernels/subdiv/patch_eval_grid.h new file mode 100644 index 0000000000..167e1ebe1c --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch_eval_grid.h @@ -0,0 +1,245 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval_grid.h" + +namespace embree +{ + namespace isa + { + struct PatchEvalGrid + { + typedef Patch3fa Patch; + typedef Patch::Ref Ref; + typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch; + typedef CatmullClarkPatch3fa CatmullClarkPatch; + typedef BSplinePatch3fa BSplinePatch; + typedef BezierPatch3fa BezierPatch; + typedef GregoryPatch3fa GregoryPatch; + typedef BilinearPatch3fa BilinearPatch; + + private: + const unsigned x0,x1; + const unsigned y0,y1; + const unsigned swidth,sheight; + const float rcp_swidth, rcp_sheight; + float* const Px; + float* const Py; + float* const Pz; + float* const U; + float* const V; + float* const Nx; + float* const Ny; + float* const Nz; + const unsigned dwidth,dheight; + unsigned count; + + public: + + PatchEvalGrid (Ref patch, unsigned subPatch, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0) + { + assert(swidth < (2<<20) && sheight < (2<<20)); + const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); + const BBox2f erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1)); + bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange); + assert(done); + assert(count == (x1-x0+1)*(y1-y0+1)); + } + + template<typename Patch> + __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) + { + const float scale_x = rcp(srange.upper.x-srange.lower.x); + const float scale_y = rcp(srange.upper.y-srange.lower.y); + count += (lx1-lx0)*(ly1-ly0); + +#if 0 + for (unsigned iy=ly0; iy<ly1; iy++) { + for (unsigned ix=lx0; ix<lx1; ix++) { + const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x); + const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y); + const Vec3fa p = patch->patch.eval(lu,lv); + const float u = float(ix)*rcp_swidth; + const float v = float(iy)*rcp_sheight; + const int ofs = (iy-y0)*dwidth+(ix-x0); + Px[ofs] = p.x; + Py[ofs] = p.y; + Pz[ofs] = p.z; + U[ofs] = u; + V[ofs] = v; + } + } +#else + foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const Vec3vfx p = patch->patch.eval(lu,lv); + Vec3vfx n = zero; + if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); + const vfloatx u = vfloatx(ix)*rcp_swidth; + const vfloatx v = vfloatx(iy)*rcp_sheight; + const vintx ofs = (iy-y0)*dwidth+(ix-x0); + if (likely(all(valid)) && all(iy==iy[0])) { + const unsigned ofs2 = ofs[0]; + 
vfloatx::storeu(Px+ofs2,p.x); + vfloatx::storeu(Py+ofs2,p.y); + vfloatx::storeu(Pz+ofs2,p.z); + vfloatx::storeu(U+ofs2,u); + vfloatx::storeu(V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(Nx+ofs2,n.x); + vfloatx::storeu(Ny+ofs2,n.y); + vfloatx::storeu(Nz+ofs2,n.z); + } + } else { + foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { + const unsigned ofs2 = ofs[j]-j; + vfloatx::storeu(valid,Px+ofs2,p.x); + vfloatx::storeu(valid,Py+ofs2,p.y); + vfloatx::storeu(valid,Pz+ofs2,p.z); + vfloatx::storeu(valid,U+ofs2,u); + vfloatx::storeu(valid,V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(valid,Nx+ofs2,n.x); + vfloatx::storeu(valid,Ny+ofs2,n.y); + vfloatx::storeu(valid,Nz+ofs2,n.z); + } + }); + } + }); +#endif + } + + bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) + { + if (erange.empty()) + return true; + + const int lx0 = (int) ceilf(erange.lower.x); + const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); + const int ly0 = (int) ceilf(erange.lower.y); + const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); + if (lx0 >= lx1 || ly0 >= ly1) + return true; + + if (!This) + return false; + + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::BSPLINE_PATCH: { + evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::BEZIER_PATCH: { + evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::GREGORY_PATCH: { + evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::SUBDIVIDED_QUAD_PATCH: + { + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + + Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object(); + eval(patch->child[0],srange0,intersect(srange0,erange),depth+1); + eval(patch->child[1],srange1,intersect(srange1,erange),depth+1); + eval(patch->child[2],srange2,intersect(srange2,erange),depth+1); + eval(patch->child[3],srange3,intersect(srange3,erange),depth+1); + return true; + } + case Patch::EVAL_PATCH: { + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight); + count += (lx1-lx0)*(ly1-ly0); + return true; + } + default: + assert(false); + return false; + } + } + + bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) + { + if (!This) + return false; + + switch (This.type()) + { + case Patch::SUBDIVIDED_GENERAL_PATCH: { + Patch::SubdividedGeneralPatch* patch = (Patch::SubdividedGeneralPatch*)This.object(); + assert(subPatch < patch->N); + return eval(patch->child[subPatch],srange,erange,1); + } + default: + assert(subPatch == 0); + return eval(This,srange,erange,0); + } + } + }; + + __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h) + { + const unsigned N = h->numEdges(); + if (N == 4) return 1; + else return N; + } + + template<typename Tessellator> + inline void 
patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator) + { + const unsigned N = h->numEdges(); + int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t + float levels[GeneralCatmullClarkPatch3fa::SIZE]; + for (unsigned i=0; i<N; i++) { + assert(i<GeneralCatmullClarkPatch3fa::SIZE); + neighborSubdiv[i] = h->hasOpposite() ? h->opposite()->numEdges() != 4 : 0; + levels[i] = h->edge_level; + h = h->next(); + } + if (N == 4) + { + const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) }; + tessellator(uv,neighborSubdiv,levels,0); + } + else + { + for (unsigned i=0; i<N; i++) + { + assert(i<MAX_PATCH_VALENCE); + static_assert(MAX_PATCH_VALENCE <= 16, "MAX_PATCH_VALENCE > 16"); + const int h = (i >> 2) & 3, l = i & 3; + const Vec2f subPatchID((float)l,(float)h); + const Vec2f uv[4] = { 2.0f*subPatchID + (0.5f+Vec2f(0.0f,0.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(1.0f,0.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(1.0f,1.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(0.0f,1.0f)) }; + const int neighborSubdiv1[4] = { 0,0,0,0 }; + const float levels1[4] = { 0.5f*levels[(i+0)%N], 0.5f*levels[(i+0)%N], 0.5f*levels[(i+N-1)%N], 0.5f*levels[(i+N-1)%N] }; + tessellator(uv,neighborSubdiv1,levels1,i); + } + } + } + } +} + diff --git a/thirdparty/embree/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree/kernels/subdiv/patch_eval_simd.h new file mode 100644 index 0000000000..fef88a4492 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch_eval_simd.h @@ -0,0 +1,127 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval_simd.h" + +namespace embree +{ + namespace isa + { + template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex> + struct PatchEvalSimd + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, + const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + /* conservative time for the very first allocation */ + auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + + Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { + auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; + return Patch::create(alloc,edge,vertices,stride); + }, true); + + auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); + + patch = allAllocationsValid ? patch : nullptr; + + /* use cached data structure for calculations */ + const vbool valid1 = patch ? 
eval(valid0,patch,u,v,1.0f,0) : vbool(false); + SharedLazyTessellationCache::unlock(); + const vbool valid2 = valid0 & !valid1; + if (any(valid2)) { + FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); + } + } + + vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) + { + vbool ret = false; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; + if (any(u0v0_mask)) ret |= eval(u0v0_mask,This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); + if (any(u1v0_mask)) ret |= eval(u1v0_mask,This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); + if (any(u1v1_mask)) ret |= eval(u1v1_mask,This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); + if (any(u0v1_mask)) ret |= eval(u0v1_mask,This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); + return ret; + } + + vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth) + { + vbool ret = false; + const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; + const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; + const vint i = (h<<2)+l; assert(all(valid,i<patch->N)); + foreach_unique(valid,i,[&](const vbool& valid, const int i) { + ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1); + }); + return ret; + } + + vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) + { + if (!This) return false; + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::BSPLINE_PATCH: { + ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::BEZIER_PATCH: { + ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::GREGORY_PATCH: { + ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::SUBDIVIDED_QUAD_PATCH: { + return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); + } + case Patch::SUBDIVIDED_GENERAL_PATCH: { + assert(dscale == 1.0f); + return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); + } + case Patch::EVAL_PATCH: { + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); + return valid; + } + default: + assert(false); + return false; + } + } + + private: + float* const P; + float* const dPdu; + float* const dPdv; + float* const ddPdudu; + float* const ddPdvdv; + float* const ddPdudv; + const size_t dstride; + const size_t N; + }; + } +} diff --git a/thirdparty/embree/kernels/subdiv/subdivpatch1base.h 
b/thirdparty/embree/kernels/subdiv/subdivpatch1base.h new file mode 100644 index 0000000000..c3069dadee --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/subdivpatch1base.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../geometry/primitive.h" +#include "bspline_patch.h" +#include "bezier_patch.h" +#include "gregory_patch.h" +#include "gregory_patch_dense.h" +#include "tessellation.h" +#include "tessellation_cache.h" +#include "gridrange.h" +#include "patch_eval_grid.h" +#include "feature_adaptive_eval_grid.h" +#include "../common/scene_subdiv_mesh.h" + +namespace embree +{ + struct __aligned(64) SubdivPatch1Base + { + public: + + enum Type { + INVALID_PATCH = 0, + BSPLINE_PATCH = 1, + BEZIER_PATCH = 2, + GREGORY_PATCH = 3, + EVAL_PATCH = 5, + BILINEAR_PATCH = 6, + }; + + enum Flags { + TRANSITION_PATCH = 16, + }; + + /*! Default constructor. */ + __forceinline SubdivPatch1Base () {} + + SubdivPatch1Base (const unsigned int gID, + const unsigned int pID, + const unsigned int subPatch, + const SubdivMesh *const mesh, + const size_t time, + const Vec2f uv[4], + const float edge_level[4], + const int subdiv[4], + const int simd_width); + + __forceinline bool needsStitching() const { + return flags & TRANSITION_PATCH; + } + + __forceinline Vec2f getUV(const size_t i) const { + return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000); + } + + static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]); + static Vec2i computeGridSize(const float level[4]); + bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width); + + public: + + __forceinline size_t getGridBytes() const { + const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4; + return 64*((grid_size_xyzuv+15) / 16); + } + + __forceinline void write_lock() { mtx.lock(); } + __forceinline void write_unlock() { mtx.unlock(); } + __forceinline bool try_write_lock() { return mtx.try_lock(); } + //__forceinline bool try_read_lock() { return mtx.try_read_lock(); } + + __forceinline void resetRootRef() { + //assert( mtx.hasInitialState() ); + root_ref = SharedLazyTessellationCache::Tag(); + } + + __forceinline SharedLazyTessellationCache::CacheEntry& entry() { + return (SharedLazyTessellationCache::CacheEntry&) root_ref; + } + + public: + __forceinline unsigned int geomID() const { + return geom; + } + + __forceinline unsigned int primID() const { + return prim; + } + + public: + SharedLazyTessellationCache::Tag root_ref; + SpinLock mtx; + + unsigned short u[4]; //!< 16bit discretized u,v coordinates + unsigned short v[4]; + float level[4]; + + unsigned char flags; + unsigned char type; + unsigned short grid_u_res; + unsigned int geom; //!< geometry ID of the subdivision mesh this patch belongs to + unsigned int prim; //!< primitive ID of this subdivision patch + unsigned short grid_v_res; + + unsigned short grid_size_simd_blocks; + unsigned int time_; + + struct PatchHalfEdge { + const HalfEdge* edge; + unsigned subPatch; + }; + + Vec3fa patch_v[4][4]; + + const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; } + unsigned time() const { return time_; } + unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; } + + void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; } + void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; } + }; + + namespace isa + { + Vec3fa 
patchEval(const SubdivPatch1Base& patch, const float uu, const float vv); + Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv); + + template<typename simdf> + Vec3<simdf> patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); + + template<typename simdf> + Vec3<simdf> patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); + + + /* eval grid over patch and stich edges when required */ + void evalGrid(const SubdivPatch1Base& patch, + const unsigned x0, const unsigned x1, + const unsigned y0, const unsigned y1, + const unsigned swidth, const unsigned sheight, + float *__restrict__ const grid_x, + float *__restrict__ const grid_y, + float *__restrict__ const grid_z, + float *__restrict__ const grid_u, + float *__restrict__ const grid_v, + const SubdivMesh* const geom); + + /* eval grid over patch and stich edges when required */ + BBox3fa evalGridBounds(const SubdivPatch1Base& patch, + const unsigned x0, const unsigned x1, + const unsigned y0, const unsigned y1, + const unsigned swidth, const unsigned sheight, + const SubdivMesh* const geom); + } +} diff --git a/thirdparty/embree/kernels/subdiv/tessellation.h b/thirdparty/embree/kernels/subdiv/tessellation.h new file mode 100644 index 0000000000..abde4f2bde --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/tessellation.h @@ -0,0 +1,161 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* adjust discret tessellation level for feature-adaptive pre-subdivision */ + __forceinline float adjustTessellationLevel(float l, const size_t sublevel) + { + for (size_t i=0; i<sublevel; i++) l *= 0.5f; + float r = ceilf(l); + for (size_t i=0; i<sublevel; i++) r *= 2.0f; + return r; + } + + __forceinline int stitch(const int x, const int fine, const int coarse) { + return (2*x+1)*coarse/(2*fine); + } + + __forceinline void stitchGridEdges(const unsigned int low_rate, + const unsigned int high_rate, + const unsigned int x0, + const unsigned int x1, + float * __restrict__ const uv_array, + const unsigned int uv_array_step) + { +#if 1 + const float inv_low_rate = rcp((float)(low_rate-1)); + for (unsigned x=x0; x<=x1; x++) { + uv_array[(x-x0)*uv_array_step] = float(stitch(x,high_rate-1,low_rate-1))*inv_low_rate; + } + if (unlikely(x1 == high_rate-1)) + uv_array[(x1-x0)*uv_array_step] = 1.0f; +#else + assert(low_rate < high_rate); + assert(high_rate >= 2); + + const float inv_low_rate = rcp((float)(low_rate-1)); + const unsigned int dy = low_rate - 1; + const unsigned int dx = high_rate - 1; + + int p = 2*dy-dx; + + unsigned int offset = 0; + unsigned int y = 0; + float value = 0.0f; + for(unsigned int x=0;x<high_rate-1; x++) // '<=' would be correct but we will leave the 1.0f at the end + { + uv_array[offset] = value; + + offset += uv_array_step; + if (unlikely(p > 0)) + { + y++; + value = (float)y * inv_low_rate; + p -= 2*dx; + } + p += 2*dy; + } +#endif + } + + __forceinline void stitchUVGrid(const float edge_levels[4], + const unsigned int swidth, + const unsigned int sheight, + const unsigned int x0, + const unsigned int y0, + const unsigned int grid_u_res, + const unsigned int grid_v_res, + float * __restrict__ const u_array, + float * __restrict__ const v_array) + { + const unsigned int x1 = x0+grid_u_res-1; + const unsigned int y1 = y0+grid_v_res-1; + const unsigned int int_edge_points0 = (unsigned int)edge_levels[0] + 1; + const unsigned int int_edge_points1 = (unsigned int)edge_levels[1] + 1; + const 
unsigned int int_edge_points2 = (unsigned int)edge_levels[2] + 1; + const unsigned int int_edge_points3 = (unsigned int)edge_levels[3] + 1; + + if (unlikely(y0 == 0 && int_edge_points0 < swidth)) + stitchGridEdges(int_edge_points0,swidth,x0,x1,u_array,1); + + if (unlikely(y1 == sheight-1 && int_edge_points2 < swidth)) + stitchGridEdges(int_edge_points2,swidth,x0,x1,&u_array[(grid_v_res-1)*grid_u_res],1); + + if (unlikely(x0 == 0 && int_edge_points1 < sheight)) + stitchGridEdges(int_edge_points1,sheight,y0,y1,&v_array[grid_u_res-1],grid_u_res); + + if (unlikely(x1 == swidth-1 && int_edge_points3 < sheight)) + stitchGridEdges(int_edge_points3,sheight,y0,y1,v_array,grid_u_res); + } + + __forceinline void gridUVTessellator(const float edge_levels[4], + const unsigned int swidth, + const unsigned int sheight, + const unsigned int x0, + const unsigned int y0, + const unsigned int grid_u_res, + const unsigned int grid_v_res, + float * __restrict__ const u_array, + float * __restrict__ const v_array) + { + assert( grid_u_res >= 1); + assert( grid_v_res >= 1); + assert( edge_levels[0] >= 1.0f ); + assert( edge_levels[1] >= 1.0f ); + assert( edge_levels[2] >= 1.0f ); + assert( edge_levels[3] >= 1.0f ); + +#if defined(__AVX__) + const vint8 grid_u_segments = vint8(swidth)-1; + const vint8 grid_v_segments = vint8(sheight)-1; + + const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments)); + const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments)); + + unsigned int index = 0; + vint8 v_i( zero ); + for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1) + { + vint8 u_i ( step ); + + const vbool8 m_v = v_i < grid_v_segments; + + for (unsigned int x=0;x<grid_u_res;x+=8, u_i += 8) + { + const vbool8 m_u = u_i < grid_u_segments; + const vfloat8 u = select(m_u, vfloat8(x0+u_i) * inv_grid_u_segments, 1.0f); + const vfloat8 v = select(m_v, vfloat8(y0+v_i) * inv_grid_v_segments, 1.0f); + vfloat8::storeu(&u_array[index + x],u); + vfloat8::storeu(&v_array[index + x],v); + } + } + #else + const vint4 grid_u_segments = vint4(swidth)-1; + const vint4 grid_v_segments = vint4(sheight)-1; + + const vfloat4 inv_grid_u_segments = rcp(vfloat4(grid_u_segments)); + const vfloat4 inv_grid_v_segments = rcp(vfloat4(grid_v_segments)); + + unsigned int index = 0; + vint4 v_i( zero ); + for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1) + { + vint4 u_i ( step ); + + const vbool4 m_v = v_i < grid_v_segments; + + for (unsigned int x=0;x<grid_u_res;x+=4, u_i += 4) + { + const vbool4 m_u = u_i < grid_u_segments; + const vfloat4 u = select(m_u, vfloat4(x0+u_i) * inv_grid_u_segments, 1.0f); + const vfloat4 v = select(m_v, vfloat4(y0+v_i) * inv_grid_v_segments, 1.0f); + vfloat4::storeu(&u_array[index + x],u); + vfloat4::storeu(&v_array[index + x],v); + } + } +#endif + } +} diff --git a/thirdparty/embree/kernels/subdiv/tessellation_cache.h b/thirdparty/embree/kernels/subdiv/tessellation_cache.h new file mode 100644 index 0000000000..99edf49be4 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/tessellation_cache.h @@ -0,0 +1,325 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" + +/* force a complete cache invalidation when running out of allocation space */ +#define FORCE_SIMPLE_FLUSH 0 + +#define THREAD_BLOCK_ATOMIC_ADD 4 + +#if defined(DEBUG) +#define CACHE_STATS(x) +#else +#define CACHE_STATS(x) +#endif + +namespace embree +{ + class SharedTessellationCacheStats + { + public: + /* stats */ + static 
std::atomic<size_t> cache_accesses; + static std::atomic<size_t> cache_hits; + static std::atomic<size_t> cache_misses; + static std::atomic<size_t> cache_flushes; + static size_t cache_num_patches; + __aligned(64) static SpinLock mtx; + + /* print stats for debugging */ + static void printStats(); + static void clearStats(); + }; + + void resizeTessellationCache(size_t new_size); + void resetTessellationCache(); + + //////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(64) ThreadWorkState + { + ALIGNED_STRUCT_(64); + + std::atomic<size_t> counter; + ThreadWorkState* next; + bool allocated; + + __forceinline ThreadWorkState(bool allocated = false) + : counter(0), next(nullptr), allocated(allocated) + { + assert( ((size_t)this % 64) == 0 ); + } + }; + + class __aligned(64) SharedLazyTessellationCache + { + public: + + static const size_t NUM_CACHE_SEGMENTS = 8; + static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512; + static const size_t COMMIT_INDEX_SHIFT = 32+8; +#if defined(__64BIT__) + static const size_t REF_TAG_MASK = 0xffffffffff; +#else + static const size_t REF_TAG_MASK = 0x7FFFFFFF; +#endif + static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1; + static const size_t BLOCK_SIZE = 64; + + + /*! Per thread tessellation ref cache */ + static __thread ThreadWorkState* init_t_state; + static ThreadWorkState* current_t_state; + + static __forceinline ThreadWorkState *threadState() + { + if (unlikely(!init_t_state)) + /* sets init_t_state, can't return pointer due to macosx icc bug*/ + SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState(); + return init_t_state; + } + + struct Tag + { + __forceinline Tag() : data(0) {} + + __forceinline Tag(void* ptr, size_t combinedTime) { + init(ptr,combinedTime); + } + + __forceinline Tag(size_t ptr, size_t combinedTime) { + init((void*)ptr,combinedTime); + } + + __forceinline void init(void* ptr, size_t combinedTime) + { + if (ptr == nullptr) { + data = 0; + return; + } + int64_t new_root_ref = (int64_t) ptr; + new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr(); + assert( new_root_ref <= (int64_t)REF_TAG_MASK ); + new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT; + data = new_root_ref; + } + + __forceinline int64_t get() const { return data.load(); } + __forceinline void set( int64_t v ) { data.store(v); } + __forceinline void reset() { data.store(0); } + + private: + atomic<int64_t> data; + }; + + static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; } + + struct CacheEntry + { + Tag tag; + SpinLock mutex; + }; + + private: + + float *data; + bool hugepages; + size_t size; + size_t maxBlocks; + ThreadWorkState *threadWorkState; + + __aligned(64) std::atomic<size_t> localTime; + __aligned(64) std::atomic<size_t> next_block; + __aligned(64) SpinLock reset_state; + __aligned(64) SpinLock linkedlist_mtx; + __aligned(64) std::atomic<size_t> switch_block_threshold; + __aligned(64) std::atomic<size_t> numRenderThreads; + + + public: + + + SharedLazyTessellationCache(); + ~SharedLazyTessellationCache(); + + void getNextRenderThreadWorkState(); + + __forceinline size_t maxAllocSize() const { + return switch_block_threshold; + } + + __forceinline size_t getCurrentIndex() { 
return localTime.load(); } + __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); } + + __forceinline size_t getTime(const size_t globalTime) { + return localTime.load()+NUM_CACHE_SEGMENTS*globalTime; + } + + + __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); } + __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); } + + __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; } + + static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); } + static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); } + static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); } + static __forceinline size_t getState() { return threadState()->counter.load(); } + static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); } + + static __forceinline size_t getTCacheTime(const size_t globalTime) { + return sharedLazyTessellationCache.getTime(globalTime); + } + + /* per thread lock */ + __forceinline void lockThreadLoop (ThreadWorkState *const t_state) + { + while(1) + { + size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1); + if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD)) + { + /* lock failed wait until sync phase is over */ + sharedLazyTessellationCache.unlockThread(t_state,-1); + sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0); + } + else + break; + } + } + + static __forceinline void* lookup(CacheEntry& entry, size_t globalTime) + { + const int64_t subdiv_patch_root_ref = entry.tag.get(); + CACHE_STATS(SharedTessellationCacheStats::cache_accesses++); + + if (likely(subdiv_patch_root_ref != 0)) + { + const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr(); + const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); + + if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) )) + { + CACHE_STATS(SharedTessellationCacheStats::cache_hits++); + return (void*) subdiv_patch_root; + } + } + CACHE_STATS(SharedTessellationCacheStats::cache_misses++); + return nullptr; + } + + template<typename Constructor> + static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor()) + { + ThreadWorkState *t_state = SharedLazyTessellationCache::threadState(); + + while (true) + { + sharedLazyTessellationCache.lockThreadLoop(t_state); + void* patch = SharedLazyTessellationCache::lookup(entry,globalTime); + if (patch) return (decltype(constructor())) patch; + + if (entry.mutex.try_lock()) + { + if (!validTag(entry.tag,globalTime)) + { + auto timeBefore = sharedLazyTessellationCache.getTime(globalTime); + auto ret = constructor(); // thread is locked here! + assert(ret); + /* this should never return nullptr */ + auto timeAfter = sharedLazyTessellationCache.getTime(globalTime); + auto time = before ? 
timeBefore : timeAfter; + __memory_barrier(); + entry.tag = SharedLazyTessellationCache::Tag(ret,time); + __memory_barrier(); + entry.mutex.unlock(); + return ret; + } + entry.mutex.unlock(); + } + SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state); + } + } + + __forceinline bool validCacheIndex(const size_t i, const size_t globalTime) + { +#if FORCE_SIMPLE_FLUSH == 1 + return i == getTime(globalTime); +#else + return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime); +#endif + } + + static __forceinline bool validTime(const size_t oldtime, const size_t newTime) + { + return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime; + } + + + static __forceinline bool validTag(const Tag& tag, size_t globalTime) + { + const int64_t subdiv_patch_root_ref = tag.get(); + if (subdiv_patch_root_ref == 0) return false; + const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); + return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime); + } + + void waitForUsersLessEqual(ThreadWorkState *const t_state, + const unsigned int users); + + __forceinline size_t alloc(const size_t blocks) + { + if (unlikely(blocks >= switch_block_threshold)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment"); + + assert(blocks < switch_block_threshold); + size_t index = next_block.fetch_add(blocks); + if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1; + return index; + } + + static __forceinline void* malloc(const size_t bytes) + { + size_t block_index = -1; + ThreadWorkState *const t_state = threadState(); + while (true) + { + block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE); + if (block_index == (size_t)-1) + { + sharedLazyTessellationCache.unlockThread(t_state); + sharedLazyTessellationCache.allocNextSegment(); + sharedLazyTessellationCache.lockThread(t_state); + continue; + } + break; + } + return sharedLazyTessellationCache.getBlockPtr(block_index); + } + + __forceinline void *getBlockPtr(const size_t block_index) + { + assert(block_index < maxBlocks); + assert(data); + assert(block_index*16 <= size); + return (void*)&data[block_index*16]; + } + + __forceinline void* getDataPtr() { return data; } + __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; } + __forceinline size_t getMaxBlocks() { return maxBlocks; } + __forceinline size_t getSize() { return size; } + + void allocNextSegment(); + void realloc(const size_t newSize); + + void reset(); + + static SharedLazyTessellationCache sharedLazyTessellationCache; + }; +} diff --git a/thirdparty/embree/patches/godot-changes-android.patch b/thirdparty/embree/patches/godot-changes-android.patch new file mode 100644 index 0000000000..a27f924bde --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-android.patch @@ -0,0 +1,103 @@ +diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp +index ba97dc227b..1679599608 100644 +--- a/thirdparty/embree/common/sys/sysinfo.cpp ++++ b/thirdparty/embree/common/sys/sysinfo.cpp +@@ -618,7 +618,10 @@ namespace embree + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +-#if defined(__MACOSX__) ++// -- GODOT start -- ++// #if defined(__MACOSX__) ++#if defined(__MACOSX__) || defined(__ANDROID__) ++// -- GODOT end -- + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); + #else +diff --git a/thirdparty/embree/common/sys/thread.cpp 
b/thirdparty/embree/common/sys/thread.cpp +index a7827e18f7..f4014be89b 100644 +--- a/thirdparty/embree/common/sys/thread.cpp ++++ b/thirdparty/embree/common/sys/thread.cpp +@@ -158,7 +158,9 @@ namespace embree + /// Linux Platform + //////////////////////////////////////////////////////////////////////////////// + +-#if defined(__LINUX__) ++// -- GODOT start -- ++#if defined(__LINUX__) && !defined(__ANDROID__) ++// -- GODOT end -- + + #include <fstream> + #include <sstream> +@@ -247,6 +249,28 @@ namespace embree + } + #endif + ++// -- GODOT start -- ++//////////////////////////////////////////////////////////////////////////////// ++/// Android Platform ++//////////////////////////////////////////////////////////////////////////////// ++ ++#if defined(__ANDROID__) ++ ++namespace embree ++{ ++ /*! set affinity of the calling thread */ ++ void setAffinity(ssize_t affinity) ++ { ++ cpu_set_t cset; ++ CPU_ZERO(&cset); ++ CPU_SET(affinity, &cset); ++ ++ sched_setaffinity(0, sizeof(cset), &cset); ++ } ++} ++#endif ++// -- GODOT end -- ++ + //////////////////////////////////////////////////////////////////////////////// + /// FreeBSD Platform + //////////////////////////////////////////////////////////////////////////////// +@@ -355,7 +379,9 @@ namespace embree + pthread_attr_destroy(&attr); + + /* set affinity */ +-#if defined(__LINUX__) ++// -- GODOT start -- ++#if defined(__LINUX__) && !defined(__ANDROID__) ++// -- GODOT end -- + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); +@@ -370,7 +396,16 @@ namespace embree + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } ++// -- GODOT start -- ++#elif defined(__ANDROID__) ++ if (threadID >= 0) { ++ cpu_set_t cset; ++ CPU_ZERO(&cset); ++ CPU_SET(threadID, &cset); ++ sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); ++ } + #endif ++// -- GODOT end -- + + return thread_t(tid); + } +@@ -389,8 +424,14 @@ namespace embree + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { ++// -- GODOT start -- ++#if defined(__ANDROID__) ++ FATAL("Can't destroy threads on Android."); ++#else + pthread_cancel(*(pthread_t*)tid); + delete (pthread_t*)tid; ++#endif ++// -- GODOT end -- + } + + /*! 
creates thread local storage */ diff --git a/thirdparty/embree/patches/godot-changes-misc.patch b/thirdparty/embree/patches/godot-changes-misc.patch new file mode 100644 index 0000000000..8bf0d9fa97 --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-misc.patch @@ -0,0 +1,105 @@ +diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h +index 79729c87ab..ed8dd7d40a 100644 +--- a/thirdparty/embree/common/sys/intrinsics.h ++++ b/thirdparty/embree/common/sys/intrinsics.h +@@ -34,8 +34,14 @@ + #endif + + #if defined(__WIN32__) +-# define NOMINMAX +-# include <windows.h> ++// -- GODOT start -- ++#if !defined(NOMINMAX) ++// -- GODOT end -- ++#define NOMINMAX ++// -- GODOT start -- ++#endif ++#include "windows.h" ++// -- GODOT end -- + #endif + + /* normally defined in pmmintrin.h, but we always need this */ +diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h +index 3fc5e99b8d..697e07bb86 100644 +--- a/thirdparty/embree/common/sys/platform.h ++++ b/thirdparty/embree/common/sys/platform.h +@@ -99,7 +99,9 @@ + #define dll_import + #endif + +-#ifdef __WIN32__ ++// -- GODOT start -- ++#if defined(__WIN32__) && !defined(__MINGW32__) ++// -- GODOT end -- + #if !defined(__noinline) + #define __noinline __declspec(noinline) + #endif +@@ -149,6 +151,9 @@ + #define DELETED = delete + #endif + ++// -- GODOT start -- ++#if !defined(likely) ++// -- GODOT end -- + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #define likely(expr) (expr) + #define unlikely(expr) (expr) +@@ -156,6 +161,9 @@ + #define likely(expr) __builtin_expect((bool)(expr),true ) + #define unlikely(expr) __builtin_expect((bool)(expr),false) + #endif ++// -- GODOT start -- ++#endif ++// -- GODOT end -- + + //////////////////////////////////////////////////////////////////////////////// + /// Error handling and debugging +diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp +index ba97dc227b..f1a59e511e 100644 +--- a/thirdparty/embree/common/sys/sysinfo.cpp ++++ b/thirdparty/embree/common/sys/sysinfo.cpp +@@ -248,7 +248,9 @@ namespace embree + #if defined(__X86_ASM__) + __noinline int64_t get_xcr0() + { +-#if defined (__WIN32__) ++// -- GODOT start -- ++#if defined (__WIN32__) && !defined (__MINGW32__) ++// -- GODOT end -- + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h +index 9c14b28745..4857e1e05e 100644 +--- a/thirdparty/embree/include/embree3/rtcore_common.h ++++ b/thirdparty/embree/include/embree3/rtcore_common.h +@@ -19,7 +19,9 @@ typedef int ssize_t; + #endif + #endif + +-#ifdef _WIN32 ++// -- GODOT start -- ++#if defined(_WIN32) && defined(_MSC_VER) ++// -- GODOT end -- + # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) + #else + # define RTC_ALIGN(...) 
__attribute__((aligned(__VA_ARGS__))) +diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h +index 3fd15816e9..35bd49849f 100644 +--- a/thirdparty/embree/common/tasking/taskschedulertbb.h ++++ b/thirdparty/embree/common/tasking/taskschedulertbb.h +@@ -12,7 +12,13 @@ + #include "../sys/ref.h" + + #if defined(__WIN32__) ++// -- GODOT start -- ++#if !defined(NOMINMAX) ++// -- GODOT end -- + # define NOMINMAX ++// -- GODOT start -- ++#endif ++// -- GODOT end -- + #endif + + // We need to define these to avoid implicit linkage against +
\ No newline at end of file diff --git a/thirdparty/embree/patches/godot-changes-noexcept.patch b/thirdparty/embree/patches/godot-changes-noexcept.patch new file mode 100644 index 0000000000..c587a0e2be --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-noexcept.patch @@ -0,0 +1,630 @@ +diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h +index f052d8b468..645681ac63 100644 +--- a/thirdparty/embree/common/algorithms/parallel_for.h ++++ b/thirdparty/embree/common/algorithms/parallel_for.h +@@ -21,7 +21,10 @@ namespace embree + func(r.begin()); + }); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + } + + #elif defined(TASKING_TBB) +@@ -31,13 +34,19 @@ namespace embree + func(i); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -57,7 +66,10 @@ namespace embree + #if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + + #elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 +@@ -66,13 +78,19 @@ namespace embree + func(range<Index>(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { + func(range<Index>(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -104,13 +122,19 @@ namespace embree + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } + +@@ -125,13 +149,19 @@ namespace embree + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- 
GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } + +diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h +index f42ae2ec50..8271372ea4 100644 +--- a/thirdparty/embree/common/algorithms/parallel_reduce.h ++++ b/thirdparty/embree/common/algorithms/parallel_reduce.h +@@ -58,15 +58,19 @@ namespace embree + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction,context); +- if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (context.is_group_execution_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction); +- if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (tbb::task::self().is_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #endif + #else // TASKING_PPL +diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp +index 42ffb10176..a037869506 100644 +--- a/thirdparty/embree/common/lexers/stringstream.cpp ++++ b/thirdparty/embree/common/lexers/stringstream.cpp +@@ -39,7 +39,10 @@ namespace embree + std::vector<char> str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); +- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ // -- GODOT start -- ++ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ if (!isValidChar(c)) abort(); ++ // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); +diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp +index 1bc30fe9a5..abdd269069 100644 +--- a/thirdparty/embree/common/sys/alloc.cpp ++++ b/thirdparty/embree/common/sys/alloc.cpp +@@ -21,7 +21,10 @@ namespace embree + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return ptr; + } +@@ -128,7 +131,10 @@ namespace embree + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); +- if (ptr == nullptr) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == nullptr) throw std::bad_alloc(); ++ if (ptr == nullptr) abort(); ++ // -- GODOT end -- + hugepages = false; + return ptr; + } +@@ -145,7 +151,10 @@ namespace embree + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -156,7 +165,10 @@ namespace embree + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT 
end -- + } + + void os_advise(void *ptr, size_t bytes) +@@ -260,7 +272,10 @@ namespace embree + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +- if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ if (ptr == MAP_FAILED) abort(); ++ // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ +@@ -277,7 +292,10 @@ namespace embree + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -291,7 +309,10 @@ namespace embree + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ +diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h +index 8a6d9fa0a9..697e07bb86 100644 +--- a/thirdparty/embree/common/sys/platform.h ++++ b/thirdparty/embree/common/sys/platform.h +@@ -179,11 +179,19 @@ + #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(str); ++ abort(); ++ // -- GODOT end -- + #endif + + #define FATAL(x) THROW_RUNTIME_ERROR(x) +diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp +index dca835a716..ad438588a3 100644 +--- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp ++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp +@@ -48,13 +48,15 @@ namespace embree + { + Task* prevTask = thread.task; + thread.task = this; +- try { +- if (thread.scheduler->cancellingException == nullptr) ++ // -- GODOT start -- ++ // try { ++ // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); +- } catch (...) { +- if (thread.scheduler->cancellingException == nullptr) +- thread.scheduler->cancellingException = std::current_exception(); +- } ++ // } catch (...) 
{ ++ // if (thread.scheduler->cancellingException == nullptr) ++ // thread.scheduler->cancellingException = std::current_exception(); ++ // } ++ // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } +@@ -291,8 +293,11 @@ namespace embree + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); +- std::exception_ptr except = thread_loop(threadIndex); +- if (except != nullptr) std::rethrow_exception(except); ++ // -- GODOT start -- ++ // std::exception_ptr except = thread_loop(threadIndex); ++ // if (except != nullptr) std::rethrow_exception(except); ++ thread_loop(threadIndex); ++ // -- GODOT end -- + } + + void TaskScheduler::reset() { +@@ -324,7 +329,10 @@ namespace embree + return thread->scheduler->cancellingException == nullptr; + } + +- std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++// -- GODOT start -- ++// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++ void TaskScheduler::thread_loop(size_t threadIndex) ++// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation +@@ -347,9 +355,10 @@ namespace embree + swapThread(oldThread); + + /* remember exception to throw */ +- std::exception_ptr except = nullptr; +- if (cancellingException != nullptr) except = cancellingException; +- ++ // -- GODOT start -- ++ // std::exception_ptr except = nullptr; ++ // if (cancellingException != nullptr) except = cancellingException; ++ // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; + #if defined(__WIN32__) +@@ -367,7 +376,10 @@ namespace embree + yield(); + #endif + } +- return except; ++ // -- GODOT start -- ++ // return except; ++ return; ++ // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) +diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h +index c766a0bb6a..8fa6bb12fa 100644 +--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h ++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h +@@ -123,7 +123,10 @@ namespace embree + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) +- throw std::runtime_error("closure stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("closure stack overflow"); ++ abort(); ++ // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } +@@ -132,7 +135,10 @@ namespace embree + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) +- throw std::runtime_error("task stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task stack overflow"); ++ abort(); ++ // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; +@@ -238,7 +244,10 @@ namespace embree + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ +- std::exception_ptr thread_loop(size_t threadIndex); ++ // -- GODOT start -- ++ // std::exception_ptr thread_loop(size_t threadIndex); ++ void thread_loop(size_t threadIndex); ++ // -- GODOT end -- + + /*! 
steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); +diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +index d8da78eed7..d857ff7d95 100644 +--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp ++++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +@@ -150,7 +150,10 @@ namespace embree + } + } + else { +- throw std::runtime_error("not supported node type in bvh_statistics"); ++ // -- GODOT start -- ++ // throw std::runtime_error("not supported node type in bvh_statistics"); ++ abort(); ++ // -- GODOT end -- + } + return s; + } +diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp +index 74e9fb335c..94b3819e42 100644 +--- a/thirdparty/embree/kernels/common/rtcore.cpp ++++ b/thirdparty/embree/kernels/common/rtcore.cpp +@@ -197,7 +197,10 @@ RTC_NAMESPACE_BEGIN; + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } +@@ -1350,7 +1353,10 @@ RTC_NAMESPACE_BEGIN; + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } +diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h +index 4e4b24e9c2..373e49a689 100644 +--- a/thirdparty/embree/kernels/common/rtcore.h ++++ b/thirdparty/embree/kernels/common/rtcore.h +@@ -25,52 +25,58 @@ namespace embree + #endif + + /*! Macros used in the rtcore API implementation */ +-#define RTC_CATCH_BEGIN try { ++// -- GODOT start -- ++// #define RTC_CATCH_BEGIN try { ++#define RTC_CATCH_BEGIN + +-#define RTC_CATCH_END(device) \ +- } catch (std::bad_alloc&) { \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END(device) \ ++// } catch (std::bad_alloc&) { \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END(device) + +-#define RTC_CATCH_END2(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device* device = scene ? 
scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END2(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END2(scene) + +-#define RTC_CATCH_END2_FALSE(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- return false; \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- return false; \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- return false; \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- return false; \ +- } ++// #define RTC_CATCH_END2_FALSE(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// return false; \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// return false; \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// return false; \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// return false; \ ++// } ++#define RTC_CATCH_END2_FALSE(scene) return false; ++// -- GODOT end -- + + #define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ +@@ -97,28 +103,38 @@ namespace embree + #define RTC_TRACE(x) + #endif + +- /*! used to throw embree API errors */ +- struct rtcore_error : public std::exception +- { +- __forceinline rtcore_error(RTCError error, const std::string& str) +- : error(error), str(str) {} +- +- ~rtcore_error() throw() {} +- +- const char* what () const throw () { +- return str.c_str(); +- } +- +- RTCError error; +- std::string str; +- }; ++// -- GODOT begin -- ++// /*! 
used to throw embree API errors */ ++// struct rtcore_error : public std::exception ++// { ++// __forceinline rtcore_error(RTCError error, const std::string& str) ++// : error(error), str(str) {} ++// ++// ~rtcore_error() throw() {} ++// ++// const char* what () const throw () { ++// return str.c_str(); ++// } ++// ++// RTCError error; ++// std::string str; ++// }; ++// -- GODOT end -- + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,str); ++ abort(); ++ // -- GODOT end -- + #endif + + #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ +diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp +index 0149055f2c..408d7eae6f 100644 +--- a/thirdparty/embree/kernels/common/scene.cpp ++++ b/thirdparty/embree/kernels/common/scene.cpp +@@ -792,16 +792,18 @@ namespace embree + } + + /* initiate build */ +- try { ++ // -- GODOT start -- ++ // try { + scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); +- } +- catch (...) { +- accels_clear(); +- updateInterface(); +- Lock<MutexSys> lock(schedulerMutex); +- this->scheduler = nullptr; +- throw; +- } ++ // } ++ // catch (...) { ++ // accels_clear(); ++ // updateInterface(); ++ // Lock<MutexSys> lock(schedulerMutex); ++ // this->scheduler = nullptr; ++ // throw; ++ // } ++ // -- GODOT end -- + } + + #endif diff --git a/thirdparty/embree/patches/godot-changes-ubsan.patch b/thirdparty/embree/patches/godot-changes-ubsan.patch new file mode 100644 index 0000000000..1336246f0d --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-ubsan.patch @@ -0,0 +1,24 @@ +diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp +index bb4fc81dfe..d279dc4993 100644 +--- a/thirdparty/embree/kernels/builders/primrefgen.cpp ++++ b/thirdparty/embree/kernels/builders/primrefgen.cpp +@@ -184,6 +184,9 @@ namespace embree + + // special variants for grid meshes + ++// -- GODOT start -- ++#if defined(EMBREE_GEOMETRY_GRID) ++// -- GODOT end -- + PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) + { + PrimInfo pinfo(empty); +@@ -293,6 +296,9 @@ namespace embree + + return pinfo; + } ++// -- GODOT start -- ++#endif ++// -- GODOT end -- + + // ==================================================================================================== + // ==================================================================================================== diff --git a/thirdparty/enet/godot.cpp b/thirdparty/enet/godot.cpp index 73fa3c62a2..fd7968204b 100644 --- a/thirdparty/enet/godot.cpp +++ b/thirdparty/enet/godot.cpp @@ -45,9 +45,10 @@ /// Abstract ENet interface for UDP/DTLS. 
class ENetGodotSocket { public: - virtual Error bind(IP_Address p_ip, uint16_t p_port) = 0; - virtual Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) = 0; - virtual Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IP_Address &r_ip, uint16_t &r_port) = 0; + virtual Error bind(IPAddress p_ip, uint16_t p_port) = 0; + virtual Error get_socket_address(IPAddress *r_ip, uint16_t *r_port) = 0; + virtual Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IPAddress p_ip, uint16_t p_port) = 0; + virtual Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IPAddress &r_ip, uint16_t &r_port) = 0; virtual int set_option(ENetSocketOption p_option, int p_value) = 0; virtual void close() = 0; virtual void set_refuse_new_connections(bool p_enable) {} /* Only used by dtls server */ @@ -64,8 +65,7 @@ class ENetUDP : public ENetGodotSocket { private: Ref<NetSocket> sock; - IP_Address address; - uint16_t port = 0; + IPAddress local_address; bool bound = false; public: @@ -79,18 +79,21 @@ public: sock->close(); } - Error bind(IP_Address p_ip, uint16_t p_port) { - address = p_ip; - port = p_port; + Error bind(IPAddress p_ip, uint16_t p_port) { + local_address = p_ip; bound = true; - return sock->bind(address, port); + return sock->bind(p_ip, p_port); } - Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) { + Error get_socket_address(IPAddress *r_ip, uint16_t *r_port) { + return sock->get_socket_address(r_ip, r_port); + } + + Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IPAddress p_ip, uint16_t p_port) { return sock->sendto(p_buffer, p_len, r_sent, p_ip, p_port); } - Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IP_Address &r_ip, uint16_t &r_port) { + Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IPAddress &r_ip, uint16_t &r_port) { Error err = sock->poll(NetSocket::POLL_TYPE_IN, 0); if (err != OK) { return err; @@ -142,6 +145,7 @@ public: void close() { sock->close(); + local_address.clear(); } }; @@ -153,6 +157,7 @@ class ENetDTLSClient : public ENetGodotSocket { bool verify = false; String for_hostname; Ref<X509Certificate> cert; + IPAddress local_address; public: ENetDTLSClient(ENetUDP *p_base, Ref<X509Certificate> p_cert, bool p_verify, String p_for_hostname) { @@ -161,9 +166,11 @@ public: cert = p_cert; udp.instance(); dtls = Ref<PacketPeerDTLS>(PacketPeerDTLS::create()); - p_base->close(); if (p_base->bound) { - bind(p_base->address, p_base->port); + uint16_t port; + p_base->get_socket_address(&local_address, &port); + p_base->close(); + bind(local_address, port); } } @@ -171,11 +178,21 @@ public: close(); } - Error bind(IP_Address p_ip, uint16_t p_port) { - return udp->listen(p_port, p_ip); + Error bind(IPAddress p_ip, uint16_t p_port) { + local_address = p_ip; + return udp->bind(p_port, p_ip); + } + + Error get_socket_address(IPAddress *r_ip, uint16_t *r_port) { + if (!udp->is_bound()) { + return ERR_UNCONFIGURED; + } + *r_ip = local_address; + *r_port = udp->get_local_port(); + return OK; } - Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) { + Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IPAddress p_ip, uint16_t p_port) { if (!connected) { udp->connect_to_host(p_ip, p_port); dtls->connect_to_peer(udp, verify, for_hostname, cert); @@ -191,7 +208,7 @@ public: return dtls->put_packet(p_buffer, p_len); } - Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IP_Address &r_ip, 
uint16_t &r_port) { + Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IPAddress &r_ip, uint16_t &r_port) { dtls->poll(); if (dtls->get_status() == PacketPeerDTLS::STATUS_HANDSHAKING) { return ERR_BUSY; @@ -211,7 +228,7 @@ public: ERR_FAIL_COND_V(err != OK, err); ERR_FAIL_COND_V(p_len < r_read, ERR_OUT_OF_MEMORY); - copymem(p_buffer, buffer, r_read); + memcpy(p_buffer, buffer, r_read); r_ip = udp->get_packet_address(); r_port = udp->get_packet_port(); return err; @@ -233,13 +250,16 @@ class ENetDTLSServer : public ENetGodotSocket { Ref<UDPServer> udp_server; Map<String, Ref<PacketPeerDTLS>> peers; int last_service = 0; + IPAddress local_address; public: ENetDTLSServer(ENetUDP *p_base, Ref<CryptoKey> p_key, Ref<X509Certificate> p_cert) { udp_server.instance(); - p_base->close(); if (p_base->bound) { - bind(p_base->address, p_base->port); + uint16_t port; + p_base->get_socket_address(&local_address, &port); + p_base->close(); + bind(local_address, port); } server = Ref<DTLSServer>(DTLSServer::create()); server->setup(p_key, p_cert); @@ -253,11 +273,21 @@ public: udp_server->set_max_pending_connections(p_refuse ? 0 : 16); } - Error bind(IP_Address p_ip, uint16_t p_port) { + Error bind(IPAddress p_ip, uint16_t p_port) { + local_address = p_ip; return udp_server->listen(p_port, p_ip); } - Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) { + Error get_socket_address(IPAddress *r_ip, uint16_t *r_port) { + if (!udp_server->is_listening()) { + return ERR_UNCONFIGURED; + } + *r_ip = local_address; + *r_port = udp_server->get_local_port(); + return OK; + } + + Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IPAddress p_ip, uint16_t p_port) { String key = String(p_ip) + ":" + itos(p_port); ERR_FAIL_COND_V(!peers.has(key), ERR_UNAVAILABLE); Ref<PacketPeerDTLS> peer = peers[key]; @@ -272,12 +302,12 @@ public: return err; } - Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IP_Address &r_ip, uint16_t &r_port) { + Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IPAddress &r_ip, uint16_t &r_port) { udp_server->poll(); // TODO limits? Maybe we can better enforce allowed connections! if (udp_server->is_connection_available()) { Ref<PacketPeerUDP> udp = udp_server->take_connection(); - IP_Address peer_ip = udp->get_packet_address(); + IPAddress peer_ip = udp->get_packet_address(); int peer_port = udp->get_packet_port(); Ref<PacketPeerDTLS> peer = server->take_connection(udp); PacketPeerDTLS::Status status = peer->get_status(); @@ -315,7 +345,7 @@ public: Vector<String> s = E->key().rsplit(":", false, 1); ERR_CONTINUE(s.size() != 2); // BUG! 
- copymem(p_buffer, buffer, r_read); + memcpy(p_buffer, buffer, r_read); r_ip = s[0]; r_port = s[1].to_int(); break; // err = OK @@ -341,6 +371,7 @@ public: peers.clear(); udp_server->stop(); server->stop(); + local_address.clear(); } }; @@ -366,7 +397,7 @@ void enet_time_set(enet_uint32 newTimeBase) { } int enet_address_set_host(ENetAddress *address, const char *name) { - IP_Address ip = IP::get_singleton()->resolve_hostname(name); + IPAddress ip = IP::get_singleton()->resolve_hostname(name); ERR_FAIL_COND_V(!ip.is_valid(), -1); enet_address_set_ip(address, ip.get_ipv6(), 16); @@ -411,9 +442,9 @@ void enet_host_refuse_new_connections(ENetHost *host, int p_refuse) { } int enet_socket_bind(ENetSocket socket, const ENetAddress *address) { - IP_Address ip; + IPAddress ip; if (address->wildcard) { - ip = IP_Address("*"); + ip = IPAddress("*"); } else { ip.set_ipv6(address->host); } @@ -435,7 +466,7 @@ int enet_socket_send(ENetSocket socket, const ENetAddress *address, const ENetBu ERR_FAIL_COND_V(address == nullptr, -1); ENetGodotSocket *sock = (ENetGodotSocket *)socket; - IP_Address dest; + IPAddress dest; Error err; size_t i = 0; @@ -477,7 +508,7 @@ int enet_socket_receive(ENetSocket socket, ENetAddress *address, ENetBuffer *buf ENetGodotSocket *sock = (ENetGodotSocket *)socket; int read; - IP_Address ip; + IPAddress ip; Error err = sock->recvfrom((uint8_t *)buffers[0].data, buffers[0].dataLength, read, ip, address->port); if (err == ERR_BUSY) { @@ -493,15 +524,26 @@ int enet_socket_receive(ENetSocket socket, ENetAddress *address, ENetBuffer *buf return read; } +int enet_socket_get_address (ENetSocket socket, ENetAddress * address) { + IPAddress ip; + uint16_t port; + ENetGodotSocket *sock = (ENetGodotSocket *)socket; + + if (sock->get_socket_address(&ip, &port) != OK) { + return -1; + } + + enet_address_set_ip(address, ip.get_ipv6(), 16); + address->port = port; + + return 0; +} + // Not implemented int enet_socket_wait(ENetSocket socket, enet_uint32 *condition, enet_uint32 timeout) { return 0; // do we need this function? } -int enet_socket_get_address(ENetSocket socket, ENetAddress *address) { - return -1; // do we need this function? -} - int enet_socketset_select(ENetSocket maxSocket, ENetSocketSet *readSet, ENetSocketSet *writeSet, enet_uint32 timeout) { return -1; } diff --git a/thirdparty/etc2comp/AUTHORS b/thirdparty/etc2comp/AUTHORS deleted file mode 100644 index e78a7f4d21..0000000000 --- a/thirdparty/etc2comp/AUTHORS +++ /dev/null @@ -1,7 +0,0 @@ -# This is the list of Etc2Comp authors for copyright purposes. -# -# This does not necessarily list everyone who has contributed code, since in -# some cases, their employer may be the copyright holder. To see the full list -# of contributors, see the revision history in source control. -Google Inc. -Blue Shift Inc. diff --git a/thirdparty/etc2comp/Etc.cpp b/thirdparty/etc2comp/Etc.cpp deleted file mode 100644 index a5ee706048..0000000000 --- a/thirdparty/etc2comp/Etc.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "EtcConfig.h" -#include "Etc.h" -#include "EtcFilter.h" - -#include <string.h> - -namespace Etc -{ - // ---------------------------------------------------------------------------------------------------- - // C-style inteface to the encoder - // - void Encode(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uiMaxJobs, - unsigned char **a_ppaucEncodingBits, - unsigned int *a_puiEncodingBitsBytes, - unsigned int *a_puiExtendedWidth, - unsigned int *a_puiExtendedHeight, - int *a_piEncodingTime_ms, bool a_bVerboseOutput) - { - - Image image(a_pafSourceRGBA, a_uiSourceWidth, - a_uiSourceHeight, - a_eErrMetric); - image.m_bVerboseOutput = a_bVerboseOutput; - image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs); - - *a_ppaucEncodingBits = image.GetEncodingBits(); - *a_puiEncodingBitsBytes = image.GetEncodingBitsBytes(); - *a_puiExtendedWidth = image.GetExtendedWidth(); - *a_puiExtendedHeight = image.GetExtendedHeight(); - *a_piEncodingTime_ms = image.GetEncodingTimeMs(); - } - - void EncodeMipmaps(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uiMaxJobs, - unsigned int a_uiMaxMipmaps, - unsigned int a_uiMipFilterFlags, - RawImage* a_pMipmapImages, - int *a_piEncodingTime_ms, - bool a_bVerboseOutput) - { - auto mipWidth = a_uiSourceWidth; - auto mipHeight = a_uiSourceHeight; - int totalEncodingTime = 0; - for(unsigned int mip = 0; mip < a_uiMaxMipmaps && mipWidth >= 1 && mipHeight >= 1; mip++) - { - float* pImageData = nullptr; - float* pMipImage = nullptr; - - if(mip == 0) - { - pImageData = a_pafSourceRGBA; - } - else - { - pMipImage = new float[mipWidth*mipHeight*4]; - if(FilterTwoPass(a_pafSourceRGBA, a_uiSourceWidth, a_uiSourceHeight, pMipImage, mipWidth, mipHeight, a_uiMipFilterFlags, Etc::FilterLanczos3) ) - { - pImageData = pMipImage; - } - } - - if ( pImageData ) - { - - Image image(pImageData, mipWidth, mipHeight, a_eErrMetric); - - image.m_bVerboseOutput = a_bVerboseOutput; - image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs); - - a_pMipmapImages[mip].paucEncodingBits = std::shared_ptr<unsigned char>(image.GetEncodingBits(), [](unsigned char *p) { delete[] p; }); - a_pMipmapImages[mip].uiEncodingBitsBytes = image.GetEncodingBitsBytes(); - a_pMipmapImages[mip].uiExtendedWidth = image.GetExtendedWidth(); - a_pMipmapImages[mip].uiExtendedHeight = image.GetExtendedHeight(); - - totalEncodingTime += image.GetEncodingTimeMs(); - } - - if(pMipImage) - { - delete[] pMipImage; - } - - if (!pImageData) - { - break; - } - - mipWidth >>= 1; - mipHeight >>= 1; - } - - *a_piEncodingTime_ms = totalEncodingTime; - } - - - // ---------------------------------------------------------------------------------------------------- - // - -} diff --git a/thirdparty/etc2comp/Etc.h b/thirdparty/etc2comp/Etc.h deleted file mode 100644 index 439388d649..0000000000 --- a/thirdparty/etc2comp/Etc.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcConfig.h" -#include "EtcImage.h" -#include "EtcColor.h" -#include "EtcErrorMetric.h" -#include <memory> - -#define ETCCOMP_MIN_EFFORT_LEVEL (0.0f) -#define ETCCOMP_DEFAULT_EFFORT_LEVEL (40.0f) -#define ETCCOMP_MAX_EFFORT_LEVEL (100.0f) - -namespace Etc -{ - class Block4x4EncodingBits; - - struct RawImage - { - int uiExtendedWidth; - int uiExtendedHeight; - unsigned int uiEncodingBitsBytes; - std::shared_ptr<unsigned char> paucEncodingBits; - }; - - - - // C-style inteface to the encoder - void Encode(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uimaxJobs, - unsigned char **a_ppaucEncodingBits, - unsigned int *a_puiEncodingBitsBytes, - unsigned int *a_puiExtendedWidth, - unsigned int *a_puiExtendedHeight, - int *a_piEncodingTime_ms, bool a_bVerboseOutput = false); - - void EncodeMipmaps(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uiMaxJobs, - unsigned int a_uiMaxMipmaps, - unsigned int a_uiMipFilterFlags, - RawImage* a_pMipmaps, - int *a_piEncodingTime_ms, bool a_bVerboseOutput = false); - -} diff --git a/thirdparty/etc2comp/EtcBlock4x4.cpp b/thirdparty/etc2comp/EtcBlock4x4.cpp deleted file mode 100644 index 3082fe60db..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4.cpp +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4.cpp - -Implements the state associated with each 4x4 block of pixels in an image - -Source images that are not a multiple of 4x4 are extended to fill the Block4x4 using pixels with an -alpha of NAN - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcColor.h" -#include "EtcImage.h" -#include "EtcColorFloatRGBA.h" -#include "EtcBlock4x4Encoding_RGB8.h" -#include "EtcBlock4x4Encoding_RGBA8.h" -#include "EtcBlock4x4Encoding_RGB8A1.h" -#include "EtcBlock4x4Encoding_R11.h" -#include "EtcBlock4x4Encoding_RG11.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> - -namespace Etc -{ - // ETC pixels are scanned vertically. 
- // this mapping is for when someone wants to scan the ETC pixels horizontally - const unsigned int Block4x4::s_auiPixelOrderHScan[PIXELS] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4::Block4x4(void) - { - m_pimageSource = nullptr; - m_uiSourceH = 0; - m_uiSourceV = 0; - - m_sourcealphamix = SourceAlphaMix::UNKNOWN; - m_boolBorderPixels = false; - m_boolPunchThroughPixels = false; - - m_pencoding = nullptr; - - m_errormetric = ErrorMetric::NUMERIC; - - } - Block4x4::~Block4x4() - { - m_pimageSource = nullptr; - if (m_pencoding) - { - delete m_pencoding; - m_pencoding = nullptr; - } - } - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding from a source image - // [a_uiSourceH,a_uiSourceV] is the location of the block in a_pimageSource - // a_paucEncodingBits is the place to store the final encoding - // a_errormetric is used for finding the best encoding - // - void Block4x4::InitFromSource(Image *a_pimageSource, - unsigned int a_uiSourceH, unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric) - { - - Block4x4(); - - m_pimageSource = a_pimageSource; - m_uiSourceH = a_uiSourceH; - m_uiSourceV = a_uiSourceV; - m_errormetric = a_errormetric; - - SetSourcePixels(); - - // set block encoder function - switch (m_pimageSource->GetFormat()) - { - case Image::Format::ETC1: - m_pencoding = new Block4x4Encoding_ETC1; - break; - - case Image::Format::RGB8: - case Image::Format::SRGB8: - m_pencoding = new Block4x4Encoding_RGB8; - break; - - case Image::Format::RGBA8: - case Image::Format::SRGBA8: - if (a_errormetric == RGBX) - { - m_pencoding = new Block4x4Encoding_RGBA8; - } - else - { - switch (m_sourcealphamix) - { - case SourceAlphaMix::OPAQUE: - m_pencoding = new Block4x4Encoding_RGBA8_Opaque; - break; - - case SourceAlphaMix::TRANSPARENT: - m_pencoding = new Block4x4Encoding_RGBA8_Transparent; - break; - - case SourceAlphaMix::TRANSLUCENT: - m_pencoding = new Block4x4Encoding_RGBA8; - break; - - default: - assert(0); - break; - } - break; - } - break; - - case Image::Format::RGB8A1: - case Image::Format::SRGB8A1: - switch (m_sourcealphamix) - { - case SourceAlphaMix::OPAQUE: - m_pencoding = new Block4x4Encoding_RGB8A1_Opaque; - break; - - case SourceAlphaMix::TRANSPARENT: - m_pencoding = new Block4x4Encoding_RGB8A1_Transparent; - break; - - case SourceAlphaMix::TRANSLUCENT: - if (m_boolPunchThroughPixels) - { - m_pencoding = new Block4x4Encoding_RGB8A1; - } - else - { - m_pencoding = new Block4x4Encoding_RGB8A1_Opaque; - } - break; - - default: - assert(0); - break; - } - break; - - case Image::Format::R11: - case Image::Format::SIGNED_R11: - m_pencoding = new Block4x4Encoding_R11; - break; - case Image::Format::RG11: - case Image::Format::SIGNED_RG11: - m_pencoding = new Block4x4Encoding_RG11; - break; - default: - assert(0); - break; - } - - m_pencoding->InitFromSource(this, m_afrgbaSource, - a_paucEncodingBits, a_errormetric); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization of encoding state from a prior encoding using encoding bits - // [a_uiSourceH,a_uiSourceV] is the location of the block in a_pimageSource - // a_paucEncodingBits is the place to read the prior encoding - // a_imageformat is used to determine how to interpret a_paucEncodingBits 
- // a_errormetric was used for the prior encoding - // - void Block4x4::InitFromEtcEncodingBits(Image::Format a_imageformat, - unsigned int a_uiSourceH, unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits, - Image *a_pimageSource, - ErrorMetric a_errormetric) - { - Block4x4(); - - m_pimageSource = a_pimageSource; - m_uiSourceH = a_uiSourceH; - m_uiSourceV = a_uiSourceV; - m_errormetric = a_errormetric; - - SetSourcePixels(); - - // set block encoder function - switch (a_imageformat) - { - case Image::Format::ETC1: - m_pencoding = new Block4x4Encoding_ETC1; - break; - - case Image::Format::RGB8: - case Image::Format::SRGB8: - m_pencoding = new Block4x4Encoding_RGB8; - break; - - case Image::Format::RGBA8: - case Image::Format::SRGBA8: - m_pencoding = new Block4x4Encoding_RGBA8; - break; - - case Image::Format::RGB8A1: - case Image::Format::SRGB8A1: - m_pencoding = new Block4x4Encoding_RGB8A1; - break; - - case Image::Format::R11: - case Image::Format::SIGNED_R11: - m_pencoding = new Block4x4Encoding_R11; - break; - case Image::Format::RG11: - case Image::Format::SIGNED_RG11: - m_pencoding = new Block4x4Encoding_RG11; - break; - default: - assert(0); - break; - } - - m_pencoding->InitFromEncodingBits(this, a_paucEncodingBits, m_afrgbaSource, - m_pimageSource->GetErrorMetric()); - - } - - // ---------------------------------------------------------------------------------------------------- - // set source pixels from m_pimageSource - // set m_alphamix - // - void Block4x4::SetSourcePixels(void) - { - - Image::Format imageformat = m_pimageSource->GetFormat(); - - // alpha census - unsigned int uiTransparentSourcePixels = 0; - unsigned int uiOpaqueSourcePixels = 0; - - // copy source to consecutive memory locations - // convert from image horizontal scan to block vertical scan - unsigned int uiPixel = 0; - for (unsigned int uiBlockPixelH = 0; uiBlockPixelH < Block4x4::COLUMNS; uiBlockPixelH++) - { - unsigned int uiSourcePixelH = m_uiSourceH + uiBlockPixelH; - - for (unsigned int uiBlockPixelV = 0; uiBlockPixelV < Block4x4::ROWS; uiBlockPixelV++) - { - unsigned int uiSourcePixelV = m_uiSourceV + uiBlockPixelV; - - ColorFloatRGBA *pfrgbaSource = m_pimageSource->GetSourcePixel(uiSourcePixelH, uiSourcePixelV); - - // if pixel extends beyond source image because of block padding - if (pfrgbaSource == nullptr) - { - m_afrgbaSource[uiPixel] = ColorFloatRGBA(0.0f, 0.0f, 0.0f, NAN); // denotes border pixel - m_boolBorderPixels = true; - uiTransparentSourcePixels++; - } - else - { - //get teh current pixel data, and store some of the attributes - //before capping values to fit the encoder type - - m_afrgbaSource[uiPixel] = (*pfrgbaSource).ClampRGBA(); - - if (m_afrgbaSource[uiPixel].fA == 1.0f || m_errormetric == RGBX) - { - m_pimageSource->m_iNumOpaquePixels++; - } - else if (m_afrgbaSource[uiPixel].fA == 0.0f) - { - m_pimageSource->m_iNumTransparentPixels++; - } - else if(m_afrgbaSource[uiPixel].fA > 0.0f && m_afrgbaSource[uiPixel].fA < 1.0f) - { - m_pimageSource->m_iNumTranslucentPixels++; - } - else - { - m_pimageSource->m_numOutOfRangeValues.fA++; - } - - if (m_afrgbaSource[uiPixel].fR != 0.0f) - { - m_pimageSource->m_numColorValues.fR++; - //make sure we are getting a float between 0-1 - if (m_afrgbaSource[uiPixel].fR - 1.0f > 0.0f) - { - m_pimageSource->m_numOutOfRangeValues.fR++; - } - } - - if (m_afrgbaSource[uiPixel].fG != 0.0f) - { - m_pimageSource->m_numColorValues.fG++; - if (m_afrgbaSource[uiPixel].fG - 1.0f > 0.0f) - { - m_pimageSource->m_numOutOfRangeValues.fG++; - } - } - if 
(m_afrgbaSource[uiPixel].fB != 0.0f) - { - m_pimageSource->m_numColorValues.fB++; - if (m_afrgbaSource[uiPixel].fB - 1.0f > 0.0f) - { - m_pimageSource->m_numOutOfRangeValues.fB++; - } - } - // for formats with no alpha, set source alpha to 1 - if (imageformat == Image::Format::ETC1 || - imageformat == Image::Format::RGB8 || - imageformat == Image::Format::SRGB8) - { - m_afrgbaSource[uiPixel].fA = 1.0f; - } - - if (imageformat == Image::Format::R11 || - imageformat == Image::Format::SIGNED_R11) - { - m_afrgbaSource[uiPixel].fA = 1.0f; - m_afrgbaSource[uiPixel].fG = 0.0f; - m_afrgbaSource[uiPixel].fB = 0.0f; - } - - if (imageformat == Image::Format::RG11 || - imageformat == Image::Format::SIGNED_RG11) - { - m_afrgbaSource[uiPixel].fA = 1.0f; - m_afrgbaSource[uiPixel].fB = 0.0f; - } - - - // for RGB8A1, set source alpha to 0.0 or 1.0 - // set punch through flag - if (imageformat == Image::Format::RGB8A1 || - imageformat == Image::Format::SRGB8A1) - { - if (m_afrgbaSource[uiPixel].fA >= 0.5f) - { - m_afrgbaSource[uiPixel].fA = 1.0f; - } - else - { - m_afrgbaSource[uiPixel].fA = 0.0f; - m_boolPunchThroughPixels = true; - } - } - - if (m_afrgbaSource[uiPixel].fA == 1.0f || m_errormetric == RGBX) - { - uiOpaqueSourcePixels++; - } - else if (m_afrgbaSource[uiPixel].fA == 0.0f) - { - uiTransparentSourcePixels++; - } - - } - - uiPixel += 1; - } - } - - if (uiOpaqueSourcePixels == PIXELS) - { - m_sourcealphamix = SourceAlphaMix::OPAQUE; - } - else if (uiTransparentSourcePixels == PIXELS) - { - m_sourcealphamix = SourceAlphaMix::TRANSPARENT; - } - else - { - m_sourcealphamix = SourceAlphaMix::TRANSLUCENT; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // return a name for the encoding mode - // - const char * Block4x4::GetEncodingModeName(void) - { - - switch (m_pencoding->GetMode()) - { - case Block4x4Encoding::MODE_ETC1: - return "ETC1"; - case Block4x4Encoding::MODE_T: - return "T"; - case Block4x4Encoding::MODE_H: - return "H"; - case Block4x4Encoding::MODE_PLANAR: - return "PLANAR"; - default: - return "???"; - } - } - - // ---------------------------------------------------------------------------------------------------- - // - -} diff --git a/thirdparty/etc2comp/EtcBlock4x4.h b/thirdparty/etc2comp/EtcBlock4x4.h deleted file mode 100644 index 0fd30c598d..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcColor.h" -#include "EtcColorFloatRGBA.h" -#include "EtcErrorMetric.h" -#include "EtcImage.h" -#include "EtcBlock4x4Encoding.h" - -namespace Etc -{ - class Block4x4EncodingBits; - - class Block4x4 - { - public: - - static const unsigned int ROWS = 4; - static const unsigned int COLUMNS = 4; - static const unsigned int PIXELS = ROWS * COLUMNS; - - // the alpha mix for a 4x4 block of pixels - enum class SourceAlphaMix - { - UNKNOWN, - // - OPAQUE, // all 1.0 - TRANSPARENT, // all 0.0 or NAN - TRANSLUCENT // not all opaque or transparent - }; - - typedef void (Block4x4::*EncoderFunctionPtr)(void); - - Block4x4(void); - ~Block4x4(); - void InitFromSource(Image *a_pimageSource, - unsigned int a_uiSourceH, - unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric); - - void InitFromEtcEncodingBits(Image::Format a_imageformat, - unsigned int a_uiSourceH, - unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits, - Image *a_pimageSource, - ErrorMetric a_errormetric); - - // return true if final iteration was performed - inline void PerformEncodingIteration(float a_fEffort) - { - m_pencoding->PerformIteration(a_fEffort); - } - - inline void SetEncodingBitsFromEncoding(void) - { - m_pencoding->SetEncodingBits(); - } - - inline unsigned int GetSourceH(void) - { - return m_uiSourceH; - } - - inline unsigned int GetSourceV(void) - { - return m_uiSourceV; - } - - inline float GetError(void) - { - return m_pencoding->GetError(); - } - - static const unsigned int s_auiPixelOrderHScan[PIXELS]; - - inline ColorFloatRGBA * GetDecodedColors(void) - { - return m_pencoding->GetDecodedColors(); - } - - inline float * GetDecodedAlphas(void) - { - return m_pencoding->GetDecodedAlphas(); - } - - inline Block4x4Encoding::Mode GetEncodingMode(void) - { - return m_pencoding->GetMode(); - } - - inline bool GetFlip(void) - { - return m_pencoding->GetFlip(); - } - - inline bool IsDifferential(void) - { - return m_pencoding->IsDifferential(); - } - - inline ColorFloatRGBA * GetSource() - { - return m_afrgbaSource; - } - - inline ErrorMetric GetErrorMetric() - { - return m_errormetric; - } - - const char * GetEncodingModeName(void); - - inline Block4x4Encoding * GetEncoding(void) - { - return m_pencoding; - } - - inline SourceAlphaMix GetSourceAlphaMix(void) - { - return m_sourcealphamix; - } - - inline Image * GetImageSource(void) - { - return m_pimageSource; - } - - inline bool HasBorderPixels(void) - { - return m_boolBorderPixels; - } - - inline bool HasPunchThroughPixels(void) - { - return m_boolPunchThroughPixels; - } - - private: - - void SetSourcePixels(void); - - Image *m_pimageSource; - unsigned int m_uiSourceH; - unsigned int m_uiSourceV; - ErrorMetric m_errormetric; - ColorFloatRGBA m_afrgbaSource[PIXELS]; // vertical scan - - SourceAlphaMix m_sourcealphamix; - bool m_boolBorderPixels; // marked as rgba(NAN, NAN, NAN, NAN) - bool m_boolPunchThroughPixels; // RGB8A1 or SRGB8A1 with any pixels with alpha < 0.5 - - Block4x4Encoding *m_pencoding; - - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding.cpp deleted file mode 100644 index 7a9e68c4cf..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding.cpp - -Block4x4Encoding is the abstract base class for the different encoders. Each encoder targets a -particular file format (e.g. ETC1, RGB8, RGBA8, R11) - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> - -namespace Etc -{ - // ---------------------------------------------------------------------------------------------------- - // - const float Block4x4Encoding::LUMA_WEIGHT = 3.0f; - const float Block4x4Encoding::CHROMA_BLUE_WEIGHT = 0.5f; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding::Block4x4Encoding(void) - { - - m_pblockParent = nullptr; - - m_pafrgbaSource = nullptr; - - m_boolBorderPixels = false; - - m_fError = -1.0f; - - m_mode = MODE_UNKNOWN; - - m_uiEncodingIterations = 0; - m_boolDone = false; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(-1.0f, -1.0f, -1.0f, -1.0f); - m_afDecodedAlphas[uiPixel] = -1.0f; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialize the generic encoding for a 4x4 block - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // init the decoded pixels to -1 to mark them as undefined - // init the error to -1 to mark it as undefined - // - void Block4x4Encoding::Init(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - - m_pblockParent = a_pblockParent; - - m_pafrgbaSource = a_pafrgbaSource; - - m_boolBorderPixels = m_pblockParent->HasBorderPixels(); - - m_fError = -1.0f; - - m_uiEncodingIterations = 0; - - m_errormetric = a_errormetric; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(-1.0f, -1.0f, -1.0f, -1.0f); - m_afDecodedAlphas[uiPixel] = -1.0f; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // calculate the error for the block by summing the pixel errors - // - void Block4x4Encoding::CalcBlockError(void) - { - m_fError = 0.0f; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_fError += CalcPixelError(m_afrgbaDecodedColors[uiPixel], m_afDecodedAlphas[uiPixel], - m_pafrgbaSource[uiPixel]); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // calculate the error between the source pixel and the decoded pixel - // the error amount is base on the error metric - // - float Block4x4Encoding::CalcPixelError(ColorFloatRGBA a_frgbaDecodedColor, float a_fDecodedAlpha, - ColorFloatRGBA a_frgbaSourcePixel) - { - - // if a border pixel - if (isnan(a_frgbaSourcePixel.fA)) - { - return 0.0f; - } - - if (m_errormetric == ErrorMetric::RGBA) - { - assert(a_fDecodedAlpha >= 0.0f); - - float fDRed = 
(a_fDecodedAlpha * a_frgbaDecodedColor.fR) - - (a_frgbaSourcePixel.fA * a_frgbaSourcePixel.fR); - float fDGreen = (a_fDecodedAlpha * a_frgbaDecodedColor.fG) - - (a_frgbaSourcePixel.fA * a_frgbaSourcePixel.fG); - float fDBlue = (a_fDecodedAlpha * a_frgbaDecodedColor.fB) - - (a_frgbaSourcePixel.fA * a_frgbaSourcePixel.fB); - - float fDAlpha = a_fDecodedAlpha - a_frgbaSourcePixel.fA; - - return fDRed*fDRed + fDGreen*fDGreen + fDBlue*fDBlue + fDAlpha*fDAlpha; - } - else if (m_errormetric == ErrorMetric::RGBX) - { - assert(a_fDecodedAlpha >= 0.0f); - - float fDRed = a_frgbaDecodedColor.fR - a_frgbaSourcePixel.fR; - float fDGreen = a_frgbaDecodedColor.fG - a_frgbaSourcePixel.fG; - float fDBlue = a_frgbaDecodedColor.fB - a_frgbaSourcePixel.fB; - float fDAlpha = a_fDecodedAlpha - a_frgbaSourcePixel.fA; - - return fDRed*fDRed + fDGreen*fDGreen + fDBlue*fDBlue + fDAlpha*fDAlpha; - } - else if (m_errormetric == ErrorMetric::REC709) - { - assert(a_fDecodedAlpha >= 0.0f); - - float fLuma1 = a_frgbaSourcePixel.fR*0.2126f + a_frgbaSourcePixel.fG*0.7152f + a_frgbaSourcePixel.fB*0.0722f; - float fChromaR1 = 0.5f * ((a_frgbaSourcePixel.fR - fLuma1) * (1.0f / (1.0f - 0.2126f))); - float fChromaB1 = 0.5f * ((a_frgbaSourcePixel.fB - fLuma1) * (1.0f / (1.0f - 0.0722f))); - - float fLuma2 = a_frgbaDecodedColor.fR*0.2126f + - a_frgbaDecodedColor.fG*0.7152f + - a_frgbaDecodedColor.fB*0.0722f; - float fChromaR2 = 0.5f * ((a_frgbaDecodedColor.fR - fLuma2) * (1.0f / (1.0f - 0.2126f))); - float fChromaB2 = 0.5f * ((a_frgbaDecodedColor.fB - fLuma2) * (1.0f / (1.0f - 0.0722f))); - - float fDeltaL = a_frgbaSourcePixel.fA * fLuma1 - a_fDecodedAlpha * fLuma2; - float fDeltaCr = a_frgbaSourcePixel.fA * fChromaR1 - a_fDecodedAlpha * fChromaR2; - float fDeltaCb = a_frgbaSourcePixel.fA * fChromaB1 - a_fDecodedAlpha * fChromaB2; - - float fDAlpha = a_fDecodedAlpha - a_frgbaSourcePixel.fA; - - // Favor Luma accuracy over Chroma, and Red over Blue - return LUMA_WEIGHT*fDeltaL*fDeltaL + - fDeltaCr*fDeltaCr + - CHROMA_BLUE_WEIGHT*fDeltaCb*fDeltaCb + - fDAlpha*fDAlpha; - #if 0 - float fDRed = a_frgbaDecodedPixel.fR - a_frgbaSourcePixel.fR; - float fDGreen = a_frgbaDecodedPixel.fG - a_frgbaSourcePixel.fG; - float fDBlue = a_frgbaDecodedPixel.fB - a_frgbaSourcePixel.fB; - return 2.0f * 3.0f * fDeltaL * fDeltaL + fDRed*fDRed + fDGreen*fDGreen + fDBlue*fDBlue; -#endif - } - else if (m_errormetric == ErrorMetric::NORMALXYZ) - { - float fDecodedX = 2.0f * a_frgbaDecodedColor.fR - 1.0f; - float fDecodedY = 2.0f * a_frgbaDecodedColor.fG - 1.0f; - float fDecodedZ = 2.0f * a_frgbaDecodedColor.fB - 1.0f; - - float fDecodedLength = sqrtf(fDecodedX*fDecodedX + fDecodedY*fDecodedY + fDecodedZ*fDecodedZ); - - if (fDecodedLength < 0.5f) - { - return 1.0f; - } - else if (fDecodedLength == 0.0f) - { - fDecodedX = 1.0f; - fDecodedY = 0.0f; - fDecodedZ = 0.0f; - } - else - { - fDecodedX /= fDecodedLength; - fDecodedY /= fDecodedLength; - fDecodedZ /= fDecodedLength; - } - - float fSourceX = 2.0f * a_frgbaSourcePixel.fR - 1.0f; - float fSourceY = 2.0f * a_frgbaSourcePixel.fG - 1.0f; - float fSourceZ = 2.0f * a_frgbaSourcePixel.fB - 1.0f; - - float fSourceLength = sqrtf(fSourceX*fSourceX + fSourceY*fSourceY + fSourceZ*fSourceZ); - - if (fSourceLength == 0.0f) - { - fSourceX = 1.0f; - fSourceY = 0.0f; - fSourceZ = 0.0f; - } - else - { - fSourceX /= fSourceLength; - fSourceY /= fSourceLength; - fSourceZ /= fSourceLength; - } - - float fDotProduct = fSourceX*fDecodedX + fSourceY*fDecodedY + fSourceZ*fDecodedZ; - float fNormalizedDotProduct = 1.0f - 0.5f * 
(fDotProduct + 1.0f); - float fDotProductError = fNormalizedDotProduct * fNormalizedDotProduct; - - float fLength2 = fDecodedX*fDecodedX + fDecodedY*fDecodedY + fDecodedZ*fDecodedZ; - float fLength2Error = fabsf(1.0f - fLength2); - - float fDeltaW = a_frgbaDecodedColor.fA - a_frgbaSourcePixel.fA; - float fErrorW = fDeltaW * fDeltaW; - - return fDotProductError + fLength2Error + fErrorW; - } - else // ErrorMetric::NUMERIC - { - assert(a_fDecodedAlpha >= 0.0f); - - float fDX = a_frgbaDecodedColor.fR - a_frgbaSourcePixel.fR; - float fDY = a_frgbaDecodedColor.fG - a_frgbaSourcePixel.fG; - float fDZ = a_frgbaDecodedColor.fB - a_frgbaSourcePixel.fB; - float fDW = a_frgbaDecodedColor.fA - a_frgbaSourcePixel.fA; - - return fDX*fDX + fDY*fDY + fDZ*fDZ + fDW*fDW; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc - diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding.h b/thirdparty/etc2comp/EtcBlock4x4Encoding.h deleted file mode 100644 index c14c3b8616..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcColorFloatRGBA.h" - -#include "EtcErrorMetric.h" - -#include <assert.h> -#include <float.h> - -namespace Etc -{ - class Block4x4; - - // abstract base class for specific encodings - class Block4x4Encoding - { - public: - - static const unsigned int ROWS = 4; - static const unsigned int COLUMNS = 4; - static const unsigned int PIXELS = ROWS * COLUMNS; - static const float LUMA_WEIGHT; - static const float CHROMA_BLUE_WEIGHT; - - typedef enum - { - MODE_UNKNOWN, - // - MODE_ETC1, - MODE_T, - MODE_H, - MODE_PLANAR, - MODE_R11, - MODE_RG11, - // - MODES - } Mode; - - Block4x4Encoding(void); - //virtual ~Block4x4Encoding(void) =0; - virtual ~Block4x4Encoding(void) {} - virtual void InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) = 0; - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - - ErrorMetric a_errormetric) = 0; - - // perform an iteration of the encoding - // the first iteration must generate a complete, valid (if poor) encoding - virtual void PerformIteration(float a_fEffort) = 0; - - void CalcBlockError(void); - - inline float GetError(void) - { - assert(m_fError >= 0.0f); - - return m_fError; - } - - inline ColorFloatRGBA * GetDecodedColors(void) - { - return m_afrgbaDecodedColors; - } - - inline float * GetDecodedAlphas(void) - { - return m_afDecodedAlphas; - } - - virtual void SetEncodingBits(void) = 0; - - virtual bool GetFlip(void) = 0; - - virtual bool IsDifferential(void) = 0; - - virtual bool HasSeverelyBentDifferentialColors(void) const = 0; - - inline Mode GetMode(void) - { - return m_mode; - } - - inline bool IsDone(void) - { - return 
m_boolDone; - } - - inline void SetDoneIfPerfect() - { - if (GetError() == 0.0f) - { - m_boolDone = true; - } - } - - float CalcPixelError(ColorFloatRGBA a_frgbaDecodedColor, float a_fDecodedAlpha, - ColorFloatRGBA a_frgbaSourcePixel); - - protected: - - void Init(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - - ErrorMetric a_errormetric); - - Block4x4 *m_pblockParent; - ColorFloatRGBA *m_pafrgbaSource; - - bool m_boolBorderPixels; // if block has any border pixels - - ColorFloatRGBA m_afrgbaDecodedColors[PIXELS]; // decoded RGB components, ignore Alpha - float m_afDecodedAlphas[PIXELS]; // decoded alpha component - float m_fError; // error for RGBA relative to m_pafrgbaSource - - // intermediate encoding - Mode m_mode; - - unsigned int m_uiEncodingIterations; - bool m_boolDone; // all iterations have been done - ErrorMetric m_errormetric; - - private: - - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4EncodingBits.h b/thirdparty/etc2comp/EtcBlock4x4EncodingBits.h deleted file mode 100644 index 4065700379..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4EncodingBits.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include <assert.h> - -namespace Etc -{ - - // ################################################################################ - // Block4x4EncodingBits - // Base class for Block4x4EncodingBits_XXXX - // ################################################################################ - - class Block4x4EncodingBits - { - public: - - enum class Format - { - UNKNOWN, - // - RGB8, - RGBA8, - R11, - RG11, - RGB8A1, - // - FORMATS - }; - - static unsigned int GetBytesPerBlock(Format a_format) - { - switch (a_format) - { - case Format::RGB8: - case Format::R11: - case Format::RGB8A1: - return 8; - break; - - case Format::RGBA8: - case Format::RG11: - return 16; - break; - - default: - return 0; - break; - } - - } - - }; - - // ################################################################################ - // Block4x4EncodingBits_RGB8 - // Encoding bits for the RGB portion of ETC1, RGB8, RGB8A1 and RGBA8 - // ################################################################################ - - class Block4x4EncodingBits_RGB8 - { - public: - - static const unsigned int BYTES_PER_BLOCK = 8; - - inline Block4x4EncodingBits_RGB8(void) - { - assert(sizeof(Block4x4EncodingBits_RGB8) == BYTES_PER_BLOCK); - - for (unsigned int uiByte = 0; uiByte < BYTES_PER_BLOCK; uiByte++) - { - auc[uiByte] = 0; - } - - } - - typedef struct - { - unsigned red2 : 4; - unsigned red1 : 4; - // - unsigned green2 : 4; - unsigned green1 : 4; - // - unsigned blue2 : 4; - unsigned blue1 : 4; - // - unsigned flip : 1; - unsigned diff : 1; - unsigned cw2 : 3; - unsigned cw1 : 3; - // - unsigned int selectors; - } Individual; - - typedef struct - { - signed dred2 : 3; - unsigned red1 : 5; - // - signed dgreen2 : 3; - unsigned green1 : 5; - // - signed dblue2 : 3; - unsigned 
blue1 : 5; - // - unsigned flip : 1; - unsigned diff : 1; - unsigned cw2 : 3; - unsigned cw1 : 3; - // - unsigned int selectors; - } Differential; - - typedef struct - { - unsigned red1b : 2; - unsigned detect2 : 1; - unsigned red1a : 2; - unsigned detect1 : 3; - // - unsigned blue1 : 4; - unsigned green1 : 4; - // - unsigned green2 : 4; - unsigned red2 : 4; - // - unsigned db : 1; - unsigned diff : 1; - unsigned da : 2; - unsigned blue2 : 4; - // - unsigned int selectors; - } T; - - typedef struct - { - unsigned green1a : 3; - unsigned red1 : 4; - unsigned detect1 : 1; - // - unsigned blue1b : 2; - unsigned detect3 : 1; - unsigned blue1a : 1; - unsigned green1b : 1; - unsigned detect2 : 3; - // - unsigned green2a : 3; - unsigned red2 : 4; - unsigned blue1c : 1; - // - unsigned db : 1; - unsigned diff : 1; - unsigned da : 1; - unsigned blue2 : 4; - unsigned green2b : 1; - // - unsigned int selectors; - } H; - - typedef struct - { - unsigned originGreen1 : 1; - unsigned originRed : 6; - unsigned detect1 : 1; - // - unsigned originBlue1 : 1; - unsigned originGreen2 : 6; - unsigned detect2 : 1; - // - unsigned originBlue3 : 2; - unsigned detect4 : 1; - unsigned originBlue2 : 2; - unsigned detect3 : 3; - // - unsigned horizRed2 : 1; - unsigned diff : 1; - unsigned horizRed1 : 5; - unsigned originBlue4 : 1; - // - unsigned horizBlue1: 1; - unsigned horizGreen : 7; - // - unsigned vertRed1 : 3; - unsigned horizBlue2 : 5; - // - unsigned vertGreen1 : 5; - unsigned vertRed2 : 3; - // - unsigned vertBlue : 6; - unsigned vertGreen2 : 2; - } Planar; - - union - { - unsigned char auc[BYTES_PER_BLOCK]; - unsigned long int ul; - Individual individual; - Differential differential; - T t; - H h; - Planar planar; - }; - - }; - - // ################################################################################ - // Block4x4EncodingBits_A8 - // Encoding bits for the A portion of RGBA8 - // ################################################################################ - - class Block4x4EncodingBits_A8 - { - public: - - static const unsigned int BYTES_PER_BLOCK = 8; - static const unsigned int SELECTOR_BYTES = 6; - - typedef struct - { - unsigned base : 8; - unsigned table : 4; - unsigned multiplier : 4; - unsigned selectors0 : 8; - unsigned selectors1 : 8; - unsigned selectors2 : 8; - unsigned selectors3 : 8; - unsigned selectors4 : 8; - unsigned selectors5 : 8; - } Data; - - Data data; - - }; - - // ################################################################################ - // Block4x4EncodingBits_R11 - // Encoding bits for the R portion of R11 - // ################################################################################ - - class Block4x4EncodingBits_R11 - { - public: - - static const unsigned int BYTES_PER_BLOCK = 8; - static const unsigned int SELECTOR_BYTES = 6; - - typedef struct - { - unsigned base : 8; - unsigned table : 4; - unsigned multiplier : 4; - unsigned selectors0 : 8; - unsigned selectors1 : 8; - unsigned selectors2 : 8; - unsigned selectors3 : 8; - unsigned selectors4 : 8; - unsigned selectors5 : 8; - } Data; - - Data data; - - }; - - class Block4x4EncodingBits_RG11 - { - public: - - static const unsigned int BYTES_PER_BLOCK = 16; - static const unsigned int SELECTOR_BYTES = 12; - - typedef struct - { - //Red portion - unsigned baseR : 8; - unsigned tableIndexR : 4; - unsigned multiplierR : 4; - unsigned selectorsR0 : 8; - unsigned selectorsR1 : 8; - unsigned selectorsR2 : 8; - unsigned selectorsR3 : 8; - unsigned selectorsR4 : 8; - unsigned selectorsR5 : 8; - //Green 
portion - unsigned baseG : 8; - unsigned tableIndexG : 4; - unsigned multiplierG : 4; - unsigned selectorsG0 : 8; - unsigned selectorsG1 : 8; - unsigned selectorsG2 : 8; - unsigned selectorsG3 : 8; - unsigned selectorsG4 : 8; - unsigned selectorsG5 : 8; - } Data; - - Data data; - - }; - -} diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.cpp deleted file mode 100644 index a27f74c0d5..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.cpp +++ /dev/null @@ -1,1281 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_ETC1.cpp - -Block4x4Encoding_ETC1 is the encoder to use when targetting file format ETC1. This encoder is also -used for the ETC1 subset of file format RGB8, RGBA8 and RGB8A1 - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_ETC1.h" - -#include "EtcBlock4x4.h" -#include "EtcBlock4x4EncodingBits.h" -#include "EtcDifferentialTrys.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <float.h> -#include <limits> - -namespace Etc -{ - - // pixel processing order if the flip bit = 0 (horizontal split) - const unsigned int Block4x4Encoding_ETC1::s_auiPixelOrderFlip0[PIXELS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - - // pixel processing order if the flip bit = 1 (vertical split) - const unsigned int Block4x4Encoding_ETC1::s_auiPixelOrderFlip1[PIXELS] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; - - // pixel processing order for horizontal scan (ETC normally does a vertical scan) - const unsigned int Block4x4Encoding_ETC1::s_auiPixelOrderHScan[PIXELS] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; - - // pixel indices for different block halves - const unsigned int Block4x4Encoding_ETC1::s_auiLeftPixelMapping[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; - const unsigned int Block4x4Encoding_ETC1::s_auiRightPixelMapping[8] = { 8, 9, 10, 11, 12, 13, 14, 15 }; - const unsigned int Block4x4Encoding_ETC1::s_auiTopPixelMapping[8] = { 0, 1, 4, 5, 8, 9, 12, 13 }; - const unsigned int Block4x4Encoding_ETC1::s_auiBottomPixelMapping[8] = { 2, 3, 6, 7, 10, 11, 14, 15 }; - - // CW ranges that the ETC1 decoders use - // CW is basically a contrast for the different selector bits, since these values are offsets to the base color - // the first axis in the array is indexed by the CW in the encoding bits - // the second axis in the array is indexed by the selector bits - float Block4x4Encoding_ETC1::s_aafCwTable[CW_RANGES][SELECTORS] = - { - { 2.0f / 255.0f, 8.0f / 255.0f, -2.0f / 255.0f, -8.0f / 255.0f }, - { 5.0f / 255.0f, 17.0f / 255.0f, -5.0f / 255.0f, -17.0f / 255.0f }, - { 9.0f / 255.0f, 29.0f / 255.0f, -9.0f / 255.0f, -29.0f / 255.0f }, - { 13.0f / 255.0f, 42.0f / 255.0f, -13.0f / 255.0f, -42.0f / 255.0f }, - { 18.0f / 255.0f, 60.0f / 255.0f, -18.0f / 255.0f, -60.0f / 255.0f }, - { 24.0f / 255.0f, 80.0f / 255.0f, -24.0f / 255.0f, -80.0f / 255.0f }, - { 33.0f / 255.0f, 
106.0f / 255.0f, -33.0f / 255.0f, -106.0f / 255.0f }, - { 47.0f / 255.0f, 183.0f / 255.0f, -47.0f / 255.0f, -183.0f / 255.0f } - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_ETC1::Block4x4Encoding_ETC1(void) - { - m_mode = MODE_ETC1; - m_boolDiff = false; - m_boolFlip = false; - m_frgbaColor1 = ColorFloatRGBA(); - m_frgbaColor2 = ColorFloatRGBA(); - m_uiCW1 = 0; - m_uiCW2 = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = 0; - m_afDecodedAlphas[uiPixel] = 1.0f; - } - - m_boolMostLikelyFlip = false; - - m_fError = -1.0f; - - m_fError1 = -1.0f; - m_fError2 = -1.0f; - m_boolSeverelyBentDifferentialColors = false; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afDecodedAlphas[uiPixel] = 1.0f; - } - - } - - Block4x4Encoding_ETC1::~Block4x4Encoding_ETC1(void) {} - - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_ETC1::InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) - { - - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afDecodedAlphas[uiPixel] = 1.0f; - } - - m_fError = -1.0f; - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_ETC1::InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric); - m_fError = -1.0f; - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - m_mode = MODE_ETC1; - m_boolDiff = m_pencodingbitsRGB8->individual.diff; - m_boolFlip = m_pencodingbitsRGB8->individual.flip; - if (m_boolDiff) - { - int iR2 = (int)(m_pencodingbitsRGB8->differential.red1 + m_pencodingbitsRGB8->differential.dred2); - if (iR2 < 0) - { - iR2 = 0; - } - else if (iR2 > 31) - { - iR2 = 31; - } - - int iG2 = (int)(m_pencodingbitsRGB8->differential.green1 + m_pencodingbitsRGB8->differential.dgreen2); - if (iG2 < 0) - { - iG2 = 0; - } - else if (iG2 > 31) - { - iG2 = 31; - } - - int iB2 = (int)(m_pencodingbitsRGB8->differential.blue1 + m_pencodingbitsRGB8->differential.dblue2); - if (iB2 < 0) - { - iB2 = 0; - } - else if (iB2 > 31) - { - iB2 = 31; - } - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5(m_pencodingbitsRGB8->differential.red1, m_pencodingbitsRGB8->differential.green1, m_pencodingbitsRGB8->differential.blue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iR2, (unsigned 
char)iG2, (unsigned char)iB2); - - } - else - { - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(m_pencodingbitsRGB8->individual.red1, m_pencodingbitsRGB8->individual.green1, m_pencodingbitsRGB8->individual.blue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(m_pencodingbitsRGB8->individual.red2, m_pencodingbitsRGB8->individual.green2, m_pencodingbitsRGB8->individual.blue2); - } - - m_uiCW1 = m_pencodingbitsRGB8->individual.cw1; - m_uiCW2 = m_pencodingbitsRGB8->individual.cw2; - - InitFromEncodingBits_Selectors(); - - Decode(); - - CalcBlockError(); - } - - // ---------------------------------------------------------------------------------------------------- - // init the selectors from a prior encoding - // - void Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(void) - { - - unsigned char *paucSelectors = (unsigned char *)&m_pencodingbitsRGB8->individual.selectors; - - for (unsigned int iPixel = 0; iPixel < PIXELS; iPixel++) - { - unsigned int uiByteMSB = (unsigned int)(1 - (iPixel / 8)); - unsigned int uiByteLSB = (unsigned int)(3 - (iPixel / 8)); - unsigned int uiShift = (unsigned int)(iPixel & 7); - - unsigned int uiSelectorMSB = (unsigned int)((paucSelectors[uiByteMSB] >> uiShift) & 1); - unsigned int uiSelectorLSB = (unsigned int)((paucSelectors[uiByteLSB] >> uiShift) & 1); - - m_auiSelectors[iPixel] = (uiSelectorMSB << 1) + uiSelectorLSB; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_ETC1::PerformIteration(float a_fEffort) - { - assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - PerformFirstIteration(); - break; - - case 1: - TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - TryIndividual(m_boolMostLikelyFlip, 1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 3: - TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 4: - TryIndividual(!m_boolMostLikelyFlip, 1); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 5: - TryDegenerates1(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 6: - TryDegenerates2(); - if (a_fEffort <= 89.5f) - { - m_boolDone = true; - } - break; - - case 7: - TryDegenerates3(); - if (a_fEffort <= 99.5f) - { - m_boolDone = true; - } - break; - - case 8: - TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // find best initial encoding to ensure block has a valid encoding - // - void Block4x4Encoding_ETC1::PerformFirstIteration(void) - { - CalculateMostLikelyFlip(); - - m_fError = FLT_MAX; - - TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - - TryIndividual(m_boolMostLikelyFlip, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - TryIndividual(!m_boolMostLikelyFlip, 0); - - } - - // 
---------------------------------------------------------------------------------------------------- - // algorithm: - // create a source average color for the Left, Right, Top and Bottom halves using the 8 pixels in each half - // note: the "gray line" is the line of equal delta RGB that goes thru the average color - // for each half: - // see how close each of the 8 pixels are to the "gray line" that goes thru the source average color - // create an error value that is the sum of the distances from the gray line - // h_error is the sum of Left and Right errors - // v_error is the sum of Top and Bottom errors - // - void Block4x4Encoding_ETC1::CalculateMostLikelyFlip(void) - { - static const bool DEBUG_PRINT = false; - - CalculateSourceAverages(); - - float fLeftGrayErrorSum = 0.0f; - float fRightGrayErrorSum = 0.0f; - float fTopGrayErrorSum = 0.0f; - float fBottomGrayErrorSum = 0.0f; - - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ColorFloatRGBA *pfrgbaLeft = &m_pafrgbaSource[uiPixel]; - ColorFloatRGBA *pfrgbaRight = &m_pafrgbaSource[uiPixel + 8]; - ColorFloatRGBA *pfrgbaTop = &m_pafrgbaSource[s_auiTopPixelMapping[uiPixel]]; - ColorFloatRGBA *pfrgbaBottom = &m_pafrgbaSource[s_auiBottomPixelMapping[uiPixel]]; - - float fLeftGrayError = CalcGrayDistance2(*pfrgbaLeft, m_frgbaSourceAverageLeft); - float fRightGrayError = CalcGrayDistance2(*pfrgbaRight, m_frgbaSourceAverageRight); - float fTopGrayError = CalcGrayDistance2(*pfrgbaTop, m_frgbaSourceAverageTop); - float fBottomGrayError = CalcGrayDistance2(*pfrgbaBottom, m_frgbaSourceAverageBottom); - - fLeftGrayErrorSum += fLeftGrayError; - fRightGrayErrorSum += fRightGrayError; - fTopGrayErrorSum += fTopGrayError; - fBottomGrayErrorSum += fBottomGrayError; - } - - if (DEBUG_PRINT) - { - printf("\n%.2f %.2f\n", fLeftGrayErrorSum + fRightGrayErrorSum, fTopGrayErrorSum + fBottomGrayErrorSum); - } - - m_boolMostLikelyFlip = (fTopGrayErrorSum + fBottomGrayErrorSum) < (fLeftGrayErrorSum + fRightGrayErrorSum); - - } - - // ---------------------------------------------------------------------------------------------------- - // calculate source pixel averages for each 2x2 quadrant in a 4x4 block - // these are used to determine the averages for each of the 4 different halves (left, right, top, bottom) - // ignore pixels that have alpha == NAN (these are border pixels outside of the source image) - // weight the averages based on a pixel's alpha - // - void Block4x4Encoding_ETC1::CalculateSourceAverages(void) - { - static const bool DEBUG_PRINT = false; - - bool boolRGBX = m_pblockParent->GetImageSource()->GetErrorMetric() == ErrorMetric::RGBX; - - if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE || boolRGBX) - { - ColorFloatRGBA frgbaSumUL = m_pafrgbaSource[0] + m_pafrgbaSource[1] + m_pafrgbaSource[4] + m_pafrgbaSource[5]; - ColorFloatRGBA frgbaSumLL = m_pafrgbaSource[2] + m_pafrgbaSource[3] + m_pafrgbaSource[6] + m_pafrgbaSource[7]; - ColorFloatRGBA frgbaSumUR = m_pafrgbaSource[8] + m_pafrgbaSource[9] + m_pafrgbaSource[12] + m_pafrgbaSource[13]; - ColorFloatRGBA frgbaSumLR = m_pafrgbaSource[10] + m_pafrgbaSource[11] + m_pafrgbaSource[14] + m_pafrgbaSource[15]; - - m_frgbaSourceAverageLeft = (frgbaSumUL + frgbaSumLL) * 0.125f; - m_frgbaSourceAverageRight = (frgbaSumUR + frgbaSumLR) * 0.125f; - m_frgbaSourceAverageTop = (frgbaSumUL + frgbaSumUR) * 0.125f; - m_frgbaSourceAverageBottom = (frgbaSumLL + frgbaSumLR) * 0.125f; - } - else - { - float afSourceAlpha[PIXELS]; - - // treat alpha NAN as 0.0f - for 
(unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - afSourceAlpha[uiPixel] = isnan(m_pafrgbaSource[uiPixel].fA) ? - 0.0f : - m_pafrgbaSource[uiPixel].fA; - } - - ColorFloatRGBA afrgbaAlphaWeightedSource[PIXELS]; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - afrgbaAlphaWeightedSource[uiPixel] = m_pafrgbaSource[uiPixel] * afSourceAlpha[uiPixel]; - } - - ColorFloatRGBA frgbaSumUL = afrgbaAlphaWeightedSource[0] + - afrgbaAlphaWeightedSource[1] + - afrgbaAlphaWeightedSource[4] + - afrgbaAlphaWeightedSource[5]; - - ColorFloatRGBA frgbaSumLL = afrgbaAlphaWeightedSource[2] + - afrgbaAlphaWeightedSource[3] + - afrgbaAlphaWeightedSource[6] + - afrgbaAlphaWeightedSource[7]; - - ColorFloatRGBA frgbaSumUR = afrgbaAlphaWeightedSource[8] + - afrgbaAlphaWeightedSource[9] + - afrgbaAlphaWeightedSource[12] + - afrgbaAlphaWeightedSource[13]; - - ColorFloatRGBA frgbaSumLR = afrgbaAlphaWeightedSource[10] + - afrgbaAlphaWeightedSource[11] + - afrgbaAlphaWeightedSource[14] + - afrgbaAlphaWeightedSource[15]; - - float fWeightSumUL = afSourceAlpha[0] + - afSourceAlpha[1] + - afSourceAlpha[4] + - afSourceAlpha[5]; - - float fWeightSumLL = afSourceAlpha[2] + - afSourceAlpha[3] + - afSourceAlpha[6] + - afSourceAlpha[7]; - - float fWeightSumUR = afSourceAlpha[8] + - afSourceAlpha[9] + - afSourceAlpha[12] + - afSourceAlpha[13]; - - float fWeightSumLR = afSourceAlpha[10] + - afSourceAlpha[11] + - afSourceAlpha[14] + - afSourceAlpha[15]; - - ColorFloatRGBA frgbaSumLeft = frgbaSumUL + frgbaSumLL; - ColorFloatRGBA frgbaSumRight = frgbaSumUR + frgbaSumLR; - ColorFloatRGBA frgbaSumTop = frgbaSumUL + frgbaSumUR; - ColorFloatRGBA frgbaSumBottom = frgbaSumLL + frgbaSumLR; - - float fWeightSumLeft = fWeightSumUL + fWeightSumLL; - float fWeightSumRight = fWeightSumUR + fWeightSumLR; - float fWeightSumTop = fWeightSumUL + fWeightSumUR; - float fWeightSumBottom = fWeightSumLL + fWeightSumLR; - - // check to see if there is at least 1 pixel with non-zero alpha - // completely transparent block should not make it to this code - assert((fWeightSumLeft + fWeightSumRight) > 0.0f); - assert((fWeightSumTop + fWeightSumBottom) > 0.0f); - - if (fWeightSumLeft > 0.0f) - { - m_frgbaSourceAverageLeft = frgbaSumLeft * (1.0f/fWeightSumLeft); - } - if (fWeightSumRight > 0.0f) - { - m_frgbaSourceAverageRight = frgbaSumRight * (1.0f/fWeightSumRight); - } - if (fWeightSumTop > 0.0f) - { - m_frgbaSourceAverageTop = frgbaSumTop * (1.0f/fWeightSumTop); - } - if (fWeightSumBottom > 0.0f) - { - m_frgbaSourceAverageBottom = frgbaSumBottom * (1.0f/fWeightSumBottom); - } - - if (fWeightSumLeft == 0.0f) - { - assert(fWeightSumRight > 0.0f); - m_frgbaSourceAverageLeft = m_frgbaSourceAverageRight; - } - if (fWeightSumRight == 0.0f) - { - assert(fWeightSumLeft > 0.0f); - m_frgbaSourceAverageRight = m_frgbaSourceAverageLeft; - } - if (fWeightSumTop == 0.0f) - { - assert(fWeightSumBottom > 0.0f); - m_frgbaSourceAverageTop = m_frgbaSourceAverageBottom; - } - if (fWeightSumBottom == 0.0f) - { - assert(fWeightSumTop > 0.0f); - m_frgbaSourceAverageBottom = m_frgbaSourceAverageTop; - } - } - - - - if (DEBUG_PRINT) - { - printf("\ntarget: [%.2f,%.2f,%.2f] [%.2f,%.2f,%.2f] [%.2f,%.2f,%.2f] [%.2f,%.2f,%.2f]\n", - m_frgbaSourceAverageLeft.fR, m_frgbaSourceAverageLeft.fG, m_frgbaSourceAverageLeft.fB, - m_frgbaSourceAverageRight.fR, m_frgbaSourceAverageRight.fG, m_frgbaSourceAverageRight.fB, - m_frgbaSourceAverageTop.fR, m_frgbaSourceAverageTop.fG, m_frgbaSourceAverageTop.fB, - m_frgbaSourceAverageBottom.fR, m_frgbaSourceAverageBottom.fG, 
m_frgbaSourceAverageBottom.fB); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try an ETC1 differential mode encoding - // use a_boolFlip to set the encoding F bit - // use a_uiRadius to alter basecolor components in the range[-a_uiRadius:a_uiRadius] - // use a_iGrayOffset1 and a_iGrayOffset2 to offset the basecolor to search for degenerate encodings - // replace the encoding if the encoding error is less than previous encoding - // - void Block4x4Encoding_ETC1::TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2) - { - - ColorFloatRGBA frgbaColor1; - ColorFloatRGBA frgbaColor2; - - const unsigned int *pauiPixelMapping1; - const unsigned int *pauiPixelMapping2; - - if (a_boolFlip) - { - frgbaColor1 = m_frgbaSourceAverageTop; - frgbaColor2 = m_frgbaSourceAverageBottom; - - pauiPixelMapping1 = s_auiTopPixelMapping; - pauiPixelMapping2 = s_auiBottomPixelMapping; - } - else - { - frgbaColor1 = m_frgbaSourceAverageLeft; - frgbaColor2 = m_frgbaSourceAverageRight; - - pauiPixelMapping1 = s_auiLeftPixelMapping; - pauiPixelMapping2 = s_auiRightPixelMapping; - } - - DifferentialTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, - a_uiRadius, a_iGrayOffset1, a_iGrayOffset2); - - Block4x4Encoding_ETC1 encodingTry = *this; - encodingTry.m_boolFlip = a_boolFlip; - - encodingTry.TryDifferentialHalf(&trys.m_half1); - encodingTry.TryDifferentialHalf(&trys.m_half2); - - // find best halves that are within differential range - DifferentialTrys::Try *ptryBest1 = nullptr; - DifferentialTrys::Try *ptryBest2 = nullptr; - encodingTry.m_fError = FLT_MAX; - - // see if the best of each half are in differential range - int iDRed = trys.m_half2.m_ptryBest->m_iRed - trys.m_half1.m_ptryBest->m_iRed; - int iDGreen = trys.m_half2.m_ptryBest->m_iGreen - trys.m_half1.m_ptryBest->m_iGreen; - int iDBlue = trys.m_half2.m_ptryBest->m_iBlue - trys.m_half1.m_ptryBest->m_iBlue; - if (iDRed >= -4 && iDRed <= 3 && iDGreen >= -4 && iDGreen <= 3 && iDBlue >= -4 && iDBlue <= 3) - { - ptryBest1 = trys.m_half1.m_ptryBest; - ptryBest2 = trys.m_half2.m_ptryBest; - encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError; - } - else - { - // else, find the next best halves that are in differential range - for (DifferentialTrys::Try *ptry1 = &trys.m_half1.m_atry[0]; - ptry1 < &trys.m_half1.m_atry[trys.m_half1.m_uiTrys]; - ptry1++) - { - for (DifferentialTrys::Try *ptry2 = &trys.m_half2.m_atry[0]; - ptry2 < &trys.m_half2.m_atry[trys.m_half2.m_uiTrys]; - ptry2++) - { - iDRed = ptry2->m_iRed - ptry1->m_iRed; - bool boolValidRedDelta = iDRed <= 3 && iDRed >= -4; - iDGreen = ptry2->m_iGreen - ptry1->m_iGreen; - bool boolValidGreenDelta = iDGreen <= 3 && iDGreen >= -4; - iDBlue = ptry2->m_iBlue - ptry1->m_iBlue; - bool boolValidBlueDelta = iDBlue <= 3 && iDBlue >= -4; - - if (boolValidRedDelta && boolValidGreenDelta && boolValidBlueDelta) - { - float fError = ptry1->m_fError + ptry2->m_fError; - - if (fError < encodingTry.m_fError) - { - encodingTry.m_fError = fError; - - ptryBest1 = ptry1; - ptryBest2 = ptry2; - } - } - - } - } - assert(encodingTry.m_fError < FLT_MAX); - assert(ptryBest1 != nullptr); - assert(ptryBest2 != nullptr); - } - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = encodingTry.m_boolFlip; - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest1->m_iRed, (unsigned 
char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue); - m_uiCW1 = ptryBest1->m_uiCW; - m_uiCW2 = ptryBest2->m_uiCW; - - for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++) - { - unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder]; - unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder]; - - unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder]; - unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder]; - - m_auiSelectors[uiPixel1] = uiSelector1; - m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder]; - - float fDeltaRGB1 = s_aafCwTable[m_uiCW1][uiSelector1]; - float fDeltaRGB2 = s_aafCwTable[m_uiCW2][uiSelector2]; - - m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB(); - m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB(); - } - - m_fError1 = ptryBest1->m_fError; - m_fError2 = ptryBest2->m_fError; - m_boolSeverelyBentDifferentialColors = trys.m_boolSeverelyBentColors; - m_fError = m_fError1 + m_fError2; - - // sanity check - { - int iRed1 = m_frgbaColor1.IntRed(31.0f); - int iGreen1 = m_frgbaColor1.IntGreen(31.0f); - int iBlue1 = m_frgbaColor1.IntBlue(31.0f); - - int iRed2 = m_frgbaColor2.IntRed(31.0f); - int iGreen2 = m_frgbaColor2.IntGreen(31.0f); - int iBlue2 = m_frgbaColor2.IntBlue(31.0f); - - iDRed = iRed2 - iRed1; - iDGreen = iGreen2 - iGreen1; - iDBlue = iBlue2 - iBlue1; - - assert(iDRed >= -4 && iDRed < 4); - assert(iDGreen >= -4 && iDGreen < 4); - assert(iDBlue >= -4 && iDBlue < 4); - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try an ETC1 differential mode encoding for a half of a 4x4 block - // vary the basecolor components using a radius - // - void Block4x4Encoding_ETC1::TryDifferentialHalf(DifferentialTrys::Half *a_phalf) - { - - a_phalf->m_ptryBest = nullptr; - float fBestTryError = FLT_MAX; - - a_phalf->m_uiTrys = 0; - for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius; - iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius; - iRed++) - { - assert(iRed >= 0 && iRed <= 31); - - for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius; - iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius; - iGreen++) - { - assert(iGreen >= 0 && iGreen <= 31); - - for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius; - iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius; - iBlue++) - { - assert(iBlue >= 0 && iBlue <= 31); - - DifferentialTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys]; - assert(ptry < &a_phalf->m_atry[DifferentialTrys::Half::MAX_TRYS]); - - ptry->m_iRed = iRed; - ptry->m_iGreen = iGreen; - ptry->m_iBlue = iBlue; - ptry->m_fError = FLT_MAX; - ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue); - - // try each CW - for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++) - { - unsigned int auiPixelSelectors[PIXELS / 2]; - ColorFloatRGBA afrgbaDecodedPixels[PIXELS / 2]; - float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - - // pre-compute decoded pixels for each selector - ColorFloatRGBA afrgbaSelectors[SELECTORS]; - assert(SELECTORS == 4); - afrgbaSelectors[0] = (frgbaColor + s_aafCwTable[uiCW][0]).ClampRGB(); - afrgbaSelectors[1] = (frgbaColor + 
s_aafCwTable[uiCW][1]).ClampRGB(); - afrgbaSelectors[2] = (frgbaColor + s_aafCwTable[uiCW][2]).ClampRGB(); - afrgbaSelectors[3] = (frgbaColor + s_aafCwTable[uiCW][3]).ClampRGB(); - - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[a_phalf->m_pauiPixelMapping[uiPixel]]; - ColorFloatRGBA frgbaDecodedPixel; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - frgbaDecodedPixel = afrgbaSelectors[uiSelector]; - - float fPixelError; - - fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[a_phalf->m_pauiPixelMapping[uiPixel]], - *pfrgbaSourcePixel); - - if (fPixelError < afPixelErrors[uiPixel]) - { - auiPixelSelectors[uiPixel] = uiSelector; - afrgbaDecodedPixels[uiPixel] = frgbaDecodedPixel; - afPixelErrors[uiPixel] = fPixelError; - } - - } - } - - // add up all pixel errors - float fCWError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - fCWError += afPixelErrors[uiPixel]; - } - - // if best CW so far - if (fCWError < ptry->m_fError) - { - ptry->m_uiCW = uiCW; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel]; - } - ptry->m_fError = fCWError; - } - - } - - if (ptry->m_fError < fBestTryError) - { - a_phalf->m_ptryBest = ptry; - fBestTryError = ptry->m_fError; - } - - assert(ptry->m_fError < FLT_MAX); - - a_phalf->m_uiTrys++; - } - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try an ETC1 individual mode encoding - // use a_boolFlip to set the encoding F bit - // use a_uiRadius to alter basecolor components in the range[-a_uiRadius:a_uiRadius] - // replace the encoding if the encoding error is less than previous encoding - // - void Block4x4Encoding_ETC1::TryIndividual(bool a_boolFlip, unsigned int a_uiRadius) - { - - ColorFloatRGBA frgbaColor1; - ColorFloatRGBA frgbaColor2; - - const unsigned int *pauiPixelMapping1; - const unsigned int *pauiPixelMapping2; - - if (a_boolFlip) - { - frgbaColor1 = m_frgbaSourceAverageTop; - frgbaColor2 = m_frgbaSourceAverageBottom; - - pauiPixelMapping1 = s_auiTopPixelMapping; - pauiPixelMapping2 = s_auiBottomPixelMapping; - } - else - { - frgbaColor1 = m_frgbaSourceAverageLeft; - frgbaColor2 = m_frgbaSourceAverageRight; - - pauiPixelMapping1 = s_auiLeftPixelMapping; - pauiPixelMapping2 = s_auiRightPixelMapping; - } - - IndividualTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, a_uiRadius); - - Block4x4Encoding_ETC1 encodingTry = *this; - encodingTry.m_boolFlip = a_boolFlip; - - encodingTry.TryIndividualHalf(&trys.m_half1); - encodingTry.TryIndividualHalf(&trys.m_half2); - - // use the best of each half - IndividualTrys::Try *ptryBest1 = trys.m_half1.m_ptryBest; - IndividualTrys::Try *ptryBest2 = trys.m_half2.m_ptryBest; - encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError; - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_ETC1; - m_boolDiff = false; - m_boolFlip = encodingTry.m_boolFlip; - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue); - m_uiCW1 = ptryBest1->m_uiCW; - m_uiCW2 = ptryBest2->m_uiCW; - - for (unsigned int uiPixelOrder = 0; uiPixelOrder < 
PIXELS / 2; uiPixelOrder++) - { - unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder]; - unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder]; - - unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder]; - unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder]; - - m_auiSelectors[uiPixel1] = uiSelector1; - m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder]; - - float fDeltaRGB1 = s_aafCwTable[m_uiCW1][uiSelector1]; - float fDeltaRGB2 = s_aafCwTable[m_uiCW2][uiSelector2]; - - m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB(); - m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB(); - } - - m_fError1 = ptryBest1->m_fError; - m_fError2 = ptryBest2->m_fError; - m_fError = m_fError1 + m_fError2; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try an ETC1 differential mode encoding for a half of a 4x4 block - // vary the basecolor components using a radius - // - void Block4x4Encoding_ETC1::TryIndividualHalf(IndividualTrys::Half *a_phalf) - { - - a_phalf->m_ptryBest = nullptr; - float fBestTryError = FLT_MAX; - - a_phalf->m_uiTrys = 0; - for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius; - iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius; - iRed++) - { - assert(iRed >= 0 && iRed <= 15); - - for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius; - iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius; - iGreen++) - { - assert(iGreen >= 0 && iGreen <= 15); - - for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius; - iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius; - iBlue++) - { - assert(iBlue >= 0 && iBlue <= 15); - - IndividualTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys]; - assert(ptry < &a_phalf->m_atry[IndividualTrys::Half::MAX_TRYS]); - - ptry->m_iRed = iRed; - ptry->m_iGreen = iGreen; - ptry->m_iBlue = iBlue; - ptry->m_fError = FLT_MAX; - ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue); - - // try each CW - for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++) - { - unsigned int auiPixelSelectors[PIXELS / 2]; - ColorFloatRGBA afrgbaDecodedPixels[PIXELS / 2]; - float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - - // pre-compute decoded pixels for each selector - ColorFloatRGBA afrgbaSelectors[SELECTORS]; - assert(SELECTORS == 4); - afrgbaSelectors[0] = (frgbaColor + s_aafCwTable[uiCW][0]).ClampRGB(); - afrgbaSelectors[1] = (frgbaColor + s_aafCwTable[uiCW][1]).ClampRGB(); - afrgbaSelectors[2] = (frgbaColor + s_aafCwTable[uiCW][2]).ClampRGB(); - afrgbaSelectors[3] = (frgbaColor + s_aafCwTable[uiCW][3]).ClampRGB(); - - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[a_phalf->m_pauiPixelMapping[uiPixel]]; - ColorFloatRGBA frgbaDecodedPixel; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - frgbaDecodedPixel = afrgbaSelectors[uiSelector]; - - float fPixelError; - - fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[a_phalf->m_pauiPixelMapping[uiPixel]], - *pfrgbaSourcePixel); - - if (fPixelError < afPixelErrors[uiPixel]) - { - auiPixelSelectors[uiPixel] = uiSelector; - afrgbaDecodedPixels[uiPixel] = frgbaDecodedPixel; - afPixelErrors[uiPixel] = fPixelError; - } - - } - } - - // add up all pixel errors - float fCWError = 0.0f; - for (unsigned int 
uiPixel = 0; uiPixel < 8; uiPixel++) - { - fCWError += afPixelErrors[uiPixel]; - } - - // if best CW so far - if (fCWError < ptry->m_fError) - { - ptry->m_uiCW = uiCW; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel]; - } - ptry->m_fError = fCWError; - } - - } - - if (ptry->m_fError < fBestTryError) - { - a_phalf->m_ptryBest = ptry; - fBestTryError = ptry->m_fError; - } - - assert(ptry->m_fError < FLT_MAX); - - a_phalf->m_uiTrys++; - } - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 1 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_ETC1::TryDegenerates1(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -2, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 2, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 0, 2); - TryDifferential(m_boolMostLikelyFlip, 1, 0, -2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 2 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_ETC1::TryDegenerates2(void) - { - - TryDifferential(!m_boolMostLikelyFlip, 1, -2, 0); - TryDifferential(!m_boolMostLikelyFlip, 1, 2, 0); - TryDifferential(!m_boolMostLikelyFlip, 1, 0, 2); - TryDifferential(!m_boolMostLikelyFlip, 1, 0, -2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 3 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_ETC1::TryDegenerates3(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -2, -2); - TryDifferential(m_boolMostLikelyFlip, 1, -2, 2); - TryDifferential(m_boolMostLikelyFlip, 1, 2, -2); - TryDifferential(m_boolMostLikelyFlip, 1, 2, 2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 4 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_ETC1::TryDegenerates4(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -4, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 4, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 0, 4); - TryDifferential(m_boolMostLikelyFlip, 1, 0, -4); - - } - - // ---------------------------------------------------------------------------------------------------- - // find the best selector for each pixel based on a particular basecolor and CW that have been previously set - // calculate the selectors for each half of the block separately - // set the block error as the sum of each half's error - // - void Block4x4Encoding_ETC1::CalculateSelectors() - { - if 
(m_boolFlip) - { - CalculateHalfOfTheSelectors(0, s_auiTopPixelMapping); - CalculateHalfOfTheSelectors(1, s_auiBottomPixelMapping); - } - else - { - CalculateHalfOfTheSelectors(0, s_auiLeftPixelMapping); - CalculateHalfOfTheSelectors(1, s_auiRightPixelMapping); - } - - m_fError = m_fError1 + m_fError2; - } - - // ---------------------------------------------------------------------------------------------------- - // choose best selectors for half of the block - // calculate the error for half of the block - // - void Block4x4Encoding_ETC1::CalculateHalfOfTheSelectors(unsigned int a_uiHalf, - const unsigned int *pauiPixelMapping) - { - static const bool DEBUG_PRINT = false; - - ColorFloatRGBA *pfrgbaColor = a_uiHalf ? &m_frgbaColor2 : &m_frgbaColor1; - unsigned int *puiCW = a_uiHalf ? &m_uiCW2 : &m_uiCW1; - - float *pfHalfError = a_uiHalf ? &m_fError2 : &m_fError1; - *pfHalfError = FLT_MAX; - - // try each CW - for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++) - { - if (DEBUG_PRINT) - { - printf("\ncw=%u\n", uiCW); - } - - unsigned int auiPixelSelectors[PIXELS / 2]; - ColorFloatRGBA afrgbaDecodedPixels[PIXELS / 2]; - float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - if (DEBUG_PRINT) - { - printf("\tsource [%.2f,%.2f,%.2f]\n", m_pafrgbaSource[pauiPixelMapping[uiPixel]].fR, - m_pafrgbaSource[pauiPixelMapping[uiPixel]].fG, m_pafrgbaSource[pauiPixelMapping[uiPixel]].fB); - } - - ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[pauiPixelMapping[uiPixel]]; - ColorFloatRGBA frgbaDecodedPixel; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - float fDeltaRGB = s_aafCwTable[uiCW][uiSelector]; - - frgbaDecodedPixel = (*pfrgbaColor + fDeltaRGB).ClampRGB(); - - float fPixelError; - - fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[pauiPixelMapping[uiPixel]], - *pfrgbaSourcePixel); - - if (DEBUG_PRINT) - { - printf("\tpixel %u, index %u [%.2f,%.2f,%.2f], error %.2f", uiPixel, uiSelector, - frgbaDecodedPixel.fR, - frgbaDecodedPixel.fG, - frgbaDecodedPixel.fB, - fPixelError); - } - - if (fPixelError < afPixelErrors[uiPixel]) - { - if (DEBUG_PRINT) - { - printf(" *"); - } - - auiPixelSelectors[uiPixel] = uiSelector; - afrgbaDecodedPixels[uiPixel] = frgbaDecodedPixel; - afPixelErrors[uiPixel] = fPixelError; - } - - if (DEBUG_PRINT) - { - printf("\n"); - } - } - } - - // add up all pixel errors - float fCWError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - fCWError += afPixelErrors[uiPixel]; - } - if (DEBUG_PRINT) - { - printf("\terror %.2f\n", fCWError); - } - - // if best CW so far - if (fCWError < *pfHalfError) - { - *pfHalfError = fCWError; - *puiCW = uiCW; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - m_auiSelectors[pauiPixelMapping[uiPixel]] = auiPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[pauiPixelMapping[uiPixel]] = afrgbaDecodedPixels[uiPixel]; - } - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_ETC1::SetEncodingBits(void) - { - assert(m_mode == MODE_ETC1); - - if (m_boolDiff) - { - int iRed1 = m_frgbaColor1.IntRed(31.0f); - int iGreen1 = m_frgbaColor1.IntGreen(31.0f); - int iBlue1 = m_frgbaColor1.IntBlue(31.0f); - - int iRed2 = m_frgbaColor2.IntRed(31.0f); - int iGreen2 = m_frgbaColor2.IntGreen(31.0f); - int iBlue2 = 
m_frgbaColor2.IntBlue(31.0f); - - int iDRed2 = iRed2 - iRed1; - int iDGreen2 = iGreen2 - iGreen1; - int iDBlue2 = iBlue2 - iBlue1; - - assert(iDRed2 >= -4 && iDRed2 < 4); - assert(iDGreen2 >= -4 && iDGreen2 < 4); - assert(iDBlue2 >= -4 && iDBlue2 < 4); - - m_pencodingbitsRGB8->differential.red1 = (unsigned int)iRed1; - m_pencodingbitsRGB8->differential.green1 = (unsigned int)iGreen1; - m_pencodingbitsRGB8->differential.blue1 = (unsigned int)iBlue1; - - m_pencodingbitsRGB8->differential.dred2 = iDRed2; - m_pencodingbitsRGB8->differential.dgreen2 = iDGreen2; - m_pencodingbitsRGB8->differential.dblue2 = iDBlue2; - } - else - { - m_pencodingbitsRGB8->individual.red1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - m_pencodingbitsRGB8->individual.green1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - m_pencodingbitsRGB8->individual.blue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - m_pencodingbitsRGB8->individual.red2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - m_pencodingbitsRGB8->individual.green2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - m_pencodingbitsRGB8->individual.blue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - } - - m_pencodingbitsRGB8->individual.cw1 = m_uiCW1; - m_pencodingbitsRGB8->individual.cw2 = m_uiCW2; - - SetEncodingBits_Selectors(); - - m_pencodingbitsRGB8->individual.diff = (unsigned int)m_boolDiff; - m_pencodingbitsRGB8->individual.flip = (unsigned int)m_boolFlip; - - } - - // ---------------------------------------------------------------------------------------------------- - // set the selectors in the encoding bits - // - void Block4x4Encoding_ETC1::SetEncodingBits_Selectors(void) - { - - m_pencodingbitsRGB8->individual.selectors = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiSelector = m_auiSelectors[uiPixel]; - - // set index msb - m_pencodingbitsRGB8->individual.selectors |= (uiSelector >> 1) << (uiPixel ^ 8); - - // set index lsb - m_pencodingbitsRGB8->individual.selectors |= (uiSelector & 1) << ((16 + uiPixel) ^ 8); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_ETC1::Decode(void) - { - - const unsigned int *pauiPixelOrder = m_boolFlip ? s_auiPixelOrderFlip1 : s_auiPixelOrderFlip0; - - for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS; uiPixelOrder++) - { - ColorFloatRGBA *pfrgbaCenter = uiPixelOrder < 8 ? &m_frgbaColor1 : &m_frgbaColor2; - unsigned int uiCW = uiPixelOrder < 8 ? m_uiCW1 : m_uiCW2; - - unsigned int uiPixel = pauiPixelOrder[uiPixelOrder]; - - float fDelta = s_aafCwTable[uiCW][m_auiSelectors[uiPixel]]; - m_afrgbaDecodedColors[uiPixel] = (*pfrgbaCenter + fDelta).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.h deleted file mode 100644 index c0dc84d5d5..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
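Note on the selector layout used by SetEncodingBits_Selectors and InitFromEncodingBits_Selectors above: each pixel's 2-bit selector is split into two one-bit planes (all MSBs, then all LSBs) inside the 32-bit selector field, and the bit index is XOR-ed with 8 to compensate for the byte order of the packed encoding-bits struct. The sketch below shows the same MSB/LSB plane split and round trip in a plain uint32_t, with hypothetical names and without the byte-order twist; it is illustrative only, not etc2comp code.

```
#include <cassert>
#include <cstdint>

// Illustrative sketch: split 16 two-bit ETC1 selectors into a 16-bit MSB
// plane and a 16-bit LSB plane, then recombine them. The deleted code does
// the same thing directly into a packed struct and additionally XORs the
// bit index with 8 to account for byte ordering.
static uint32_t PackSelectors(const unsigned int auiSelectors[16])
{
    uint16_t uiMsbPlane = 0;
    uint16_t uiLsbPlane = 0;
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        // selectors are 2 bits (0..3)
        uiMsbPlane |= (uint16_t)((auiSelectors[uiPixel] >> 1) << uiPixel);
        uiLsbPlane |= (uint16_t)((auiSelectors[uiPixel] & 1) << uiPixel);
    }
    return ((uint32_t)uiMsbPlane << 16) | uiLsbPlane;
}

static void UnpackSelectors(uint32_t uiPacked, unsigned int auiSelectors[16])
{
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        unsigned int uiMsb = (uiPacked >> (16 + uiPixel)) & 1;
        unsigned int uiLsb = (uiPacked >> uiPixel) & 1;
        auiSelectors[uiPixel] = (uiMsb << 1) | uiLsb;
    }
}

int main()
{
    unsigned int auiIn[16], auiOut[16];
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        auiIn[uiPixel] = uiPixel % 4;
    }
    UnpackSelectors(PackSelectors(auiIn), auiOut);
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        assert(auiIn[uiPixel] == auiOut[uiPixel]);
    }
    return 0;
}
```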
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcBlock4x4Encoding.h" -#include "EtcBlock4x4EncodingBits.h" -#include "EtcDifferentialTrys.h" -#include "EtcIndividualTrys.h" - -namespace Etc -{ - - // base class for Block4x4Encoding_RGB8 - class Block4x4Encoding_ETC1 : public Block4x4Encoding - { - public: - - Block4x4Encoding_ETC1(void); - virtual ~Block4x4Encoding_ETC1(void); - - virtual void InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric); - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - - ErrorMetric a_errormetric); - - virtual void PerformIteration(float a_fEffort); - - inline virtual bool GetFlip(void) - { - return m_boolFlip; - } - - inline virtual bool IsDifferential(void) - { - return m_boolDiff; - } - - virtual void SetEncodingBits(void); - - void Decode(void); - - inline ColorFloatRGBA GetColor1(void) const - { - return m_frgbaColor1; - } - - inline ColorFloatRGBA GetColor2(void) const - { - return m_frgbaColor2; - } - - inline const unsigned int * GetSelectors(void) const - { - return m_auiSelectors; - } - - inline unsigned int GetCW1(void) const - { - return m_uiCW1; - } - - inline unsigned int GetCW2(void) const - { - return m_uiCW2; - } - - inline bool HasSeverelyBentDifferentialColors(void) const - { - return m_boolSeverelyBentDifferentialColors; - } - - protected: - - static const unsigned int s_auiPixelOrderFlip0[PIXELS]; - static const unsigned int s_auiPixelOrderFlip1[PIXELS]; - static const unsigned int s_auiPixelOrderHScan[PIXELS]; - - static const unsigned int s_auiLeftPixelMapping[8]; - static const unsigned int s_auiRightPixelMapping[8]; - static const unsigned int s_auiTopPixelMapping[8]; - static const unsigned int s_auiBottomPixelMapping[8]; - - static const unsigned int SELECTOR_BITS = 2; - static const unsigned int SELECTORS = 1 << SELECTOR_BITS; - - static const unsigned int CW_BITS = 3; - static const unsigned int CW_RANGES = 1 << CW_BITS; - - static float s_aafCwTable[CW_RANGES][SELECTORS]; - static unsigned char s_aucDifferentialCwRange[256]; - - static const int MAX_DIFFERENTIAL = 3; - static const int MIN_DIFFERENTIAL = -4; - - void InitFromEncodingBits_Selectors(void); - - void PerformFirstIteration(void); - void CalculateMostLikelyFlip(void); - - void TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2); - void TryDifferentialHalf(DifferentialTrys::Half *a_phalf); - - void TryIndividual(bool a_boolFlip, unsigned int a_uiRadius); - void TryIndividualHalf(IndividualTrys::Half *a_phalf); - - void TryDegenerates1(void); - void TryDegenerates2(void); - void TryDegenerates3(void); - void TryDegenerates4(void); - - void CalculateSelectors(); - void CalculateHalfOfTheSelectors(unsigned int a_uiHalf, - const unsigned int *pauiPixelMapping); - - // calculate the distance2 of r_frgbaPixel from r_frgbaTarget's gray line - inline float CalcGrayDistance2(ColorFloatRGBA &r_frgbaPixel, - ColorFloatRGBA &r_frgbaTarget) - { - float 
fDeltaGray = ((r_frgbaPixel.fR - r_frgbaTarget.fR) + - (r_frgbaPixel.fG - r_frgbaTarget.fG) + - (r_frgbaPixel.fB - r_frgbaTarget.fB)) / 3.0f; - - ColorFloatRGBA frgbaPointOnGrayLine = (r_frgbaTarget + fDeltaGray).ClampRGB(); - - float fDR = r_frgbaPixel.fR - frgbaPointOnGrayLine.fR; - float fDG = r_frgbaPixel.fG - frgbaPointOnGrayLine.fG; - float fDB = r_frgbaPixel.fB - frgbaPointOnGrayLine.fB; - - return (fDR*fDR) + (fDG*fDG) + (fDB*fDB); - } - - void SetEncodingBits_Selectors(void); - - // intermediate encoding - bool m_boolDiff; - bool m_boolFlip; - ColorFloatRGBA m_frgbaColor1; - ColorFloatRGBA m_frgbaColor2; - unsigned int m_uiCW1; - unsigned int m_uiCW2; - unsigned int m_auiSelectors[PIXELS]; - - // state shared between iterations - ColorFloatRGBA m_frgbaSourceAverageLeft; - ColorFloatRGBA m_frgbaSourceAverageRight; - ColorFloatRGBA m_frgbaSourceAverageTop; - ColorFloatRGBA m_frgbaSourceAverageBottom; - bool m_boolMostLikelyFlip; - - // stats - float m_fError1; // error for Etc1 half 1 - float m_fError2; // error for Etc1 half 2 - bool m_boolSeverelyBentDifferentialColors; // only valid if m_boolDiff; - - // final encoding - Block4x4EncodingBits_RGB8 *m_pencodingbitsRGB8; // or RGB8 portion of Block4x4EncodingBits_RGB8A8 - - private: - - void CalculateSourceAverages(void); - - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.cpp deleted file mode 100644 index 4c012fbbf1..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.cpp +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_R11.cpp - -Block4x4Encoding_R11 is the encoder to use when targetting file format R11 and SR11 (signed R11). 
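The R11 block fields this encoder searches (an 8-bit base, a 4-bit multiplier, a modifier-table index and sixteen 3-bit selectors) reconstruct each texel the way the DecodePixelRed inline in the matching deleted header does: value = base * 8 + 4 + 8 * multiplier * modifier, clamped to the 11-bit range and then normalized, with a multiplier of 0 acting as 1/8. Below is a minimal standalone sketch with hypothetical names, not etc2comp code; the modifier row used in main matches the first row of the modifier table that follows, scaled by 255, and the signed (SR11) path additionally reads the stored base as a signed byte and offsets it by 128 before this decode, as the routines below show.

```
#include <algorithm>
#include <cstdio>

// Illustrative sketch of how a single R11/EAC texel is reconstructed from
// the block fields, mirroring the deleted DecodePixelRed():
//   value11 = base * 8 + 4 + 8 * multiplier * modifier, clamped to [0, 2047],
// with a multiplier of 0 acting as 1/8.
static float DecodeR11Texel(int iBase, int iMultiplier, int iModifier)
{
    float fMultiplier = (iMultiplier == 0) ? 1.0f / 8.0f : (float)iMultiplier;
    float fValue11 = iBase * 8.0f + 4.0f + 8.0f * fMultiplier * (float)iModifier;
    fValue11 = std::min(std::max(fValue11, 0.0f), 2047.0f);
    return fValue11 / 2047.0f; // normalized red in [0, 1]
}

int main()
{
    // First modifier-table row of the deleted encoder, scaled by 255.
    const int aiModifierRow0[8] = { -3, -6, -9, -15, 2, 5, 8, 14 };
    for (int iSelector = 0; iSelector < 8; iSelector++)
    {
        std::printf("selector %d -> %.4f\n", iSelector,
                    DecodeR11Texel(/*base*/ 128, /*multiplier*/ 4, aiModifierRow0[iSelector]));
    }
    return 0;
}
```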
- -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_R11.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <float.h> -#include <limits> - -namespace Etc -{ - - // modifier values to use for R11, SR11, RG11 and SRG11 - float Block4x4Encoding_R11::s_aafModifierTable[MODIFIER_TABLE_ENTRYS][SELECTORS] - { - { -3.0f / 255.0f, -6.0f / 255.0f, -9.0f / 255.0f, -15.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 8.0f / 255.0f, 14.0f / 255.0f }, - { -3.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, -13.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f, 12.0f / 255.0f }, - { -2.0f / 255.0f, -5.0f / 255.0f, -8.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 12.0f / 255.0f }, - { -2.0f / 255.0f, -4.0f / 255.0f, -6.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 12.0f / 255.0f }, - - { -3.0f / 255.0f, -6.0f / 255.0f, -8.0f / 255.0f, -12.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 11.0f / 255.0f }, - { -3.0f / 255.0f, -7.0f / 255.0f, -9.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 8.0f / 255.0f, 10.0f / 255.0f }, - { -4.0f / 255.0f, -7.0f / 255.0f, -8.0f / 255.0f, -11.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f }, - { -3.0f / 255.0f, -5.0f / 255.0f, -8.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f }, - - { -2.0f / 255.0f, -6.0f / 255.0f, -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 9.0f / 255.0f }, - { -2.0f / 255.0f, -5.0f / 255.0f, -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 9.0f / 255.0f }, - { -2.0f / 255.0f, -4.0f / 255.0f, -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 7.0f / 255.0f, 9.0f / 255.0f }, - { -2.0f / 255.0f, -5.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f }, - - { -3.0f / 255.0f, -4.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, 2.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f }, - { -1.0f / 255.0f, -2.0f / 255.0f, -3.0f / 255.0f, -10.0f / 255.0f, 0.0f / 255.0f, 1.0f / 255.0f, 2.0f / 255.0f, 9.0f / 255.0f }, - { -4.0f / 255.0f, -6.0f / 255.0f, -8.0f / 255.0f, -9.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 8.0f / 255.0f }, - { -3.0f / 255.0f, -5.0f / 255.0f, -7.0f / 255.0f, -9.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f, 8.0f / 255.0f } - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_R11::Block4x4Encoding_R11(void) - { - - m_pencodingbitsR11 = nullptr; - - } - - Block4x4Encoding_R11::~Block4x4Encoding_R11(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_R11::InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) - { - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric); - - m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)a_paucEncodingBits; - } - - // 
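The InitFromEncodingBits and SetEncodingBits routines of this file treat the sixteen 3-bit R11 selectors as one 48-bit stream with pixel 0 in the most significant bits, which is where the 45 - 3 * uiPixel shift comes from; the stream is then split across the six selectors0..selectors5 byte fields. A minimal round-trip sketch of that packing follows, with hypothetical names, illustrative only and not etc2comp code.

```
#include <cassert>
#include <cstdint>

// Illustrative sketch: pack 16 three-bit selectors into a single 48-bit
// stream, pixel 0 in the top bits, mirroring the 45 - 3 * pixel shift used
// by the deleted R11/RG11 encoder.
static uint64_t PackR11Selectors(const unsigned int auiSelectors[16])
{
    uint64_t ulliBits = 0;
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        ulliBits |= (uint64_t)(auiSelectors[uiPixel] & 7) << (45 - 3 * uiPixel);
    }
    return ulliBits; // only the low 48 bits are used
}

static unsigned int UnpackR11Selector(uint64_t ulliBits, unsigned int uiPixel)
{
    return (unsigned int)((ulliBits >> (45 - 3 * uiPixel)) & 7);
}

int main()
{
    unsigned int auiSelectors[16];
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        auiSelectors[uiPixel] = uiPixel & 7; // selectors are 3 bits (0..7)
    }
    uint64_t ulliBits = PackR11Selectors(auiSelectors);
    for (unsigned int uiPixel = 0; uiPixel < 16; uiPixel++)
    {
        assert(UnpackR11Selector(ulliBits, uiPixel) == auiSelectors[uiPixel]);
    }
    return 0;
}
```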
---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_R11::InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)a_paucEncodingBits; - - // init RGB portion - Block4x4Encoding_RGB8::InitFromEncodingBits(a_pblockParent, - (unsigned char *)m_pencodingbitsR11, - a_pafrgbaSource, - a_errormetric); - - // init R11 portion - { - m_mode = MODE_R11; - if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_fRedBase = (float)(signed char)m_pencodingbitsR11->data.base; - } - else - { - m_fRedBase = (float)(unsigned char)m_pencodingbitsR11->data.base; - } - m_fRedMultiplier = (float)m_pencodingbitsR11->data.multiplier; - m_uiRedModifierTableIndex = m_pencodingbitsR11->data.table; - - unsigned long long int ulliSelectorBits = 0; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors0 << 40; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors1 << 32; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors2 << 24; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors3 << 16; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors4 << 8; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors5; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiShift = 45 - (3 * uiPixel); - m_auiRedSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (SELECTORS - 1); - } - - // decode the red channel - // calc red error - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - float fDecodedPixelData = 0.0f; - if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::R11 || a_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - fDecodedPixelData = DecodePixelRed(m_fRedBase, m_fRedMultiplier, - m_uiRedModifierTableIndex, - m_auiRedSelectors[uiPixel]); - } - else if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - fDecodedPixelData = DecodePixelRed(m_fRedBase + 128, m_fRedMultiplier, - m_uiRedModifierTableIndex, - m_auiRedSelectors[uiPixel]); - } - else - { - assert(0); - } - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(fDecodedPixelData, 0.0f, 0.0f, 1.0f); - } - CalcBlockError(); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_R11::PerformIteration(float a_fEffort) - { - assert(!m_boolDone); - m_mode = MODE_R11; - - switch (m_uiEncodingIterations) - { - case 0: - m_fError = 
FLT_MAX; - m_fRedBlockError = FLT_MAX; // artificially high value - CalculateR11(8, 0.0f, 0.0f); - m_fError = m_fRedBlockError; - break; - - case 1: - CalculateR11(8, 2.0f, 1.0f); - m_fError = m_fRedBlockError; - if (a_fEffort <= 24.5f) - { - m_boolDone = true; - } - break; - - case 2: - CalculateR11(8, 12.0f, 1.0f); - m_fError = m_fRedBlockError; - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 3: - CalculateR11(7, 6.0f, 1.0f); - m_fError = m_fRedBlockError; - break; - - case 4: - CalculateR11(6, 3.0f, 1.0f); - m_fError = m_fRedBlockError; - break; - - case 5: - CalculateR11(5, 1.0f, 0.0f); - m_fError = m_fRedBlockError; - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // find the best combination of base color, multiplier and selectors - // - // a_uiSelectorsUsed limits the number of selector combinations to try - // a_fBaseRadius limits the range of base colors to try - // a_fMultiplierRadius limits the range of multipliers to try - // - void Block4x4Encoding_R11::CalculateR11(unsigned int a_uiSelectorsUsed, - float a_fBaseRadius, float a_fMultiplierRadius) - { - // maps from virtual (monotonic) selector to ETC selector - static const unsigned int auiVirtualSelectorMap[8] = {3, 2, 1, 0, 4, 5, 6, 7}; - - // find min/max red - float fMinRed = 1.0f; - float fMaxRed = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // ignore border pixels - float fAlpha = m_pafrgbaSource[uiPixel].fA; - if (isnan(fAlpha)) - { - continue; - } - - float fRed = m_pafrgbaSource[uiPixel].fR; - - if (fRed < fMinRed) - { - fMinRed = fRed; - } - if (fRed > fMaxRed) - { - fMaxRed = fRed; - } - } - assert(fMinRed <= fMaxRed); - - float fRedRange = (fMaxRed - fMinRed); - - // try each modifier table entry - for (unsigned int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++) - { - for (unsigned int uiMinVirtualSelector = 0; - uiMinVirtualSelector <= (8- a_uiSelectorsUsed); - uiMinVirtualSelector++) - { - unsigned int uiMaxVirtualSelector = uiMinVirtualSelector + a_uiSelectorsUsed - 1; - - unsigned int uiMinSelector = auiVirtualSelectorMap[uiMinVirtualSelector]; - unsigned int uiMaxSelector = auiVirtualSelectorMap[uiMaxVirtualSelector]; - - float fTableEntryCenter = -s_aafModifierTable[uiTableEntry][uiMinSelector]; - - float fTableEntryRange = s_aafModifierTable[uiTableEntry][uiMaxSelector] - - s_aafModifierTable[uiTableEntry][uiMinSelector]; - - float fCenterRatio = fTableEntryCenter / fTableEntryRange; - - float fCenter = fMinRed + fCenterRatio*fRedRange; - fCenter = roundf(255.0f * fCenter) / 255.0f; - - float fMinBase = fCenter - (a_fBaseRadius / 255.0f); - if (fMinBase < 0.0f) - { - fMinBase = 0.0f; - } - - float fMaxBase = fCenter + (a_fBaseRadius / 255.0f); - if (fMaxBase > 1.0f) - { - fMaxBase = 1.0f; - } - - for (float fBase = fMinBase; fBase <= fMaxBase; fBase += (0.999999f / 255.0f)) - { - float fRangeMultiplier = roundf(fRedRange / fTableEntryRange); - - float fMinMultiplier = fRangeMultiplier - a_fMultiplierRadius; - if (fMinMultiplier < 1.0f) - { - fMinMultiplier = 0.0f; - } - else if (fMinMultiplier > 15.0f) - { - fMinMultiplier = 15.0f; - } - - float fMaxMultiplier = fRangeMultiplier + a_fMultiplierRadius; - if (fMaxMultiplier < 1.0f) - { - fMaxMultiplier = 1.0f; - } - else if (fMaxMultiplier > 15.0f) - { - fMaxMultiplier = 15.0f; - } - - for (float 
fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier += 1.0f) - { - // find best selector for each pixel - unsigned int auiBestSelectors[PIXELS]; - float afBestRedError[PIXELS]; - float afBestPixelRed[PIXELS]; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - float fBestPixelRedError = FLT_MAX; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - float fPixelRed = DecodePixelRed(fBase * 255.0f, fMultiplier, uiTableEntry, uiSelector); - - ColorFloatRGBA frgba(fPixelRed, m_pafrgbaSource[uiPixel].fG,0.0f,1.0f); - - float fPixelRedError = CalcPixelError(frgba, 1.0f, m_pafrgbaSource[uiPixel]); - - if (fPixelRedError < fBestPixelRedError) - { - fBestPixelRedError = fPixelRedError; - auiBestSelectors[uiPixel] = uiSelector; - afBestRedError[uiPixel] = fBestPixelRedError; - afBestPixelRed[uiPixel] = fPixelRed; - } - } - } - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestRedError[uiPixel]; - } - if (fBlockError < m_fRedBlockError) - { - m_fRedBlockError = fBlockError; - - if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - m_fRedBase = 255.0f * fBase; - } - else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_fRedBase = (fBase * 255) - 128; - } - else - { - assert(0); - } - m_fRedMultiplier = fMultiplier; - m_uiRedModifierTableIndex = uiTableEntry; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiRedSelectors[uiPixel] = auiBestSelectors[uiPixel]; - float fBestPixelRed = afBestPixelRed[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(fBestPixelRed, 0.0f, 0.0f, 1.0f); - m_afDecodedAlphas[uiPixel] = 1.0f; - } - } - } - } - - } - } - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_R11::SetEncodingBits(void) - { - if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - m_pencodingbitsR11->data.base = (unsigned char)roundf(m_fRedBase); - } - else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_pencodingbitsR11->data.base = (signed char)roundf(m_fRedBase); - } - else - { - assert(0); - } - m_pencodingbitsR11->data.table = m_uiRedModifierTableIndex; - m_pencodingbitsR11->data.multiplier = (unsigned char)roundf(m_fRedMultiplier); - - unsigned long long int ulliSelectorBits = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiShift = 45 - (3 * uiPixel); - ulliSelectorBits |= ((unsigned long long int)m_auiRedSelectors[uiPixel]) << uiShift; - } - - m_pencodingbitsR11->data.selectors0 = ulliSelectorBits >> 40; - m_pencodingbitsR11->data.selectors1 = ulliSelectorBits >> 32; - m_pencodingbitsR11->data.selectors2 = ulliSelectorBits >> 24; - m_pencodingbitsR11->data.selectors3 = ulliSelectorBits >> 16; - m_pencodingbitsR11->data.selectors4 = ulliSelectorBits >> 8; - m_pencodingbitsR11->data.selectors5 = ulliSelectorBits; - } - - // ---------------------------------------------------------------------------------------------------- - // -} diff 
--git a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.h deleted file mode 100644 index b40c1e0036..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcBlock4x4Encoding_RGB8.h" - -namespace Etc -{ - class Block4x4EncodingBits_R11; - - // ################################################################################ - // Block4x4Encoding_R11 - // ################################################################################ - - class Block4x4Encoding_R11 : public Block4x4Encoding_RGB8 - { - public: - - Block4x4Encoding_R11(void); - virtual ~Block4x4Encoding_R11(void); - - virtual void InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric); - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric); - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - inline float GetRedBase(void) const - { - return m_fRedBase; - } - - inline float GetRedMultiplier(void) const - { - return m_fRedMultiplier; - } - - inline int GetRedTableIndex(void) const - { - return m_uiRedModifierTableIndex; - } - - inline const unsigned int * GetRedSelectors(void) const - { - return m_auiRedSelectors; - } - - protected: - - static const unsigned int MODIFIER_TABLE_ENTRYS = 16; - static const unsigned int SELECTOR_BITS = 3; - static const unsigned int SELECTORS = 1 << SELECTOR_BITS; - - static float s_aafModifierTable[MODIFIER_TABLE_ENTRYS][SELECTORS]; - - void CalculateR11(unsigned int a_uiSelectorsUsed, - float a_fBaseRadius, float a_fMultiplierRadius); - - - - - inline float DecodePixelRed(float a_fBase, float a_fMultiplier, - unsigned int a_uiTableIndex, unsigned int a_uiSelector) - { - float fMultiplier = a_fMultiplier; - if (fMultiplier <= 0.0f) - { - fMultiplier = 1.0f / 8.0f; - } - - float fPixelRed = a_fBase * 8 + 4 + - 8 * fMultiplier*s_aafModifierTable[a_uiTableIndex][a_uiSelector]*255; - fPixelRed /= 2047.0f; - - if (fPixelRed < 0.0f) - { - fPixelRed = 0.0f; - } - else if (fPixelRed > 1.0f) - { - fPixelRed = 1.0f; - } - - return fPixelRed; - } - - Block4x4EncodingBits_R11 *m_pencodingbitsR11; - - float m_fRedBase; - float m_fRedMultiplier; - float m_fRedBlockError; - unsigned int m_uiRedModifierTableIndex; - unsigned int m_auiRedSelectors[PIXELS]; - - - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.cpp deleted file mode 100644 index 417835db51..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.cpp +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Copyright 
2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RG11.cpp - -Block4x4Encoding_RG11 is the encoder to use when targetting file format RG11 and SRG11 (signed RG11). - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RG11.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <float.h> -#include <limits> - -namespace Etc -{ - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RG11::Block4x4Encoding_RG11(void) - { - m_pencodingbitsRG11 = nullptr; - } - - Block4x4Encoding_RG11::~Block4x4Encoding_RG11(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_RG11::InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) - { - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric); - - m_pencodingbitsRG11 = (Block4x4EncodingBits_RG11 *)a_paucEncodingBits; - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RG11::InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - - m_pencodingbitsRG11 = (Block4x4EncodingBits_RG11 *)a_paucEncodingBits; - - // init RGB portion - Block4x4Encoding_RGB8::InitFromEncodingBits(a_pblockParent, - (unsigned char *)m_pencodingbitsRG11, - a_pafrgbaSource, - a_errormetric); - m_fError = 0.0f; - - { - m_mode = MODE_RG11; - if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_fRedBase = (float)(signed char)m_pencodingbitsRG11->data.baseR; - m_fGrnBase = (float)(signed char)m_pencodingbitsRG11->data.baseG; - } - else - { - m_fRedBase = (float)(unsigned char)m_pencodingbitsRG11->data.baseR; - m_fGrnBase = (float)(unsigned char)m_pencodingbitsRG11->data.baseG; - } - m_fRedMultiplier = (float)m_pencodingbitsRG11->data.multiplierR; - m_fGrnMultiplier = (float)m_pencodingbitsRG11->data.multiplierG; - m_uiRedModifierTableIndex = m_pencodingbitsRG11->data.tableIndexR; - m_uiGrnModifierTableIndex = 
m_pencodingbitsRG11->data.tableIndexG; - - unsigned long long int ulliSelectorBitsR = 0; - ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR0 << 40; - ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR1 << 32; - ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR2 << 24; - ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR3 << 16; - ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR4 << 8; - ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR5; - - unsigned long long int ulliSelectorBitsG = 0; - ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG0 << 40; - ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG1 << 32; - ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG2 << 24; - ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG3 << 16; - ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG4 << 8; - ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG5; - - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiShift = 45 - (3 * uiPixel); - m_auiRedSelectors[uiPixel] = (ulliSelectorBitsR >> uiShift) & (SELECTORS - 1); - m_auiGrnSelectors[uiPixel] = (ulliSelectorBitsG >> uiShift) & (SELECTORS - 1); - } - - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - float fRedDecodedData = 0.0f; - float fGrnDecodedData = 0.0f; - if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - fRedDecodedData = DecodePixelRed(m_fRedBase, m_fRedMultiplier, m_uiRedModifierTableIndex, m_auiRedSelectors[uiPixel]); - fGrnDecodedData = DecodePixelRed(m_fGrnBase, m_fGrnMultiplier, m_uiGrnModifierTableIndex, m_auiGrnSelectors[uiPixel]); - } - else if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - fRedDecodedData = DecodePixelRed(m_fRedBase + 128, m_fRedMultiplier, m_uiRedModifierTableIndex, m_auiRedSelectors[uiPixel]); - fGrnDecodedData = DecodePixelRed(m_fGrnBase + 128, m_fGrnMultiplier, m_uiGrnModifierTableIndex, m_auiGrnSelectors[uiPixel]); - } - else - { - assert(0); - } - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(fRedDecodedData, fGrnDecodedData, 0.0f, 1.0f); - } - - } - - CalcBlockError(); - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RG11::PerformIteration(float a_fEffort) - { - assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - m_fError = FLT_MAX; - m_fGrnBlockError = FLT_MAX; // artificially high value - m_fRedBlockError = FLT_MAX; - CalculateR11(8, 0.0f, 0.0f); - CalculateG11(8, 0.0f, 0.0f); - m_fError = (m_fGrnBlockError + m_fRedBlockError); - break; - - case 1: - CalculateR11(8, 2.0f, 1.0f); - CalculateG11(8, 2.0f, 1.0f); - m_fError = (m_fGrnBlockError + m_fRedBlockError); - if (a_fEffort <= 24.5f) - { - m_boolDone = true; - } - break; - - case 2: - CalculateR11(8, 12.0f, 1.0f); - CalculateG11(8, 12.0f, 1.0f); - m_fError = (m_fGrnBlockError + m_fRedBlockError); - if (a_fEffort 
<= 49.5f) - { - m_boolDone = true; - } - break; - - case 3: - CalculateR11(7, 6.0f, 1.0f); - CalculateG11(7, 6.0f, 1.0f); - m_fError = (m_fGrnBlockError + m_fRedBlockError); - break; - - case 4: - CalculateR11(6, 3.0f, 1.0f); - CalculateG11(6, 3.0f, 1.0f); - m_fError = (m_fGrnBlockError + m_fRedBlockError); - break; - - case 5: - CalculateR11(5, 1.0f, 0.0f); - CalculateG11(5, 1.0f, 0.0f); - m_fError = (m_fGrnBlockError + m_fRedBlockError); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // find the best combination of base color, multiplier and selectors - // - // a_uiSelectorsUsed limits the number of selector combinations to try - // a_fBaseRadius limits the range of base colors to try - // a_fMultiplierRadius limits the range of multipliers to try - // - void Block4x4Encoding_RG11::CalculateG11(unsigned int a_uiSelectorsUsed, - float a_fBaseRadius, float a_fMultiplierRadius) - { - // maps from virtual (monotonic) selector to etc selector - static const unsigned int auiVirtualSelectorMap[8] = { 3, 2, 1, 0, 4, 5, 6, 7 }; - - // find min/max Grn - float fMinGrn = 1.0f; - float fMaxGrn = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // ignore border pixels - float fAlpha = m_pafrgbaSource[uiPixel].fA; - if (isnan(fAlpha)) - { - continue; - } - - float fGrn = m_pafrgbaSource[uiPixel].fG; - - if (fGrn < fMinGrn) - { - fMinGrn = fGrn; - } - if (fGrn > fMaxGrn) - { - fMaxGrn = fGrn; - } - } - assert(fMinGrn <= fMaxGrn); - - float fGrnRange = (fMaxGrn - fMinGrn); - - // try each modifier table entry - for (unsigned int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++) - { - for (unsigned int uiMinVirtualSelector = 0; - uiMinVirtualSelector <= (8 - a_uiSelectorsUsed); - uiMinVirtualSelector++) - { - unsigned int uiMaxVirtualSelector = uiMinVirtualSelector + a_uiSelectorsUsed - 1; - - unsigned int uiMinSelector = auiVirtualSelectorMap[uiMinVirtualSelector]; - unsigned int uiMaxSelector = auiVirtualSelectorMap[uiMaxVirtualSelector]; - - float fTableEntryCenter = -s_aafModifierTable[uiTableEntry][uiMinSelector]; - - float fTableEntryRange = s_aafModifierTable[uiTableEntry][uiMaxSelector] - - s_aafModifierTable[uiTableEntry][uiMinSelector]; - - float fCenterRatio = fTableEntryCenter / fTableEntryRange; - - float fCenter = fMinGrn + fCenterRatio*fGrnRange; - fCenter = roundf(255.0f * fCenter) / 255.0f; - - float fMinBase = fCenter - (a_fBaseRadius / 255.0f); - if (fMinBase < 0.0f) - { - fMinBase = 0.0f; - } - - float fMaxBase = fCenter + (a_fBaseRadius / 255.0f); - if (fMaxBase > 1.0f) - { - fMaxBase = 1.0f; - } - - for (float fBase = fMinBase; fBase <= fMaxBase; fBase += (0.999999f / 255.0f)) - { - float fRangeMultiplier = roundf(fGrnRange / fTableEntryRange); - - float fMinMultiplier = fRangeMultiplier - a_fMultiplierRadius; - if (fMinMultiplier < 1.0f) - { - fMinMultiplier = 0.0f; - } - else if (fMinMultiplier > 15.0f) - { - fMinMultiplier = 15.0f; - } - - float fMaxMultiplier = fRangeMultiplier + a_fMultiplierRadius; - if (fMaxMultiplier < 1.0f) - { - fMaxMultiplier = 1.0f; - } - else if (fMaxMultiplier > 15.0f) - { - fMaxMultiplier = 15.0f; - } - - for (float fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier += 1.0f) - { - // find best selector for each pixel - unsigned int auiBestSelectors[PIXELS]; - float afBestGrnError[PIXELS]; - float 
afBestPixelGrn[PIXELS]; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - float fBestPixelGrnError = FLT_MAX; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - //DecodePixelRed is not red channel specific - float fPixelGrn = DecodePixelRed(fBase * 255.0f, fMultiplier, uiTableEntry, uiSelector); - - ColorFloatRGBA frgba(m_pafrgbaSource[uiPixel].fR, fPixelGrn, 0.0f, 1.0f); - - float fPixelGrnError = CalcPixelError(frgba, 1.0f, m_pafrgbaSource[uiPixel]); - - if (fPixelGrnError < fBestPixelGrnError) - { - fBestPixelGrnError = fPixelGrnError; - auiBestSelectors[uiPixel] = uiSelector; - afBestGrnError[uiPixel] = fBestPixelGrnError; - afBestPixelGrn[uiPixel] = fPixelGrn; - } - } - } - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestGrnError[uiPixel]; - } - - if (fBlockError < m_fGrnBlockError) - { - m_fGrnBlockError = fBlockError; - - if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - m_fGrnBase = 255.0f * fBase; - } - else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_fGrnBase = (fBase * 255) - 128; - } - else - { - assert(0); - } - m_fGrnMultiplier = fMultiplier; - m_uiGrnModifierTableIndex = uiTableEntry; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiGrnSelectors[uiPixel] = auiBestSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel].fG = afBestPixelGrn[uiPixel]; - m_afDecodedAlphas[uiPixel] = 1.0f; - } - } - } - } - - } - } - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RG11::SetEncodingBits(void) - { - unsigned long long int ulliSelectorBitsR = 0; - unsigned long long int ulliSelectorBitsG = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiShift = 45 - (3 * uiPixel); - ulliSelectorBitsR |= ((unsigned long long int)m_auiRedSelectors[uiPixel]) << uiShift; - ulliSelectorBitsG |= ((unsigned long long int)m_auiGrnSelectors[uiPixel]) << uiShift; - } - if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - m_pencodingbitsRG11->data.baseR = (unsigned char)roundf(m_fRedBase); - } - else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_pencodingbitsRG11->data.baseR = (signed char)roundf(m_fRedBase); - } - else - { - assert(0); - } - m_pencodingbitsRG11->data.tableIndexR = m_uiRedModifierTableIndex; - m_pencodingbitsRG11->data.multiplierR = (unsigned char)roundf(m_fRedMultiplier); - - m_pencodingbitsRG11->data.selectorsR0 = ulliSelectorBitsR >> 40; - m_pencodingbitsRG11->data.selectorsR1 = ulliSelectorBitsR >> 32; - m_pencodingbitsRG11->data.selectorsR2 = ulliSelectorBitsR >> 24; - m_pencodingbitsRG11->data.selectorsR3 = ulliSelectorBitsR >> 16; - m_pencodingbitsRG11->data.selectorsR4 = ulliSelectorBitsR >> 8; - m_pencodingbitsRG11->data.selectorsR5 = ulliSelectorBitsR; - - if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11) - { - m_pencodingbitsRG11->data.baseG = (unsigned char)roundf(m_fGrnBase); - } - else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11) - { - m_pencodingbitsRG11->data.baseG = (signed char)roundf(m_fGrnBase); - } - else - { - assert(0); - } - m_pencodingbitsRG11->data.tableIndexG = m_uiGrnModifierTableIndex; - m_pencodingbitsRG11->data.multiplierG = (unsigned 
char)roundf(m_fGrnMultiplier); - - m_pencodingbitsRG11->data.selectorsG0 = ulliSelectorBitsG >> 40; - m_pencodingbitsRG11->data.selectorsG1 = ulliSelectorBitsG >> 32; - m_pencodingbitsRG11->data.selectorsG2 = ulliSelectorBitsG >> 24; - m_pencodingbitsRG11->data.selectorsG3 = ulliSelectorBitsG >> 16; - m_pencodingbitsRG11->data.selectorsG4 = ulliSelectorBitsG >> 8; - m_pencodingbitsRG11->data.selectorsG5 = ulliSelectorBitsG; - - } - - // ---------------------------------------------------------------------------------------------------- - // -} diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.h deleted file mode 100644 index d4993b8c5f..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcBlock4x4Encoding_RGB8.h" -#include "EtcBlock4x4Encoding_R11.h" - -namespace Etc -{ - class Block4x4EncodingBits_RG11; - - // ################################################################################ - // Block4x4Encoding_RG11 - // ################################################################################ - - class Block4x4Encoding_RG11 : public Block4x4Encoding_R11 - { - float m_fGrnBase; - float m_fGrnMultiplier; - float m_fGrnBlockError; - unsigned int m_auiGrnSelectors[PIXELS]; - unsigned int m_uiGrnModifierTableIndex; - public: - - Block4x4Encoding_RG11(void); - virtual ~Block4x4Encoding_RG11(void); - - virtual void InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric); - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - - ErrorMetric a_errormetric); - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - Block4x4EncodingBits_RG11 *m_pencodingbitsRG11; - - void CalculateG11(unsigned int a_uiSelectorsUsed, float a_fBaseRadius, float a_fMultiplierRadius); - - inline float GetGrnBase(void) const - { - return m_fGrnBase; - } - - inline float GetGrnMultiplier(void) const - { - return m_fGrnMultiplier; - } - - inline int GetGrnTableIndex(void) const - { - return m_uiGrnModifierTableIndex; - } - - inline const unsigned int * GetGrnSelectors(void) const - { - return m_auiGrnSelectors; - } - - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp deleted file mode 100644 index 5c7ebed788..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp +++ /dev/null @@ -1,1730 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RGB8.cpp - -Block4x4Encoding_RGB8 is the encoder to use for the ETC2 extensions when targetting file format RGB8. -This encoder is also used for the ETC2 subset of file format RGBA8. - -Block4x4Encoding_ETC1 encodes the ETC1 subset of RGB8. - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RGB8.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" -#include "EtcMath.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <float.h> -#include <limits> - -namespace Etc -{ - float Block4x4Encoding_RGB8::s_afTHDistanceTable[TH_DISTANCES] = - { - 3.0f / 255.0f, - 6.0f / 255.0f, - 11.0f / 255.0f, - 16.0f / 255.0f, - 23.0f / 255.0f, - 32.0f / 255.0f, - 41.0f / 255.0f, - 64.0f / 255.0f - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RGB8::Block4x4Encoding_RGB8(void) - { - - m_pencodingbitsRGB8 = nullptr; - - } - - Block4x4Encoding_RGB8::~Block4x4Encoding_RGB8(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RGB8::InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - - // handle ETC1 modes - Block4x4Encoding_ETC1::InitFromEncodingBits(a_pblockParent, - a_paucEncodingBits, a_pafrgbaSource,a_errormetric); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - // detect if there is a T, H or Planar mode present - if (m_pencodingbitsRGB8->differential.diff) - { - int iRed1 = (int)m_pencodingbitsRGB8->differential.red1; - int iDRed2 = m_pencodingbitsRGB8->differential.dred2; - int iRed2 = iRed1 + iDRed2; - - int iGreen1 = (int)m_pencodingbitsRGB8->differential.green1; - int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2; - int iGreen2 = iGreen1 + iDGreen2; - - int iBlue1 = (int)m_pencodingbitsRGB8->differential.blue1; - int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2; - int iBlue2 = iBlue1 + iDBlue2; - - if (iRed2 < 0 || iRed2 > 31) - { - InitFromEncodingBits_T(); - } - else if (iGreen2 < 0 || iGreen2 > 31) - { - InitFromEncodingBits_H(); - } - else if (iBlue2 < 0 || iBlue2 > 31) - { - InitFromEncodingBits_Planar(); - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if T mode is detected - // - void Block4x4Encoding_RGB8::InitFromEncodingBits_T(void) - { - - m_mode = MODE_T; - - 
unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) + - m_pencodingbitsRGB8->t.red1b); - unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1; - unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1; - - unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2; - unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2; - unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db; - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_T(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if H mode is detected - // - void Block4x4Encoding_RGB8::InitFromEncodingBits_H(void) - { - - m_mode = MODE_H; - - unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1; - unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) + - m_pencodingbitsRGB8->h.green1b); - unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) + - (m_pencodingbitsRGB8->h.blue1b << 1) + - m_pencodingbitsRGB8->h.blue1c); - - unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2; - unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) + - m_pencodingbitsRGB8->h.green2b); - unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - // used to determine the LSB of the CW - unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1); - unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1); - if (uiRGB1 >= uiRGB2) - { - m_uiCW1++; - } - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_H(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if Planar mode is detected - // - void Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(void) - { - - m_mode = MODE_PLANAR; - - unsigned char ucOriginRed = m_pencodingbitsRGB8->planar.originRed; - unsigned char ucOriginGreen = (unsigned char)((m_pencodingbitsRGB8->planar.originGreen1 << 6) + - m_pencodingbitsRGB8->planar.originGreen2); - unsigned char ucOriginBlue = (unsigned char)((m_pencodingbitsRGB8->planar.originBlue1 << 5) + - (m_pencodingbitsRGB8->planar.originBlue2 << 3) + - (m_pencodingbitsRGB8->planar.originBlue3 << 1) + - m_pencodingbitsRGB8->planar.originBlue4); - - unsigned char ucHorizRed = (unsigned char)((m_pencodingbitsRGB8->planar.horizRed1 << 1) + - m_pencodingbitsRGB8->planar.horizRed2); - unsigned char ucHorizGreen = m_pencodingbitsRGB8->planar.horizGreen; - unsigned char ucHorizBlue = (unsigned char)((m_pencodingbitsRGB8->planar.horizBlue1 << 5) + - m_pencodingbitsRGB8->planar.horizBlue2); - - unsigned char ucVertRed = (unsigned char)((m_pencodingbitsRGB8->planar.vertRed1 << 3) + - m_pencodingbitsRGB8->planar.vertRed2); - unsigned char ucVertGreen = (unsigned char)((m_pencodingbitsRGB8->planar.vertGreen1 << 2) + - 
m_pencodingbitsRGB8->planar.vertGreen2); - unsigned char ucVertBlue = m_pencodingbitsRGB8->planar.vertBlue; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromR6G7B6(ucOriginRed, ucOriginGreen, ucOriginBlue); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromR6G7B6(ucHorizRed, ucHorizGreen, ucHorizBlue); - m_frgbaColor3 = ColorFloatRGBA::ConvertFromR6G7B6(ucVertRed, ucVertGreen, ucVertBlue); - - DecodePixels_Planar(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGB8::PerformIteration(float a_fEffort) - { - assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - Block4x4Encoding_ETC1::PerformFirstIteration(); - if (m_boolDone) - { - break; - } - TryPlanar(0); - SetDoneIfPerfect(); - if (m_boolDone) - { - break; - } - TryTAndH(0); - break; - - case 1: - Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - Block4x4Encoding_ETC1::TryIndividual(m_boolMostLikelyFlip, 1); - break; - - case 3: - Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 4: - Block4x4Encoding_ETC1::TryIndividual(!m_boolMostLikelyFlip, 1); - break; - - case 5: - TryPlanar(1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 6: - TryTAndH(1); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 7: - Block4x4Encoding_ETC1::TryDegenerates1(); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 8: - Block4x4Encoding_ETC1::TryDegenerates2(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 9: - Block4x4Encoding_ETC1::TryDegenerates3(); - if (a_fEffort <= 89.5f) - { - m_boolDone = true; - } - break; - - case 10: - Block4x4Encoding_ETC1::TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in Planar mode - // save this encoding if it improves the error - // - void Block4x4Encoding_RGB8::TryPlanar(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - encodingTry.CalculatePlanarCornerColors(); - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (a_uiRadius > 0) - { - encodingTry.TwiddlePlanar(); - } - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode or H mode - // save this encoding if it improves the error - // - void Block4x4Encoding_RGB8::TryTAndH(unsigned int a_uiRadius) - { - 
- CalculateBaseColorsForTAndH(); - - TryT(a_uiRadius); - - TryH(a_uiRadius); - - } - - // ---------------------------------------------------------------------------------------------------- - // calculate original values for base colors - // store them in m_frgbaOriginalColor1 and m_frgbaOriginalColor2 - // - void Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(void) - { - - bool boolRGBX = m_pblockParent->GetImageSource()->GetErrorMetric() == ErrorMetric::RGBX; - - ColorFloatRGBA frgbaBlockAverage = (m_frgbaSourceAverageLeft + m_frgbaSourceAverageRight) * 0.5f; - - // find pixel farthest from average gray line - unsigned int uiFarthestPixel = 0; - float fFarthestGrayDistance2 = 0.0f; - unsigned int uiTransparentPixels = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // don't count transparent - if (m_pafrgbaSource[uiPixel].fA == 0.0f && !boolRGBX) - { - uiTransparentPixels++; - } - else - { - float fGrayDistance2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], frgbaBlockAverage); - - if (fGrayDistance2 > fFarthestGrayDistance2) - { - uiFarthestPixel = uiPixel; - fFarthestGrayDistance2 = fGrayDistance2; - } - } - } - // a transparent block should not reach this method - assert(uiTransparentPixels < PIXELS); - - // set the original base colors to: - // half way to the farthest pixel and - // the mirror color on the other side of the average - ColorFloatRGBA frgbaOffset = (m_pafrgbaSource[uiFarthestPixel] - frgbaBlockAverage) * 0.5f; - m_frgbaOriginalColor1_TAndH = (frgbaBlockAverage + frgbaOffset).QuantizeR4G4B4(); - m_frgbaOriginalColor2_TAndH = (frgbaBlockAverage - frgbaOffset).ClampRGB().QuantizeR4G4B4(); // the "other side" might be out of range - - // move base colors to find best fit - for (unsigned int uiIteration = 0; uiIteration < 10; uiIteration++) - { - // find the center of pixels closest to each color - float fPixelsCloserToColor1 = 0.0f; - ColorFloatRGBA frgbSumPixelsCloserToColor1; - float fPixelsCloserToColor2 = 0.0f; - ColorFloatRGBA frgbSumPixelsCloserToColor2; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // don't count transparent pixels - if (m_pafrgbaSource[uiPixel].fA == 0.0f) - { - continue; - } - - float fGrayDistance2ToColor1 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor1_TAndH); - float fGrayDistance2ToColor2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor2_TAndH); - - ColorFloatRGBA frgbaAlphaWeightedSource = m_pafrgbaSource[uiPixel] * m_pafrgbaSource[uiPixel].fA; - - if (fGrayDistance2ToColor1 <= fGrayDistance2ToColor2) - { - fPixelsCloserToColor1 += m_pafrgbaSource[uiPixel].fA; - frgbSumPixelsCloserToColor1 = frgbSumPixelsCloserToColor1 + frgbaAlphaWeightedSource; - } - else - { - fPixelsCloserToColor2 += m_pafrgbaSource[uiPixel].fA; - frgbSumPixelsCloserToColor2 = frgbSumPixelsCloserToColor2 + frgbaAlphaWeightedSource; - } - } - if (fPixelsCloserToColor1 == 0.0f || fPixelsCloserToColor2 == 0.0f) - { - break; - } - - ColorFloatRGBA frgbAvgColor1Pixels = (frgbSumPixelsCloserToColor1 * (1.0f / fPixelsCloserToColor1)).QuantizeR4G4B4(); - ColorFloatRGBA frgbAvgColor2Pixels = (frgbSumPixelsCloserToColor2 * (1.0f / fPixelsCloserToColor2)).QuantizeR4G4B4(); - - if (frgbAvgColor1Pixels.fR == m_frgbaOriginalColor1_TAndH.fR && - frgbAvgColor1Pixels.fG == m_frgbaOriginalColor1_TAndH.fG && - frgbAvgColor1Pixels.fB == m_frgbaOriginalColor1_TAndH.fB && - frgbAvgColor2Pixels.fR == m_frgbaOriginalColor2_TAndH.fR && - frgbAvgColor2Pixels.fG == m_frgbaOriginalColor2_TAndH.fG && - 
frgbAvgColor2Pixels.fB == m_frgbaOriginalColor2_TAndH.fB) - { - break; - } - - m_frgbaOriginalColor1_TAndH = frgbAvgColor1Pixels; - m_frgbaOriginalColor2_TAndH = frgbAvgColor2Pixels; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode - // save this encoding if it improves the error - // - // since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently - // better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower - // - void Block4x4Encoding_RGB8::TryT(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_T; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { - iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { - iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { - iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { - iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { - iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { - iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor2_TAndH - // twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector - // - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - } - else - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH; - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = 
encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - } - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - } - else - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryT - // called on an encodingTry - // - void Block4x4Encoding_RGB8::TryT_BestSelectorCombination(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = m_frgbaColor1; - afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = m_frgbaColor2; - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - // try each selector - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel], - m_pafrgbaSource[uiPixel]); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (fBlockError < m_fError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { 
- m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode - // save this encoding if it improves the error - // - // since all pixels use the distance table, color1 and color2 can NOT be twiddled independently - // TWIDDLE_RADIUS of 2 is WAY too slow - // - void Block4x4Encoding_RGB8::TryH(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_H; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { - iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { - iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { - iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { - iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { - iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { - iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - 
} - - m_fError = encodingTry.m_fError; - } - } - } - } - - // twiddle m_frgbaOriginalColor2_TAndH - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryH - // called on an encodingTry - // - void Block4x4Encoding_RGB8::TryH_BestSelectorCombination(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB(); - afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = (m_frgbaColor2 + fDistance).ClampRGB(); - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - // try each selector - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel], - m_pafrgbaSource[uiPixel]); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (fBlockError < m_fError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // use linear regression to find the best fit for colors along the edges of the 4x4 block - // - void Block4x4Encoding_RGB8::CalculatePlanarCornerColors(void) - { - ColorFloatRGBA afrgbaRegression[MAX_PLANAR_REGRESSION_SIZE]; - ColorFloatRGBA frgbaSlope; - ColorFloatRGBA 
frgbaOffset; - - // top edge - afrgbaRegression[0] = m_pafrgbaSource[0]; - afrgbaRegression[1] = m_pafrgbaSource[4]; - afrgbaRegression[2] = m_pafrgbaSource[8]; - afrgbaRegression[3] = m_pafrgbaSource[12]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor1 = frgbaOffset; - m_frgbaColor2 = (frgbaSlope * 4.0f) + frgbaOffset; - - // left edge - afrgbaRegression[0] = m_pafrgbaSource[0]; - afrgbaRegression[1] = m_pafrgbaSource[1]; - afrgbaRegression[2] = m_pafrgbaSource[2]; - afrgbaRegression[3] = m_pafrgbaSource[3]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor1 = (m_frgbaColor1 + frgbaOffset) * 0.5f; // average with top edge - m_frgbaColor3 = (frgbaSlope * 4.0f) + frgbaOffset; - - // right edge - afrgbaRegression[0] = m_pafrgbaSource[12]; - afrgbaRegression[1] = m_pafrgbaSource[13]; - afrgbaRegression[2] = m_pafrgbaSource[14]; - afrgbaRegression[3] = m_pafrgbaSource[15]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor2 = (m_frgbaColor2 + frgbaOffset) * 0.5f; // average with top edge - - // bottom edge - afrgbaRegression[0] = m_pafrgbaSource[3]; - afrgbaRegression[1] = m_pafrgbaSource[7]; - afrgbaRegression[2] = m_pafrgbaSource[11]; - afrgbaRegression[3] = m_pafrgbaSource[15]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor3 = (m_frgbaColor3 + frgbaOffset) * 0.5f; // average with left edge - - // quantize corner colors to 6/7/6 - m_frgbaColor1 = m_frgbaColor1.QuantizeR6G7B6(); - m_frgbaColor2 = m_frgbaColor2.QuantizeR6G7B6(); - m_frgbaColor3 = m_frgbaColor3.QuantizeR6G7B6(); - - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing R, G and B independently - // - // R, G and B decoding and errors are independent, so R, G and B twiddles can be independent - // - // return true if improvement - // - bool Block4x4Encoding_RGB8::TwiddlePlanar(void) - { - bool boolImprovement = false; - - while (TwiddlePlanarR()) - { - boolImprovement = true; - } - - while (TwiddlePlanarG()) - { - boolImprovement = true; - } - - while (TwiddlePlanarB()) - { - boolImprovement = true; - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing R - // - bool Block4x4Encoding_RGB8::TwiddlePlanarR() - { - bool boolImprovement = false; - - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - int iOriginRed = encodingTry.m_frgbaColor1.IntRed(63.0f); - int iHorizRed = encodingTry.m_frgbaColor2.IntRed(63.0f); - int iVertRed = encodingTry.m_frgbaColor3.IntRed(63.0f); - - for (int iTryOriginRed = iOriginRed - 1; iTryOriginRed <= iOriginRed + 1; iTryOriginRed++) - { - // check for out of range - if (iTryOriginRed < 0 || iTryOriginRed > 63) - { - continue; - } - - encodingTry.m_frgbaColor1.fR = ((iTryOriginRed << 2) + (iTryOriginRed >> 4)) / 255.0f; - - for (int iTryHorizRed = iHorizRed - 1; iTryHorizRed <= iHorizRed + 1; iTryHorizRed++) - { - // check for out of range - if (iTryHorizRed < 0 || iTryHorizRed > 63) - { - continue; - } - - encodingTry.m_frgbaColor2.fR = ((iTryHorizRed << 2) + (iTryHorizRed >> 4)) / 255.0f; - - for (int iTryVertRed = iVertRed - 1; iTryVertRed <= iVertRed + 1; iTryVertRed++) - { - // check for out of 
range - if (iTryVertRed < 0 || iTryVertRed > 63) - { - continue; - } - - // don't bother with null twiddle - if (iTryOriginRed == iOriginRed && iTryHorizRed == iHorizRed && iTryVertRed == iVertRed) - { - continue; - } - - encodingTry.m_frgbaColor3.fR = ((iTryVertRed << 2) + (iTryVertRed >> 4)) / 255.0f; - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - boolImprovement = true; - } - } - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing G - // - bool Block4x4Encoding_RGB8::TwiddlePlanarG() - { - bool boolImprovement = false; - - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - int iOriginGreen = encodingTry.m_frgbaColor1.IntGreen(127.0f); - int iHorizGreen = encodingTry.m_frgbaColor2.IntGreen(127.0f); - int iVertGreen = encodingTry.m_frgbaColor3.IntGreen(127.0f); - - for (int iTryOriginGreen = iOriginGreen - 1; iTryOriginGreen <= iOriginGreen + 1; iTryOriginGreen++) - { - // check for out of range - if (iTryOriginGreen < 0 || iTryOriginGreen > 127) - { - continue; - } - - encodingTry.m_frgbaColor1.fG = ((iTryOriginGreen << 1) + (iTryOriginGreen >> 6)) / 255.0f; - - for (int iTryHorizGreen = iHorizGreen - 1; iTryHorizGreen <= iHorizGreen + 1; iTryHorizGreen++) - { - // check for out of range - if (iTryHorizGreen < 0 || iTryHorizGreen > 127) - { - continue; - } - - encodingTry.m_frgbaColor2.fG = ((iTryHorizGreen << 1) + (iTryHorizGreen >> 6)) / 255.0f; - - for (int iTryVertGreen = iVertGreen - 1; iTryVertGreen <= iVertGreen + 1; iTryVertGreen++) - { - // check for out of range - if (iTryVertGreen < 0 || iTryVertGreen > 127) - { - continue; - } - - // don't bother with null twiddle - if (iTryOriginGreen == iOriginGreen && - iTryHorizGreen == iHorizGreen && - iTryVertGreen == iVertGreen) - { - continue; - } - - encodingTry.m_frgbaColor3.fG = ((iTryVertGreen << 1) + (iTryVertGreen >> 6)) / 255.0f; - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - boolImprovement = true; - } - } - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing B - // - bool Block4x4Encoding_RGB8::TwiddlePlanarB() - { - bool boolImprovement = false; - - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - 
encodingTry.m_boolFlip = false; - } - - int iOriginBlue = encodingTry.m_frgbaColor1.IntBlue(63.0f); - int iHorizBlue = encodingTry.m_frgbaColor2.IntBlue(63.0f); - int iVertBlue = encodingTry.m_frgbaColor3.IntBlue(63.0f); - - for (int iTryOriginBlue = iOriginBlue - 1; iTryOriginBlue <= iOriginBlue + 1; iTryOriginBlue++) - { - // check for out of range - if (iTryOriginBlue < 0 || iTryOriginBlue > 63) - { - continue; - } - - encodingTry.m_frgbaColor1.fB = ((iTryOriginBlue << 2) + (iTryOriginBlue >> 4)) / 255.0f; - - for (int iTryHorizBlue = iHorizBlue - 1; iTryHorizBlue <= iHorizBlue + 1; iTryHorizBlue++) - { - // check for out of range - if (iTryHorizBlue < 0 || iTryHorizBlue > 63) - { - continue; - } - - encodingTry.m_frgbaColor2.fB = ((iTryHorizBlue << 2) + (iTryHorizBlue >> 4)) / 255.0f; - - for (int iTryVertBlue = iVertBlue - 1; iTryVertBlue <= iVertBlue + 1; iTryVertBlue++) - { - // check for out of range - if (iTryVertBlue < 0 || iTryVertBlue > 63) - { - continue; - } - - // don't bother with null twiddle - if (iTryOriginBlue == iOriginBlue && iTryHorizBlue == iHorizBlue && iTryVertBlue == iVertBlue) - { - continue; - } - - encodingTry.m_frgbaColor3.fB = ((iTryVertBlue << 2) + (iTryVertBlue >> 4)) / 255.0f; - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - boolImprovement = true; - } - } - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGB8::SetEncodingBits(void) - { - - switch (m_mode) - { - case MODE_ETC1: - Block4x4Encoding_ETC1::SetEncodingBits(); - break; - - case MODE_T: - SetEncodingBits_T(); - break; - - case MODE_H: - SetEncodingBits_H(); - break; - - case MODE_PLANAR: - SetEncodingBits_Planar(); - break; - - default: - assert(false); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state for T mode - // - void Block4x4Encoding_RGB8::SetEncodingBits_T(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_T); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2; - m_pencodingbitsRGB8->t.red1b = uiRed1; - m_pencodingbitsRGB8->t.green1 = uiGreen1; - m_pencodingbitsRGB8->t.blue1 = uiBlue1; - - m_pencodingbitsRGB8->t.red2 = uiRed2; - m_pencodingbitsRGB8->t.green2 = uiGreen2; - m_pencodingbitsRGB8->t.blue2 = uiBlue2; - - m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1; - m_pencodingbitsRGB8->t.db = m_uiCW1; - - m_pencodingbitsRGB8->t.diff = 1; - - 
Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - if (iRed2 >= 4) - { - m_pencodingbitsRGB8->t.detect1 = 7; - m_pencodingbitsRGB8->t.detect2 = 0; - } - else - { - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - - // make sure red overflows - assert(iRed2 < 0 || iRed2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state for H mode - // - // colors and selectors may need to swap in order to generate lsb of distance index - // - void Block4x4Encoding_RGB8::SetEncodingBits_H(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_H); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; - unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; - - bool boolOddDistance = m_uiCW1 & 1; - bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance; - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.red1 = uiRed2; - m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen2; - m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue2; - - m_pencodingbitsRGB8->h.red2 = uiRed1; - m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen1; - m_pencodingbitsRGB8->h.blue2 = uiBlue1; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - else - { - m_pencodingbitsRGB8->h.red1 = uiRed1; - m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen1; - m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue1; - - m_pencodingbitsRGB8->h.red2 = uiRed2; - m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen2; - m_pencodingbitsRGB8->h.blue2 = uiBlue2; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - - m_pencodingbitsRGB8->h.diff = 1; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF; - } - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->h.detect1 = 0; - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - if (iRed2 < 0 || iRed2 > 31) - { - m_pencodingbitsRGB8->h.detect1 = 1; - } 
- if (iGreen2 >= 4) - { - m_pencodingbitsRGB8->h.detect2 = 7; - m_pencodingbitsRGB8->h.detect3 = 0; - } - else - { - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - - // make sure red doesn't overflow and green does - assert(iRed2 >= 0 && iRed2 <= 31); - assert(iGreen2 < 0 || iGreen2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state for Planar mode - // - void Block4x4Encoding_RGB8::SetEncodingBits_Planar(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_PLANAR); - assert(m_boolDiff == true); - - unsigned int uiOriginRed = (unsigned int)m_frgbaColor1.IntRed(63.0f); - unsigned int uiOriginGreen = (unsigned int)m_frgbaColor1.IntGreen(127.0f); - unsigned int uiOriginBlue = (unsigned int)m_frgbaColor1.IntBlue(63.0f); - - unsigned int uiHorizRed = (unsigned int)m_frgbaColor2.IntRed(63.0f); - unsigned int uiHorizGreen = (unsigned int)m_frgbaColor2.IntGreen(127.0f); - unsigned int uiHorizBlue = (unsigned int)m_frgbaColor2.IntBlue(63.0f); - - unsigned int uiVertRed = (unsigned int)m_frgbaColor3.IntRed(63.0f); - unsigned int uiVertGreen = (unsigned int)m_frgbaColor3.IntGreen(127.0f); - unsigned int uiVertBlue = (unsigned int)m_frgbaColor3.IntBlue(63.0f); - - m_pencodingbitsRGB8->planar.originRed = uiOriginRed; - m_pencodingbitsRGB8->planar.originGreen1 = uiOriginGreen >> 6; - m_pencodingbitsRGB8->planar.originGreen2 = uiOriginGreen; - m_pencodingbitsRGB8->planar.originBlue1 = uiOriginBlue >> 5; - m_pencodingbitsRGB8->planar.originBlue2 = uiOriginBlue >> 3; - m_pencodingbitsRGB8->planar.originBlue3 = uiOriginBlue >> 1; - m_pencodingbitsRGB8->planar.originBlue4 = uiOriginBlue; - - m_pencodingbitsRGB8->planar.horizRed1 = uiHorizRed >> 1; - m_pencodingbitsRGB8->planar.horizRed2 = uiHorizRed; - m_pencodingbitsRGB8->planar.horizGreen = uiHorizGreen; - m_pencodingbitsRGB8->planar.horizBlue1 = uiHorizBlue >> 5; - m_pencodingbitsRGB8->planar.horizBlue2 = uiHorizBlue; - - m_pencodingbitsRGB8->planar.vertRed1 = uiVertRed >> 3; - m_pencodingbitsRGB8->planar.vertRed2 = uiVertRed; - m_pencodingbitsRGB8->planar.vertGreen1 = uiVertGreen >> 2; - m_pencodingbitsRGB8->planar.vertGreen2 = uiVertGreen; - m_pencodingbitsRGB8->planar.vertBlue = uiVertBlue; - - m_pencodingbitsRGB8->planar.diff = 1; - - // create valid RG differentials and an invalid B differential to trigger planar mode - m_pencodingbitsRGB8->planar.detect1 = 0; - m_pencodingbitsRGB8->planar.detect2 = 0; - m_pencodingbitsRGB8->planar.detect3 = 0; - m_pencodingbitsRGB8->planar.detect4 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - int iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2; - if (iRed2 < 0 || iRed2 > 31) - { - m_pencodingbitsRGB8->planar.detect1 = 1; - } - if (iGreen2 < 0 || iGreen2 > 31) - { - m_pencodingbitsRGB8->planar.detect2 = 1; - } - if (iBlue2 >= 4) - { - m_pencodingbitsRGB8->planar.detect3 = 7; - m_pencodingbitsRGB8->planar.detect4 = 0; - } - else - { - m_pencodingbitsRGB8->planar.detect3 = 0; - 
m_pencodingbitsRGB8->planar.detect4 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2; - - // make sure red and green don't overflow and blue does - assert(iRed2 >= 0 && iRed2 <= 31); - assert(iGreen2 >= 0 && iGreen2 <= 31); - assert(iBlue2 < 0 || iBlue2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state for T mode - // - void Block4x4Encoding_RGB8::DecodePixels_T(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1; - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - break; - - case 2: - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2; - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - break; - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state for H mode - // - void Block4x4Encoding_RGB8::DecodePixels_H(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB(); - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB(); - break; - - case 2: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - break; - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state for Planar mode - // - void Block4x4Encoding_RGB8::DecodePixels_Planar(void) - { - - int iRO = (int)roundf(m_frgbaColor1.fR * 255.0f); - int iGO = (int)roundf(m_frgbaColor1.fG * 255.0f); - int iBO = (int)roundf(m_frgbaColor1.fB * 255.0f); - - int iRH = (int)roundf(m_frgbaColor2.fR * 255.0f); - int iGH = (int)roundf(m_frgbaColor2.fG * 255.0f); - int iBH = (int)roundf(m_frgbaColor2.fB * 255.0f); - - int iRV = (int)roundf(m_frgbaColor3.fR * 255.0f); - int iGV = (int)roundf(m_frgbaColor3.fG * 255.0f); - int iBV = (int)roundf(m_frgbaColor3.fB * 255.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int iX = (int)(uiPixel >> 2); - int iY = (int)(uiPixel & 3); - - int iR = (iX*(iRH - iRO) + iY*(iRV - iRO) + 4*iRO + 2) >> 2; - int iG = (iX*(iGH - iGO) + iY*(iGV - iGO) + 4*iGO + 2) >> 2; - int iB = (iX*(iBH - iBO) + iY*(iBV - iBO) + 4*iBO + 2) >> 2; - - ColorFloatRGBA frgba; - frgba.fR = (float)iR / 255.0f; - frgba.fG = (float)iG / 255.0f; - frgba.fB = (float)iB / 255.0f; - frgba.fA = 1.0f; - - m_afrgbaDecodedColors[uiPixel] = 
frgba.ClampRGB(); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a linear regression for the a_uiPixels in a_pafrgbaPixels[] - // - // output the closest color line using a_pfrgbaSlope and a_pfrgbaOffset - // - void Block4x4Encoding_RGB8::ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels, - ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset) - { - typedef struct - { - float f[4]; - } Float4; - - Float4 *paf4Pixels = (Float4 *)(a_pafrgbaPixels); - Float4 *pf4Slope = (Float4 *)(a_pfrgbaSlope); - Float4 *pf4Offset = (Float4 *)(a_pfrgbaOffset); - - float afX[MAX_PLANAR_REGRESSION_SIZE]; - float afY[MAX_PLANAR_REGRESSION_SIZE]; - - // handle r, g and b separately. don't bother with a - for (unsigned int uiComponent = 0; uiComponent < 3; uiComponent++) - { - for (unsigned int uiPixel = 0; uiPixel < a_uiPixels; uiPixel++) - { - afX[uiPixel] = (float)uiPixel; - afY[uiPixel] = paf4Pixels[uiPixel].f[uiComponent]; - - } - Etc::Regression(afX, afY, a_uiPixels, - &(pf4Slope->f[uiComponent]), &(pf4Offset->f[uiComponent])); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // -} diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.h deleted file mode 100644 index 03754d5e3b..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcBlock4x4Encoding_ETC1.h" - -namespace Etc -{ - - class Block4x4Encoding_RGB8 : public Block4x4Encoding_ETC1 - { - public: - - Block4x4Encoding_RGB8(void); - virtual ~Block4x4Encoding_RGB8(void); - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - - ErrorMetric a_errormetric); - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - inline ColorFloatRGBA GetColor3(void) const - { - return m_frgbaColor3; - } - - protected: - - static const unsigned int PLANAR_CORNER_COLORS = 3; - static const unsigned int MAX_PLANAR_REGRESSION_SIZE = 4; - static const unsigned int TH_DISTANCES = 8; - - static float s_afTHDistanceTable[TH_DISTANCES]; - - void TryPlanar(unsigned int a_uiRadius); - void TryTAndH(unsigned int a_uiRadius); - - void InitFromEncodingBits_Planar(void); - - ColorFloatRGBA m_frgbaColor3; // used for planar - - void SetEncodingBits_T(void); - void SetEncodingBits_H(void); - void SetEncodingBits_Planar(void); - - // state shared between iterations - ColorFloatRGBA m_frgbaOriginalColor1_TAndH; - ColorFloatRGBA m_frgbaOriginalColor2_TAndH; - - void CalculateBaseColorsForTAndH(void); - void TryT(unsigned int a_uiRadius); - void TryT_BestSelectorCombination(void); - void TryH(unsigned int a_uiRadius); - void TryH_BestSelectorCombination(void); - - private: - - void InitFromEncodingBits_T(void); - void InitFromEncodingBits_H(void); - - void CalculatePlanarCornerColors(void); - - void ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels, - ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset); - - bool TwiddlePlanar(void); - bool TwiddlePlanarR(); - bool TwiddlePlanarG(); - bool TwiddlePlanarB(); - - void DecodePixels_T(void); - void DecodePixels_H(void); - void DecodePixels_Planar(void); - - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp deleted file mode 100644 index b94b64e68c..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp +++ /dev/null @@ -1,1819 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RGB8A1.cpp contains: - Block4x4Encoding_RGB8A1 - Block4x4Encoding_RGB8A1_Opaque - Block4x4Encoding_RGB8A1_Transparent - -These encoders are used when targetting file format RGB8A1. 
- -Block4x4Encoding_RGB8A1_Opaque is used when all pixels in the 4x4 block are opaque -Block4x4Encoding_RGB8A1_Transparent is used when all pixels in the 4x4 block are transparent -Block4x4Encoding_RGB8A1 is used when there is a mixture of alphas in the 4x4 block - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RGB8A1.h" - -#include "EtcBlock4x4.h" -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4Encoding_RGB8.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> - -namespace Etc -{ - - // #################################################################################################### - // Block4x4Encoding_RGB8A1 - // #################################################################################################### - - float Block4x4Encoding_RGB8A1::s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS] = - { - { 0.0f / 255.0f, 8.0f / 255.0f, 0.0f / 255.0f, -8.0f / 255.0f }, - { 0.0f / 255.0f, 17.0f / 255.0f, 0.0f / 255.0f, -17.0f / 255.0f }, - { 0.0f / 255.0f, 29.0f / 255.0f, 0.0f / 255.0f, -29.0f / 255.0f }, - { 0.0f / 255.0f, 42.0f / 255.0f, 0.0f / 255.0f, -42.0f / 255.0f }, - { 0.0f / 255.0f, 60.0f / 255.0f, 0.0f / 255.0f, -60.0f / 255.0f }, - { 0.0f / 255.0f, 80.0f / 255.0f, 0.0f / 255.0f, -80.0f / 255.0f }, - { 0.0f / 255.0f, 106.0f / 255.0f, 0.0f / 255.0f, -106.0f / 255.0f }, - { 0.0f / 255.0f, 183.0f / 255.0f, 0.0f / 255.0f, -183.0f / 255.0f } - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RGB8A1::Block4x4Encoding_RGB8A1(void) - { - m_pencodingbitsRGB8 = nullptr; - m_boolOpaque = false; - m_boolTransparent = false; - m_boolPunchThroughPixels = true; - - } - Block4x4Encoding_RGB8A1::~Block4x4Encoding_RGB8A1(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_RGB8A1::InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric) - { - - Block4x4Encoding_RGB8::InitFromSource(a_pblockParent, - a_pafrgbaSource, - a_paucEncodingBits, - a_errormetric); - - m_boolOpaque = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE; - m_boolTransparent = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT; - m_boolPunchThroughPixels = a_pblockParent->HasPunchThroughPixels(); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - if (m_pafrgbaSource[uiPixel].fA >= 0.5f) - { - m_afDecodedAlphas[uiPixel] = 1.0f; - } - else - { - m_afDecodedAlphas[uiPixel] = 0.0f; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA 
*a_pafrgbaSource, - ErrorMetric a_errormetric) - { - - - InitFromEncodingBits_ETC1(a_pblockParent, - a_paucEncodingBits, - a_pafrgbaSource, - a_errormetric); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - // detect if there is a T, H or Planar mode present - int iRed1 = m_pencodingbitsRGB8->differential.red1; - int iDRed2 = m_pencodingbitsRGB8->differential.dred2; - int iRed2 = iRed1 + iDRed2; - - int iGreen1 = m_pencodingbitsRGB8->differential.green1; - int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2; - int iGreen2 = iGreen1 + iDGreen2; - - int iBlue1 = m_pencodingbitsRGB8->differential.blue1; - int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2; - int iBlue2 = iBlue1 + iDBlue2; - - if (iRed2 < 0 || iRed2 > 31) - { - InitFromEncodingBits_T(); - } - else if (iGreen2 < 0 || iGreen2 > 31) - { - InitFromEncodingBits_H(); - } - else if (iBlue2 < 0 || iBlue2 > 31) - { - Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding assuming the encoding is an ETC1 mode. - // if it isn't an ETC1 mode, this will be overwritten later - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource, - a_errormetric); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = m_pencodingbitsRGB8->differential.flip; - m_boolOpaque = m_pencodingbitsRGB8->differential.diff; - - int iR2 = m_pencodingbitsRGB8->differential.red1 + m_pencodingbitsRGB8->differential.dred2; - if (iR2 < 0) - { - iR2 = 0; - } - else if (iR2 > 31) - { - iR2 = 31; - } - - int iG2 = m_pencodingbitsRGB8->differential.green1 + m_pencodingbitsRGB8->differential.dgreen2; - if (iG2 < 0) - { - iG2 = 0; - } - else if (iG2 > 31) - { - iG2 = 31; - } - - int iB2 = m_pencodingbitsRGB8->differential.blue1 + m_pencodingbitsRGB8->differential.dblue2; - if (iB2 < 0) - { - iB2 = 0; - } - else if (iB2 > 31) - { - iB2 = 31; - } - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5(m_pencodingbitsRGB8->differential.red1, m_pencodingbitsRGB8->differential.green1, m_pencodingbitsRGB8->differential.blue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iR2, (unsigned char)iG2, (unsigned char)iB2); - - m_uiCW1 = m_pencodingbitsRGB8->differential.cw1; - m_uiCW2 = m_pencodingbitsRGB8->differential.cw2; - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - Decode_ETC1(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if T mode is detected - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits_T(void) - { - m_mode = MODE_T; - - unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) + - m_pencodingbitsRGB8->t.red1b); - unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1; - unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1; - - unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2; - unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2; - unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - 
m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db; - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_T(); - - CalcBlockError(); - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if H mode is detected - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits_H(void) - { - m_mode = MODE_H; - - unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1; - unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) + - m_pencodingbitsRGB8->h.green1b); - unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) + - (m_pencodingbitsRGB8->h.blue1b << 1) + - m_pencodingbitsRGB8->h.blue1c); - - unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2; - unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) + - m_pencodingbitsRGB8->h.green2b); - unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - // used to determine the LSB of the CW - unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1); - unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1); - if (uiRGB1 >= uiRGB2) - { - m_uiCW1++; - } - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_H(); - - CalcBlockError(); - } - - // ---------------------------------------------------------------------------------------------------- - // for ETC1 modes, set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_RGB8A1::Decode_ETC1(void) - { - - const unsigned int *pauiPixelOrder = m_boolFlip ? s_auiPixelOrderFlip1 : s_auiPixelOrderFlip0; - - for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS; uiPixelOrder++) - { - ColorFloatRGBA *pfrgbaCenter = uiPixelOrder < 8 ? &m_frgbaColor1 : &m_frgbaColor2; - unsigned int uiCW = uiPixelOrder < 8 ? 
m_uiCW1 : m_uiCW2; - - unsigned int uiPixel = pauiPixelOrder[uiPixelOrder]; - - float fDelta; - if (m_boolOpaque) - fDelta = Block4x4Encoding_ETC1::s_aafCwTable[uiCW][m_auiSelectors[uiPixel]]; - else - fDelta = s_aafCwOpaqueUnsetTable[uiCW][m_auiSelectors[uiPixel]]; - - if (m_boolOpaque == false && m_auiSelectors[uiPixel] == TRANSPARENT_SELECTOR) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel] = 0.0f; - } - else - { - m_afrgbaDecodedColors[uiPixel] = (*pfrgbaCenter + fDelta).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // for T mode, set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_RGB8A1::DecodePixels_T(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1; - m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 2: - if (m_boolOpaque == false) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel] = 0.0f; - } - else - { - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2; - m_afDecodedAlphas[uiPixel] = 1.0f; - } - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - break; - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // for H mode, set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_RGB8A1::DecodePixels_H(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 2: - if (m_boolOpaque == false) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel] = 0.0f; - } - else - { - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - } - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - m_afDecodedAlphas[uiPixel] = 1.0f; - break; - } - - } - - } - - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - // RGB8A1 can't use individual mode - // RGB8A1 with transparent pixels can't use planar mode - // - void Block4x4Encoding_RGB8A1::PerformIteration(float a_fEffort) - { - assert(!m_boolOpaque); - assert(!m_boolTransparent); - assert(!m_boolDone); - - switch (m_uiEncodingIterations) 
- { - case 0: - PerformFirstIteration(); - break; - - case 1: - TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - if (a_fEffort <= 39.5f) - { - m_boolDone = true; - } - break; - - case 3: - Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(); - TryT(1); - TryH(1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 4: - TryDegenerates1(); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 5: - TryDegenerates2(); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 6: - TryDegenerates3(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 7: - TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - - SetDoneIfPerfect(); - - } - - // ---------------------------------------------------------------------------------------------------- - // find best initial encoding to ensure block has a valid encoding - // - void Block4x4Encoding_RGB8A1::PerformFirstIteration(void) - { - Block4x4Encoding_ETC1::CalculateMostLikelyFlip(); - - m_fError = FLT_MAX; - - TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - - } - - // ---------------------------------------------------------------------------------------------------- - // mostly copied from ETC1 - // differences: - // Block4x4Encoding_RGB8A1 encodingTry = *this; - // - void Block4x4Encoding_RGB8A1::TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2) - { - - ColorFloatRGBA frgbaColor1; - ColorFloatRGBA frgbaColor2; - - const unsigned int *pauiPixelMapping1; - const unsigned int *pauiPixelMapping2; - - if (a_boolFlip) - { - frgbaColor1 = m_frgbaSourceAverageTop; - frgbaColor2 = m_frgbaSourceAverageBottom; - - pauiPixelMapping1 = s_auiTopPixelMapping; - pauiPixelMapping2 = s_auiBottomPixelMapping; - } - else - { - frgbaColor1 = m_frgbaSourceAverageLeft; - frgbaColor2 = m_frgbaSourceAverageRight; - - pauiPixelMapping1 = s_auiLeftPixelMapping; - pauiPixelMapping2 = s_auiRightPixelMapping; - } - - DifferentialTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, - a_uiRadius, a_iGrayOffset1, a_iGrayOffset2); - - Block4x4Encoding_RGB8A1 encodingTry = *this; - encodingTry.m_boolFlip = a_boolFlip; - - encodingTry.TryDifferentialHalf(&trys.m_half1); - encodingTry.TryDifferentialHalf(&trys.m_half2); - - // find best halves that are within differential range - DifferentialTrys::Try *ptryBest1 = nullptr; - DifferentialTrys::Try *ptryBest2 = nullptr; - encodingTry.m_fError = FLT_MAX; - - // see if the best of each half are in differential range - int iDRed = trys.m_half2.m_ptryBest->m_iRed - trys.m_half1.m_ptryBest->m_iRed; - int iDGreen = trys.m_half2.m_ptryBest->m_iGreen - trys.m_half1.m_ptryBest->m_iGreen; - int iDBlue = trys.m_half2.m_ptryBest->m_iBlue - trys.m_half1.m_ptryBest->m_iBlue; - if (iDRed >= -4 && iDRed <= 3 && iDGreen >= -4 && iDGreen <= 3 && iDBlue >= -4 && iDBlue <= 3) - { - ptryBest1 = trys.m_half1.m_ptryBest; - ptryBest2 = trys.m_half2.m_ptryBest; - encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError; - } - else - { - // else, find the next best halves that are in differential range - for (DifferentialTrys::Try *ptry1 = &trys.m_half1.m_atry[0]; - ptry1 < 
&trys.m_half1.m_atry[trys.m_half1.m_uiTrys]; - ptry1++) - { - for (DifferentialTrys::Try *ptry2 = &trys.m_half2.m_atry[0]; - ptry2 < &trys.m_half2.m_atry[trys.m_half2.m_uiTrys]; - ptry2++) - { - iDRed = ptry2->m_iRed - ptry1->m_iRed; - bool boolValidRedDelta = iDRed <= 3 && iDRed >= -4; - iDGreen = ptry2->m_iGreen - ptry1->m_iGreen; - bool boolValidGreenDelta = iDGreen <= 3 && iDGreen >= -4; - iDBlue = ptry2->m_iBlue - ptry1->m_iBlue; - bool boolValidBlueDelta = iDBlue <= 3 && iDBlue >= -4; - - if (boolValidRedDelta && boolValidGreenDelta && boolValidBlueDelta) - { - float fError = ptry1->m_fError + ptry2->m_fError; - - if (fError < encodingTry.m_fError) - { - encodingTry.m_fError = fError; - - ptryBest1 = ptry1; - ptryBest2 = ptry2; - } - } - - } - } - assert(encodingTry.m_fError < FLT_MAX); - assert(ptryBest1 != nullptr); - assert(ptryBest2 != nullptr); - } - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = encodingTry.m_boolFlip; - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue); - m_uiCW1 = ptryBest1->m_uiCW; - m_uiCW2 = ptryBest2->m_uiCW; - - m_fError = 0.0f; - for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++) - { - unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder]; - unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder]; - - unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder]; - unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder]; - - m_auiSelectors[uiPixel1] = uiSelector1; - m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder]; - - if (uiSelector1 == TRANSPARENT_SELECTOR) - { - m_afrgbaDecodedColors[uiPixel1] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel1] = 0.0f; - } - else - { - float fDeltaRGB1 = s_aafCwOpaqueUnsetTable[m_uiCW1][uiSelector1]; - m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB(); - m_afDecodedAlphas[uiPixel1] = 1.0f; - } - - if (uiSelector2 == TRANSPARENT_SELECTOR) - { - m_afrgbaDecodedColors[uiPixel2] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel2] = 0.0f; - } - else - { - float fDeltaRGB2 = s_aafCwOpaqueUnsetTable[m_uiCW2][uiSelector2]; - m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB(); - m_afDecodedAlphas[uiPixel2] = 1.0f; - } - - float fDeltaA1 = m_afDecodedAlphas[uiPixel1] - m_pafrgbaSource[uiPixel1].fA; - m_fError += fDeltaA1 * fDeltaA1; - float fDeltaA2 = m_afDecodedAlphas[uiPixel2] - m_pafrgbaSource[uiPixel2].fA; - m_fError += fDeltaA2 * fDeltaA2; - } - - m_fError1 = ptryBest1->m_fError; - m_fError2 = ptryBest2->m_fError; - m_boolSeverelyBentDifferentialColors = trys.m_boolSeverelyBentColors; - m_fError = m_fError1 + m_fError2; - - // sanity check - { - int iRed1 = m_frgbaColor1.IntRed(31.0f); - int iGreen1 = m_frgbaColor1.IntGreen(31.0f); - int iBlue1 = m_frgbaColor1.IntBlue(31.0f); - - int iRed2 = m_frgbaColor2.IntRed(31.0f); - int iGreen2 = m_frgbaColor2.IntGreen(31.0f); - int iBlue2 = m_frgbaColor2.IntBlue(31.0f); - - iDRed = iRed2 - iRed1; - iDGreen = iGreen2 - iGreen1; - iDBlue = iBlue2 - iBlue1; - - assert(iDRed >= -4 && iDRed < 4); - assert(iDGreen >= -4 && iDGreen < 4); - assert(iDBlue >= -4 && iDBlue < 4); - } - } - - } - - // 
---------------------------------------------------------------------------------------------------- - // mostly copied from ETC1 - // differences: - // uses s_aafCwOpaqueUnsetTable - // color for selector set to 0,0,0,0 - // - void Block4x4Encoding_RGB8A1::TryDifferentialHalf(DifferentialTrys::Half *a_phalf) - { - - a_phalf->m_ptryBest = nullptr; - float fBestTryError = FLT_MAX; - - a_phalf->m_uiTrys = 0; - for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius; - iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius; - iRed++) - { - assert(iRed >= 0 && iRed <= 31); - - for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius; - iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius; - iGreen++) - { - assert(iGreen >= 0 && iGreen <= 31); - - for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius; - iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius; - iBlue++) - { - assert(iBlue >= 0 && iBlue <= 31); - - DifferentialTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys]; - assert(ptry < &a_phalf->m_atry[DifferentialTrys::Half::MAX_TRYS]); - - ptry->m_iRed = iRed; - ptry->m_iGreen = iGreen; - ptry->m_iBlue = iBlue; - ptry->m_fError = FLT_MAX; - ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue); - - // try each CW - for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++) - { - unsigned int auiPixelSelectors[PIXELS / 2]; - ColorFloatRGBA afrgbaDecodedColors[PIXELS / 2]; - float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - - // pre-compute decoded pixels for each selector - ColorFloatRGBA afrgbaSelectors[SELECTORS]; - assert(SELECTORS == 4); - afrgbaSelectors[0] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][0]).ClampRGB(); - afrgbaSelectors[1] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][1]).ClampRGB(); - afrgbaSelectors[2] = ColorFloatRGBA(); - afrgbaSelectors[3] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][3]).ClampRGB(); - - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[a_phalf->m_pauiPixelMapping[uiPixel]]; - ColorFloatRGBA frgbaDecodedPixel; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - if (pfrgbaSourcePixel->fA < 0.5f) - { - uiSelector = TRANSPARENT_SELECTOR; - } - else if (uiSelector == TRANSPARENT_SELECTOR) - { - continue; - } - - frgbaDecodedPixel = afrgbaSelectors[uiSelector]; - - float fPixelError; - - fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[a_phalf->m_pauiPixelMapping[uiPixel]], - *pfrgbaSourcePixel); - - if (fPixelError < afPixelErrors[uiPixel]) - { - auiPixelSelectors[uiPixel] = uiSelector; - afrgbaDecodedColors[uiPixel] = frgbaDecodedPixel; - afPixelErrors[uiPixel] = fPixelError; - } - - if (uiSelector == TRANSPARENT_SELECTOR) - { - break; - } - } - } - - // add up all pixel errors - float fCWError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - fCWError += afPixelErrors[uiPixel]; - } - - // if best CW so far - if (fCWError < ptry->m_fError) - { - ptry->m_uiCW = uiCW; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel]; - } - ptry->m_fError = fCWError; - } - - } - - if (ptry->m_fError < fBestTryError) - { - a_phalf->m_ptryBest = ptry; - fBestTryError = ptry->m_fError; - } - - assert(ptry->m_fError < FLT_MAX); - - a_phalf->m_uiTrys++; - } - } - } - - } - - // 
---------------------------------------------------------------------------------------------------- - // try encoding in T mode - // save this encoding if it improves the error - // - // since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently - // better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower - // - void Block4x4Encoding_RGB8A1::TryT(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8A1 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_T; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { - iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { - iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { - iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { - iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { - iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { - iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor2_TAndH - // twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector - // - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - } - else - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH; - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int 
uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - } - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - } - else - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryT - // called on an encodingTry - // - void Block4x4Encoding_RGB8A1::TryT_BestSelectorCombination(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = m_frgbaColor1; - afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = ColorFloatRGBA(); - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - // try each selector - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiMinSelector = 0; - unsigned int uiMaxSelector = SELECTORS - 1; - - if (m_pafrgbaSource[uiPixel].fA < 0.5f) - { - uiMinSelector = 2; - uiMaxSelector = 2; - } - - for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++) - { - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel], - m_pafrgbaSource[uiPixel]); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (fBlockError < m_fError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - 
m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in H mode - // save this encoding if it improves the error - // - // since all pixels use the distance table, color1 and color2 can NOT be twiddled independently - // TWIDDLE_RADIUS of 2 is WAY too slow - // - void Block4x4Encoding_RGB8A1::TryH(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8A1 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_H; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { - iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { - iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { - iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { - iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { - iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { - iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; 
- } - - m_fError = encodingTry.m_fError; - } - } - } - } - - // twiddle m_frgbaOriginalColor2_TAndH - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryH - // called on an encodingTry - // - void Block4x4Encoding_RGB8A1::TryH_BestSelectorCombination(void) - { - - // abort if colors and CW will pose an encoding problem - { - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(255.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(255.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(255.0f); - unsigned int uiColorValue1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(255.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(255.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(255.0f); - unsigned int uiColorValue2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; - - unsigned int uiCWLsb = m_uiCW1 & 1; - - if ((uiColorValue1 >= (uiColorValue2 & uiCWLsb)) == 0 || - (uiColorValue1 < (uiColorValue2 & uiCWLsb)) == 1) - { - return; - } - } - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB(); - afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = ColorFloatRGBA();; - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - - // try each selector - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiMinSelector = 0; - unsigned int uiMaxSelector = SELECTORS - 1; - - if (m_pafrgbaSource[uiPixel].fA < 0.5f) - { - uiMinSelector = 2; - uiMaxSelector = 2; - } - - for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++) - { - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel], - m_pafrgbaSource[uiPixel]); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - 
afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (fBlockError < m_fError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 1 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates1(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -2, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 2, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 0, 2); - TryDifferential(m_boolMostLikelyFlip, 1, 0, -2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 2 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates2(void) - { - - TryDifferential(!m_boolMostLikelyFlip, 1, -2, 0); - TryDifferential(!m_boolMostLikelyFlip, 1, 2, 0); - TryDifferential(!m_boolMostLikelyFlip, 1, 0, 2); - TryDifferential(!m_boolMostLikelyFlip, 1, 0, -2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 3 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates3(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -2, -2); - TryDifferential(m_boolMostLikelyFlip, 1, -2, 2); - TryDifferential(m_boolMostLikelyFlip, 1, 2, -2); - TryDifferential(m_boolMostLikelyFlip, 1, 2, 2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 4 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates4(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -4, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 4, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 0, 4); - TryDifferential(m_boolMostLikelyFlip, 1, 0, -4); - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGB8A1::SetEncodingBits(void) - { - switch (m_mode) - { - case MODE_ETC1: - SetEncodingBits_ETC1(); - break; - - case 
MODE_T: - SetEncodingBits_T(); - break; - - case MODE_H: - SetEncodingBits_H(); - break; - - case MODE_PLANAR: - Block4x4Encoding_RGB8::SetEncodingBits_Planar(); - break; - - default: - assert(false); - } - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state if ETC1 mode - // - void Block4x4Encoding_RGB8A1::SetEncodingBits_ETC1(void) - { - - // there is no individual mode in RGB8A1 - assert(m_boolDiff); - - int iRed1 = m_frgbaColor1.IntRed(31.0f); - int iGreen1 = m_frgbaColor1.IntGreen(31.0f); - int iBlue1 = m_frgbaColor1.IntBlue(31.0f); - - int iRed2 = m_frgbaColor2.IntRed(31.0f); - int iGreen2 = m_frgbaColor2.IntGreen(31.0f); - int iBlue2 = m_frgbaColor2.IntBlue(31.0f); - - int iDRed2 = iRed2 - iRed1; - int iDGreen2 = iGreen2 - iGreen1; - int iDBlue2 = iBlue2 - iBlue1; - - assert(iDRed2 >= -4 && iDRed2 < 4); - assert(iDGreen2 >= -4 && iDGreen2 < 4); - assert(iDBlue2 >= -4 && iDBlue2 < 4); - - m_pencodingbitsRGB8->differential.red1 = iRed1; - m_pencodingbitsRGB8->differential.green1 = iGreen1; - m_pencodingbitsRGB8->differential.blue1 = iBlue1; - - m_pencodingbitsRGB8->differential.dred2 = iDRed2; - m_pencodingbitsRGB8->differential.dgreen2 = iDGreen2; - m_pencodingbitsRGB8->differential.dblue2 = iDBlue2; - - m_pencodingbitsRGB8->individual.cw1 = m_uiCW1; - m_pencodingbitsRGB8->individual.cw2 = m_uiCW2; - - SetEncodingBits_Selectors(); - - // in RGB8A1 encoding bits, opaque replaces differential - m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; - - m_pencodingbitsRGB8->individual.flip = m_boolFlip; - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state if T mode - // - void Block4x4Encoding_RGB8A1::SetEncodingBits_T(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_T); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2; - m_pencodingbitsRGB8->t.red1b = uiRed1; - m_pencodingbitsRGB8->t.green1 = uiGreen1; - m_pencodingbitsRGB8->t.blue1 = uiBlue1; - - m_pencodingbitsRGB8->t.red2 = uiRed2; - m_pencodingbitsRGB8->t.green2 = uiGreen2; - m_pencodingbitsRGB8->t.blue2 = uiBlue2; - - m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1; - m_pencodingbitsRGB8->t.db = m_uiCW1; - - // in RGB8A1 encoding bits, opaque replaces differential - m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - if (iRed2 >= 4) - { - m_pencodingbitsRGB8->t.detect1 = 7; - m_pencodingbitsRGB8->t.detect2 = 0; - } - else - { - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - - 
// make sure red overflows - assert(iRed2 < 0 || iRed2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state if H mode - // - // colors and selectors may need to swap in order to generate lsb of distance index - // - void Block4x4Encoding_RGB8A1::SetEncodingBits_H(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_H); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; - unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; - - bool boolOddDistance = m_uiCW1 & 1; - bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance; - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.red1 = uiRed2; - m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen2; - m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue2; - - m_pencodingbitsRGB8->h.red2 = uiRed1; - m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen1; - m_pencodingbitsRGB8->h.blue2 = uiBlue1; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - else - { - m_pencodingbitsRGB8->h.red1 = uiRed1; - m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen1; - m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue1; - - m_pencodingbitsRGB8->h.red2 = uiRed2; - m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen2; - m_pencodingbitsRGB8->h.blue2 = uiBlue2; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - - // in RGB8A1 encoding bits, opaque replaces differential - m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF; - } - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->h.detect1 = 0; - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - if (iRed2 < 0 || iRed2 > 31) - { - m_pencodingbitsRGB8->h.detect1 = 1; - } - if (iGreen2 >= 4) - { - m_pencodingbitsRGB8->h.detect2 = 7; - m_pencodingbitsRGB8->h.detect3 = 0; - } - else - { - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - - // make sure red doesn't overflow and green does - assert(iRed2 >= 0 && iRed2 <= 31); 
- assert(iGreen2 < 0 || iGreen2 > 31); - } - - } - - // #################################################################################################### - // Block4x4Encoding_RGB8A1_Opaque - // #################################################################################################### - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGB8A1_Opaque::PerformIteration(float a_fEffort) - { - assert(!m_boolPunchThroughPixels); - assert(!m_boolTransparent); - assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - PerformFirstIteration(); - break; - - case 1: - Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 3: - Block4x4Encoding_RGB8::TryPlanar(1); - break; - - case 4: - Block4x4Encoding_RGB8::TryTAndH(1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 5: - Block4x4Encoding_ETC1::TryDegenerates1(); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 6: - Block4x4Encoding_ETC1::TryDegenerates2(); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 7: - Block4x4Encoding_ETC1::TryDegenerates3(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 8: - Block4x4Encoding_ETC1::TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // find best initial encoding to ensure block has a valid encoding - // - void Block4x4Encoding_RGB8A1_Opaque::PerformFirstIteration(void) - { - - // set decoded alphas - // calculate alpha error - m_fError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afDecodedAlphas[uiPixel] = 1.0f; - - float fDeltaA = 1.0f - m_pafrgbaSource[uiPixel].fA; - m_fError += fDeltaA * fDeltaA; - } - - CalculateMostLikelyFlip(); - - m_fError = FLT_MAX; - - Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - Block4x4Encoding_RGB8::TryPlanar(0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - Block4x4Encoding_RGB8::TryTAndH(0); - SetDoneIfPerfect(); - } - - // #################################################################################################### - // Block4x4Encoding_RGB8A1_Transparent - // #################################################################################################### - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGB8A1_Transparent::PerformIteration(float ) - { - assert(!m_boolOpaque); - 
assert(m_boolTransparent); - assert(!m_boolDone); - assert(m_uiEncodingIterations == 0); - - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = false; - - m_uiCW1 = 0; - m_uiCW2 = 0; - - m_frgbaColor1 = ColorFloatRGBA(); - m_frgbaColor2 = ColorFloatRGBA(); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = TRANSPARENT_SELECTOR; - - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel] = 0.0f; - } - - CalcBlockError(); - - m_boolDone = true; - m_uiEncodingIterations++; - - } - - // ---------------------------------------------------------------------------------------------------- - // -} diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.h deleted file mode 100644 index ff26e462f8..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcBlock4x4Encoding_RGB8.h" -#include "EtcErrorMetric.h" -#include "EtcBlock4x4EncodingBits.h" - -namespace Etc -{ - - // ################################################################################ - // Block4x4Encoding_RGB8A1 - // RGB8A1 if not completely opaque or transparent - // ################################################################################ - - class Block4x4Encoding_RGB8A1 : public Block4x4Encoding_RGB8 - { - public: - - static const unsigned int TRANSPARENT_SELECTOR = 2; - - Block4x4Encoding_RGB8A1(void); - virtual ~Block4x4Encoding_RGB8A1(void); - - virtual void InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric); - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric); - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - void InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric); - - void InitFromEncodingBits_T(void); - void InitFromEncodingBits_H(void); - - void PerformFirstIteration(void); - - void Decode_ETC1(void); - void DecodePixels_T(void); - void DecodePixels_H(void); - void SetEncodingBits_ETC1(void); - void SetEncodingBits_T(void); - void SetEncodingBits_H(void); - - protected: - - bool m_boolOpaque; // all source pixels have alpha >= 0.5 - bool m_boolTransparent; // all source pixels have alpha < 0.5 - bool m_boolPunchThroughPixels; // some source pixels have alpha < 0.5 - - static float s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS]; - - private: - - void TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2); - void TryDifferentialHalf(DifferentialTrys::Half *a_phalf); - - void TryT(unsigned int a_uiRadius); - void 
TryT_BestSelectorCombination(void); - void TryH(unsigned int a_uiRadius); - void TryH_BestSelectorCombination(void); - - void TryDegenerates1(void); - void TryDegenerates2(void); - void TryDegenerates3(void); - void TryDegenerates4(void); - - }; - - // ################################################################################ - // Block4x4Encoding_RGB8A1_Opaque - // RGB8A1 if all pixels have alpha==1 - // ################################################################################ - - class Block4x4Encoding_RGB8A1_Opaque : public Block4x4Encoding_RGB8A1 - { - public: - - virtual void PerformIteration(float a_fEffort); - - void PerformFirstIteration(void); - - private: - - }; - - // ################################################################################ - // Block4x4Encoding_RGB8A1_Transparent - // RGB8A1 if all pixels have alpha==0 - // ################################################################################ - - class Block4x4Encoding_RGB8A1_Transparent : public Block4x4Encoding_RGB8A1 - { - public: - - virtual void PerformIteration(float a_fEffort); - - private: - - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp deleted file mode 100644 index 600c7ab405..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp +++ /dev/null @@ -1,474 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RGBA8.cpp contains: - Block4x4Encoding_RGBA8 - Block4x4Encoding_RGBA8_Opaque - Block4x4Encoding_RGBA8_Transparent - -These encoders are used when targetting file format RGBA8. 
- -Block4x4Encoding_RGBA8_Opaque is used when all pixels in the 4x4 block are opaque -Block4x4Encoding_RGBA8_Transparent is used when all pixels in the 4x4 block are transparent -Block4x4Encoding_RGBA8 is used when there is a mixture of alphas in the 4x4 block - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RGBA8.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <float.h> -#include <limits> - -namespace Etc -{ - - // #################################################################################################### - // Block4x4Encoding_RGBA8 - // #################################################################################################### - - float Block4x4Encoding_RGBA8::s_aafModifierTable[MODIFIER_TABLE_ENTRYS][ALPHA_SELECTORS] - { - { -3.0f / 255.0f, -6.0f / 255.0f, -9.0f / 255.0f, -15.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 8.0f / 255.0f, 14.0f / 255.0f }, - { -3.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, -13.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f, 12.0f / 255.0f }, - { -2.0f / 255.0f, -5.0f / 255.0f, -8.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 12.0f / 255.0f }, - { -2.0f / 255.0f, -4.0f / 255.0f, -6.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 12.0f / 255.0f }, - - { -3.0f / 255.0f, -6.0f / 255.0f, -8.0f / 255.0f, -12.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 11.0f / 255.0f }, - { -3.0f / 255.0f, -7.0f / 255.0f, -9.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 8.0f / 255.0f, 10.0f / 255.0f }, - { -4.0f / 255.0f, -7.0f / 255.0f, -8.0f / 255.0f, -11.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f }, - { -3.0f / 255.0f, -5.0f / 255.0f, -8.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f }, - - { -2.0f / 255.0f, -6.0f / 255.0f, -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 9.0f / 255.0f }, - { -2.0f / 255.0f, -5.0f / 255.0f, -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 9.0f / 255.0f }, - { -2.0f / 255.0f, -4.0f / 255.0f, -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 7.0f / 255.0f, 9.0f / 255.0f }, - { -2.0f / 255.0f, -5.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f }, - - { -3.0f / 255.0f, -4.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, 2.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f }, - { -1.0f / 255.0f, -2.0f / 255.0f, -3.0f / 255.0f, -10.0f / 255.0f, 0.0f / 255.0f, 1.0f / 255.0f, 2.0f / 255.0f, 9.0f / 255.0f }, - { -4.0f / 255.0f, -6.0f / 255.0f, -8.0f / 255.0f, -9.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 8.0f / 255.0f }, - { -3.0f / 255.0f, -5.0f / 255.0f, -7.0f / 255.0f, -9.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f, 8.0f / 255.0f } - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RGBA8::Block4x4Encoding_RGBA8(void) - { - - m_pencodingbitsA8 = nullptr; - - } - Block4x4Encoding_RGBA8::~Block4x4Encoding_RGBA8(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 
block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_RGBA8::InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) - { - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric); - - m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + sizeof(Block4x4EncodingBits_A8)); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RGBA8::InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric) - { - - m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + sizeof(Block4x4EncodingBits_A8)); - - // init RGB portion - Block4x4Encoding_RGB8::InitFromEncodingBits(a_pblockParent, - (unsigned char *) m_pencodingbitsRGB8, - a_pafrgbaSource, - a_errormetric); - - // init A8 portion - // has to be done after InitFromEncodingBits() - { - m_fBase = m_pencodingbitsA8->data.base / 255.0f; - m_fMultiplier = (float)m_pencodingbitsA8->data.multiplier; - m_uiModifierTableIndex = m_pencodingbitsA8->data.table; - - unsigned long long int ulliSelectorBits = 0; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors0 << 40; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors1 << 32; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors2 << 24; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors3 << 16; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors4 << 8; - ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors5; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiShift = 45 - (3 * uiPixel); - m_auiAlphaSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (ALPHA_SELECTORS - 1); - } - - // decode the alphas - // calc alpha error - m_fError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afDecodedAlphas[uiPixel] = DecodePixelAlpha(m_fBase, m_fMultiplier, - m_uiModifierTableIndex, - m_auiAlphaSelectors[uiPixel]); - - float fDeltaAlpha = m_afDecodedAlphas[uiPixel] - m_pafrgbaSource[uiPixel].fA; - m_fError += fDeltaAlpha * fDeltaAlpha; - } - } - - // redo error calc to include alpha - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - // similar to Block4x4Encoding_RGB8_Base::Encode_RGB8(), but with alpha added - // - void Block4x4Encoding_RGBA8::PerformIteration(float a_fEffort) - { - assert(!m_boolDone); - - if 
(m_uiEncodingIterations == 0) - { - if (a_fEffort < 24.9f) - { - CalculateA8(0.0f); - } - else if (a_fEffort < 49.9f) - { - CalculateA8(1.0f); - } - else - { - CalculateA8(2.0f); - } - } - - Block4x4Encoding_RGB8::PerformIteration(a_fEffort); - - } - - // ---------------------------------------------------------------------------------------------------- - // find the best combination of base alpga, multiplier and selectors - // - // a_fRadius limits the range of base alpha to try - // - void Block4x4Encoding_RGBA8::CalculateA8(float a_fRadius) - { - - // find min/max alpha - float fMinAlpha = 1.0f; - float fMaxAlpha = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - float fAlpha = m_pafrgbaSource[uiPixel].fA; - - // ignore border pixels - if (isnan(fAlpha)) - { - continue; - } - - if (fAlpha < fMinAlpha) - { - fMinAlpha = fAlpha; - } - if (fAlpha > fMaxAlpha) - { - fMaxAlpha = fAlpha; - } - } - assert(fMinAlpha <= fMaxAlpha); - - float fAlphaRange = fMaxAlpha - fMinAlpha; - - // try each modifier table entry - m_fError = FLT_MAX; // artificially high value - for (unsigned int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++) - { - static const unsigned int MIN_VALUE_SELECTOR = 3; - static const unsigned int MAX_VALUE_SELECTOR = 7; - - float fTableEntryCenter = -s_aafModifierTable[uiTableEntry][MIN_VALUE_SELECTOR]; - - float fTableEntryRange = s_aafModifierTable[uiTableEntry][MAX_VALUE_SELECTOR] - - s_aafModifierTable[uiTableEntry][MIN_VALUE_SELECTOR]; - - float fCenterRatio = fTableEntryCenter / fTableEntryRange; - - float fCenter = fMinAlpha + fCenterRatio*fAlphaRange; - fCenter = roundf(255.0f * fCenter) / 255.0f; - - float fMinBase = fCenter - (a_fRadius / 255.0f); - if (fMinBase < 0.0f) - { - fMinBase = 0.0f; - } - - float fMaxBase = fCenter + (a_fRadius / 255.0f); - if (fMaxBase > 1.0f) - { - fMaxBase = 1.0f; - } - - for (float fBase = fMinBase; fBase <= fMaxBase; fBase += (0.999999f / 255.0f)) - { - - float fRangeMultiplier = roundf(fAlphaRange / fTableEntryRange); - - float fMinMultiplier = fRangeMultiplier - a_fRadius; - if (fMinMultiplier < 1.0f) - { - fMinMultiplier = 1.0f; - } - else if (fMinMultiplier > 15.0f) - { - fMinMultiplier = 15.0f; - } - - float fMaxMultiplier = fRangeMultiplier + a_fRadius; - if (fMaxMultiplier < 1.0f) - { - fMaxMultiplier = 1.0f; - } - else if (fMaxMultiplier > 15.0f) - { - fMaxMultiplier = 15.0f; - } - - for (float fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier += 1.0f) - { - // find best selector for each pixel - unsigned int auiBestSelectors[PIXELS]; - float afBestAlphaError[PIXELS]; - float afBestDecodedAlphas[PIXELS]; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - float fBestPixelAlphaError = FLT_MAX; - for (unsigned int uiSelector = 0; uiSelector < ALPHA_SELECTORS; uiSelector++) - { - float fDecodedAlpha = DecodePixelAlpha(fBase, fMultiplier, uiTableEntry, uiSelector); - - // border pixels (NAN) should have zero error - float fPixelDeltaAlpha = isnan(m_pafrgbaSource[uiPixel].fA) ? 
- 0.0f : - fDecodedAlpha - m_pafrgbaSource[uiPixel].fA; - - float fPixelAlphaError = fPixelDeltaAlpha * fPixelDeltaAlpha; - - if (fPixelAlphaError < fBestPixelAlphaError) - { - fBestPixelAlphaError = fPixelAlphaError; - auiBestSelectors[uiPixel] = uiSelector; - afBestAlphaError[uiPixel] = fBestPixelAlphaError; - afBestDecodedAlphas[uiPixel] = fDecodedAlpha; - } - } - } - - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestAlphaError[uiPixel]; - } - - if (fBlockError < m_fError) - { - m_fError = fBlockError; - - m_fBase = fBase; - m_fMultiplier = fMultiplier; - m_uiModifierTableIndex = uiTableEntry; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiAlphaSelectors[uiPixel] = auiBestSelectors[uiPixel]; - m_afDecodedAlphas[uiPixel] = afBestDecodedAlphas[uiPixel]; - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGBA8::SetEncodingBits(void) - { - - // set the RGB8 portion - Block4x4Encoding_RGB8::SetEncodingBits(); - - // set the A8 portion - { - m_pencodingbitsA8->data.base = (unsigned char)roundf(255.0f * m_fBase); - m_pencodingbitsA8->data.table = m_uiModifierTableIndex; - m_pencodingbitsA8->data.multiplier = (unsigned char)roundf(m_fMultiplier); - - unsigned long long int ulliSelectorBits = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiShift = 45 - (3 * uiPixel); - ulliSelectorBits |= ((unsigned long long int)m_auiAlphaSelectors[uiPixel]) << uiShift; - } - - m_pencodingbitsA8->data.selectors0 = ulliSelectorBits >> 40; - m_pencodingbitsA8->data.selectors1 = ulliSelectorBits >> 32; - m_pencodingbitsA8->data.selectors2 = ulliSelectorBits >> 24; - m_pencodingbitsA8->data.selectors3 = ulliSelectorBits >> 16; - m_pencodingbitsA8->data.selectors4 = ulliSelectorBits >> 8; - m_pencodingbitsA8->data.selectors5 = ulliSelectorBits; - } - - } - - // #################################################################################################### - // Block4x4Encoding_RGBA8_Opaque - // #################################################################################################### - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGBA8_Opaque::PerformIteration(float a_fEffort) - { - assert(!m_boolDone); - - if (m_uiEncodingIterations == 0) - { - m_fError = 0.0f; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afDecodedAlphas[uiPixel] = 1.0f; - } - } - - Block4x4Encoding_RGB8::PerformIteration(a_fEffort); - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGBA8_Opaque::SetEncodingBits(void) - { - - // set the RGB8 portion - Block4x4Encoding_RGB8::SetEncodingBits(); - - // set the A8 portion - m_pencodingbitsA8->data.base = 255; - m_pencodingbitsA8->data.table = 15; - m_pencodingbitsA8->data.multiplier = 15; - m_pencodingbitsA8->data.selectors0 = 0xFF; - m_pencodingbitsA8->data.selectors1 = 0xFF; - 
m_pencodingbitsA8->data.selectors2 = 0xFF; - m_pencodingbitsA8->data.selectors3 = 0xFF; - m_pencodingbitsA8->data.selectors4 = 0xFF; - m_pencodingbitsA8->data.selectors5 = 0xFF; - - } - - // #################################################################################################### - // Block4x4Encoding_RGBA8_Transparent - // #################################################################################################### - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGBA8_Transparent::PerformIteration(float ) - { - assert(!m_boolDone); - assert(m_uiEncodingIterations == 0); - - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = false; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - m_afDecodedAlphas[uiPixel] = 0.0f; - } - - m_fError = 0.0f; - - m_boolDone = true; - m_uiEncodingIterations++; - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGBA8_Transparent::SetEncodingBits(void) - { - - Block4x4Encoding_RGB8::SetEncodingBits(); - - // set the A8 portion - m_pencodingbitsA8->data.base = 0; - m_pencodingbitsA8->data.table = 0; - m_pencodingbitsA8->data.multiplier = 1; - m_pencodingbitsA8->data.selectors0 = 0; - m_pencodingbitsA8->data.selectors1 = 0; - m_pencodingbitsA8->data.selectors2 = 0; - m_pencodingbitsA8->data.selectors3 = 0; - m_pencodingbitsA8->data.selectors4 = 0; - m_pencodingbitsA8->data.selectors5 = 0; - - } - - // ---------------------------------------------------------------------------------------------------- - // -} diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.h deleted file mode 100644 index 5765d36b90..0000000000 --- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcBlock4x4Encoding_RGB8.h" - -namespace Etc -{ - class Block4x4EncodingBits_A8; - - // ################################################################################ - // Block4x4Encoding_RGBA8 - // RGBA8 if not completely opaque or transparent - // ################################################################################ - - class Block4x4Encoding_RGBA8 : public Block4x4Encoding_RGB8 - { - public: - - Block4x4Encoding_RGBA8(void); - virtual ~Block4x4Encoding_RGBA8(void); - - virtual void InitFromSource(Block4x4 *a_pblockParent, - ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric); - - virtual void InitFromEncodingBits(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric); - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - protected: - - static const unsigned int MODIFIER_TABLE_ENTRYS = 16; - static const unsigned int ALPHA_SELECTOR_BITS = 3; - static const unsigned int ALPHA_SELECTORS = 1 << ALPHA_SELECTOR_BITS; - - static float s_aafModifierTable[MODIFIER_TABLE_ENTRYS][ALPHA_SELECTORS]; - - void CalculateA8(float a_fRadius); - - Block4x4EncodingBits_A8 *m_pencodingbitsA8; // A8 portion of Block4x4EncodingBits_RGBA8 - - float m_fBase; - float m_fMultiplier; - unsigned int m_uiModifierTableIndex; - unsigned int m_auiAlphaSelectors[PIXELS]; - - private: - - inline float DecodePixelAlpha(float a_fBase, float a_fMultiplier, - unsigned int a_uiTableIndex, unsigned int a_uiSelector) - { - float fPixelAlpha = a_fBase + - a_fMultiplier*s_aafModifierTable[a_uiTableIndex][a_uiSelector]; - if (fPixelAlpha < 0.0f) - { - fPixelAlpha = 0.0f; - } - else if (fPixelAlpha > 1.0f) - { - fPixelAlpha = 1.0f; - } - - return fPixelAlpha; - } - - }; - - // ################################################################################ - // Block4x4Encoding_RGBA8_Opaque - // RGBA8 if all pixels have alpha==1 - // ################################################################################ - - class Block4x4Encoding_RGBA8_Opaque : public Block4x4Encoding_RGBA8 - { - public: - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - }; - - // ################################################################################ - // Block4x4Encoding_RGBA8_Transparent - // RGBA8 if all pixels have alpha==0 - // ################################################################################ - - class Block4x4Encoding_RGBA8_Transparent : public Block4x4Encoding_RGBA8 - { - public: - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcColor.h b/thirdparty/etc2comp/EtcColor.h deleted file mode 100644 index 7ceae05b65..0000000000 --- a/thirdparty/etc2comp/EtcColor.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
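The alpha codec in the deleted `Block4x4Encoding_RGBA8` code above boils down to one formula: each 4x4 block stores an 8-bit base, a 4-bit multiplier, a 4-bit modifier-table index and sixteen 3-bit selectors, and every pixel decodes as `clamp(base + multiplier * table[tableIndex][selector], 0, 1)`. A minimal sketch of that decode, reusing the first row of the `s_aafModifierTable` shown earlier; the 48-bit selector packing mirrors the shifts in `InitFromEncodingBits()`, but the helper names are assumptions, not upstream API:

```cpp
// Minimal EAC-style alpha decode, mirroring DecodePixelAlpha() above.
#include <algorithm>
#include <cstdint>

// First row of the deleted s_aafModifierTable, in 1/255 units.
static const float kModifierRow0[8] = {
    -3/255.f, -6/255.f, -9/255.f, -15/255.f, 2/255.f, 5/255.f, 8/255.f, 14/255.f };

// alpha = clamp(base + multiplier * modifier[selector], 0, 1)
float DecodeAlpha(float base, float multiplier, unsigned selector) {
    float a = base + multiplier * kModifierRow0[selector & 7];
    return std::clamp(a, 0.0f, 1.0f);
}

// Selectors are packed MSB-first into 48 bits; pixel i lives at bit 45 - 3*i,
// matching the selectors0..selectors5 shifts in the deleted code.
unsigned SelectorForPixel(uint64_t selectorBits, unsigned pixel /* 0..15 */) {
    return static_cast<unsigned>((selectorBits >> (45 - 3 * pixel)) & 7);
}
```

The encoder's `CalculateA8()` search is the inverse problem: pick the base, multiplier, table row and selectors that minimise the summed squared error of this decode against the source alphas.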
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include <math.h> - -namespace Etc -{ - - inline float LogToLinear(float a_fLog) - { - static const float ALPHA = 0.055f; - static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; - - if (a_fLog <= 0.04045f) - { - return a_fLog / 12.92f; - } - else - { - return powf((a_fLog + ALPHA) / ONE_PLUS_ALPHA, 2.4f); - } - } - - inline float LinearToLog(float &a_fLinear) - { - static const float ALPHA = 0.055f; - static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; - - if (a_fLinear <= 0.0031308f) - { - return 12.92f * a_fLinear; - } - else - { - return ONE_PLUS_ALPHA * powf(a_fLinear, (1.0f/2.4f)) - ALPHA; - } - } - - class ColorR8G8B8A8 - { - public: - - unsigned char ucR; - unsigned char ucG; - unsigned char ucB; - unsigned char ucA; - - }; -} diff --git a/thirdparty/etc2comp/EtcColorFloatRGBA.h b/thirdparty/etc2comp/EtcColorFloatRGBA.h deleted file mode 100644 index f2ca2c1f71..0000000000 --- a/thirdparty/etc2comp/EtcColorFloatRGBA.h +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
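The `LogToLinear`/`LinearToLog` helpers in the deleted `EtcColor.h` above are the standard sRGB transfer functions; the constants 0.04045, 12.92, 0.055, 2.4 and 0.0031308 are the usual IEC 61966-2-1 values. In conventional notation they compute, approximately inverse to one another:

$$
\mathrm{linear}(s)=\begin{cases} s/12.92, & s\le 0.04045\\ \left(\dfrac{s+0.055}{1.055}\right)^{2.4}, & s>0.04045\end{cases}
\qquad
\mathrm{srgb}(\ell)=\begin{cases} 12.92\,\ell, & \ell\le 0.0031308\\ 1.055\,\ell^{1/2.4}-0.055, & \ell>0.0031308\end{cases}
$$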
- */ - -#pragma once - -#include "EtcConfig.h" -#include "EtcColor.h" - -#include <math.h> - -namespace Etc -{ - - class ColorFloatRGBA - { - public: - - ColorFloatRGBA(void) - { - fR = fG = fB = fA = 0.0f; - } - - ColorFloatRGBA(float a_fR, float a_fG, float a_fB, float a_fA) - { - fR = a_fR; - fG = a_fG; - fB = a_fB; - fA = a_fA; - } - - inline ColorFloatRGBA operator+(ColorFloatRGBA& a_rfrgba) - { - ColorFloatRGBA frgba; - frgba.fR = fR + a_rfrgba.fR; - frgba.fG = fG + a_rfrgba.fG; - frgba.fB = fB + a_rfrgba.fB; - frgba.fA = fA + a_rfrgba.fA; - return frgba; - } - - inline ColorFloatRGBA operator+(float a_f) - { - ColorFloatRGBA frgba; - frgba.fR = fR + a_f; - frgba.fG = fG + a_f; - frgba.fB = fB + a_f; - frgba.fA = fA; - return frgba; - } - - inline ColorFloatRGBA operator-(float a_f) - { - ColorFloatRGBA frgba; - frgba.fR = fR - a_f; - frgba.fG = fG - a_f; - frgba.fB = fB - a_f; - frgba.fA = fA; - return frgba; - } - - inline ColorFloatRGBA operator-(ColorFloatRGBA& a_rfrgba) - { - ColorFloatRGBA frgba; - frgba.fR = fR - a_rfrgba.fR; - frgba.fG = fG - a_rfrgba.fG; - frgba.fB = fB - a_rfrgba.fB; - frgba.fA = fA - a_rfrgba.fA; - return frgba; - } - - inline ColorFloatRGBA operator*(float a_f) - { - ColorFloatRGBA frgba; - frgba.fR = fR * a_f; - frgba.fG = fG * a_f; - frgba.fB = fB * a_f; - frgba.fA = fA; - - return frgba; - } - - inline ColorFloatRGBA ScaleRGB(float a_f) - { - ColorFloatRGBA frgba; - frgba.fR = a_f * fR; - frgba.fG = a_f * fG; - frgba.fB = a_f * fB; - frgba.fA = fA; - - return frgba; - } - - inline ColorFloatRGBA RoundRGB(void) - { - ColorFloatRGBA frgba; - frgba.fR = roundf(fR); - frgba.fG = roundf(fG); - frgba.fB = roundf(fB); - - return frgba; - } - - inline ColorFloatRGBA ToLinear() - { - ColorFloatRGBA frgbaLinear; - frgbaLinear.fR = LogToLinear(fR); - frgbaLinear.fG = LogToLinear(fG); - frgbaLinear.fB = LogToLinear(fB); - frgbaLinear.fA = fA; - - return frgbaLinear; - } - - inline ColorFloatRGBA ToLog(void) - { - ColorFloatRGBA frgbaLog; - frgbaLog.fR = LinearToLog(fR); - frgbaLog.fG = LinearToLog(fG); - frgbaLog.fB = LinearToLog(fB); - frgbaLog.fA = fA; - - return frgbaLog; - } - - inline static ColorFloatRGBA ConvertFromRGBA8(unsigned char a_ucR, - unsigned char a_ucG, unsigned char a_ucB, unsigned char a_ucA) - { - ColorFloatRGBA frgba; - - frgba.fR = (float)a_ucR / 255.0f; - frgba.fG = (float)a_ucG / 255.0f; - frgba.fB = (float)a_ucB / 255.0f; - frgba.fA = (float)a_ucA / 255.0f; - - return frgba; - } - - inline static ColorFloatRGBA ConvertFromRGB4(unsigned char a_ucR4, - unsigned char a_ucG4, - unsigned char a_ucB4) - { - ColorFloatRGBA frgba; - - unsigned char ucR8 = (unsigned char)((a_ucR4 << 4) + a_ucR4); - unsigned char ucG8 = (unsigned char)((a_ucG4 << 4) + a_ucG4); - unsigned char ucB8 = (unsigned char)((a_ucB4 << 4) + a_ucB4); - - frgba.fR = (float)ucR8 / 255.0f; - frgba.fG = (float)ucG8 / 255.0f; - frgba.fB = (float)ucB8 / 255.0f; - frgba.fA = 1.0f; - - return frgba; - } - - inline static ColorFloatRGBA ConvertFromRGB5(unsigned char a_ucR5, - unsigned char a_ucG5, - unsigned char a_ucB5) - { - ColorFloatRGBA frgba; - - unsigned char ucR8 = (unsigned char)((a_ucR5 << 3) + (a_ucR5 >> 2)); - unsigned char ucG8 = (unsigned char)((a_ucG5 << 3) + (a_ucG5 >> 2)); - unsigned char ucB8 = (unsigned char)((a_ucB5 << 3) + (a_ucB5 >> 2)); - - frgba.fR = (float)ucR8 / 255.0f; - frgba.fG = (float)ucG8 / 255.0f; - frgba.fB = (float)ucB8 / 255.0f; - frgba.fA = 1.0f; - - return frgba; - } - - inline static ColorFloatRGBA ConvertFromR6G7B6(unsigned char a_ucR6, - 
unsigned char a_ucG7, - unsigned char a_ucB6) - { - ColorFloatRGBA frgba; - - unsigned char ucR8 = (unsigned char)((a_ucR6 << 2) + (a_ucR6 >> 4)); - unsigned char ucG8 = (unsigned char)((a_ucG7 << 1) + (a_ucG7 >> 6)); - unsigned char ucB8 = (unsigned char)((a_ucB6 << 2) + (a_ucB6 >> 4)); - - frgba.fR = (float)ucR8 / 255.0f; - frgba.fG = (float)ucG8 / 255.0f; - frgba.fB = (float)ucB8 / 255.0f; - frgba.fA = 1.0f; - - return frgba; - } - - // quantize to 4 bits, expand to 8 bits - inline ColorFloatRGBA QuantizeR4G4B4(void) const - { - ColorFloatRGBA frgba = *this; - - // quantize to 4 bits - frgba = frgba.ClampRGB().ScaleRGB(15.0f).RoundRGB(); - unsigned int uiR4 = (unsigned int)frgba.fR; - unsigned int uiG4 = (unsigned int)frgba.fG; - unsigned int uiB4 = (unsigned int)frgba.fB; - - // expand to 8 bits - frgba.fR = (float) ((uiR4 << 4) + uiR4); - frgba.fG = (float) ((uiG4 << 4) + uiG4); - frgba.fB = (float) ((uiB4 << 4) + uiB4); - - frgba = frgba.ScaleRGB(1.0f/255.0f); - - return frgba; - } - - // quantize to 5 bits, expand to 8 bits - inline ColorFloatRGBA QuantizeR5G5B5(void) const - { - ColorFloatRGBA frgba = *this; - - // quantize to 5 bits - frgba = frgba.ClampRGB().ScaleRGB(31.0f).RoundRGB(); - unsigned int uiR5 = (unsigned int)frgba.fR; - unsigned int uiG5 = (unsigned int)frgba.fG; - unsigned int uiB5 = (unsigned int)frgba.fB; - - // expand to 8 bits - frgba.fR = (float)((uiR5 << 3) + (uiR5 >> 2)); - frgba.fG = (float)((uiG5 << 3) + (uiG5 >> 2)); - frgba.fB = (float)((uiB5 << 3) + (uiB5 >> 2)); - - frgba = frgba.ScaleRGB(1.0f / 255.0f); - - return frgba; - } - - // quantize to 6/7/6 bits, expand to 8 bits - inline ColorFloatRGBA QuantizeR6G7B6(void) const - { - ColorFloatRGBA frgba = *this; - - // quantize to 6/7/6 bits - ColorFloatRGBA frgba6 = frgba.ClampRGB().ScaleRGB(63.0f).RoundRGB(); - ColorFloatRGBA frgba7 = frgba.ClampRGB().ScaleRGB(127.0f).RoundRGB(); - unsigned int uiR6 = (unsigned int)frgba6.fR; - unsigned int uiG7 = (unsigned int)frgba7.fG; - unsigned int uiB6 = (unsigned int)frgba6.fB; - - // expand to 8 bits - frgba.fR = (float)((uiR6 << 2) + (uiR6 >> 4)); - frgba.fG = (float)((uiG7 << 1) + (uiG7 >> 6)); - frgba.fB = (float)((uiB6 << 2) + (uiB6 >> 4)); - - frgba = frgba.ScaleRGB(1.0f / 255.0f); - - return frgba; - } - - inline ColorFloatRGBA ClampRGB(void) - { - ColorFloatRGBA frgba = *this; - if (frgba.fR < 0.0f) { frgba.fR = 0.0f; } - if (frgba.fR > 1.0f) { frgba.fR = 1.0f; } - if (frgba.fG < 0.0f) { frgba.fG = 0.0f; } - if (frgba.fG > 1.0f) { frgba.fG = 1.0f; } - if (frgba.fB < 0.0f) { frgba.fB = 0.0f; } - if (frgba.fB > 1.0f) { frgba.fB = 1.0f; } - - return frgba; - } - - inline ColorFloatRGBA ClampRGBA(void) - { - ColorFloatRGBA frgba = *this; - if (frgba.fR < 0.0f) { frgba.fR = 0.0f; } - if (frgba.fR > 1.0f) { frgba.fR = 1.0f; } - if (frgba.fG < 0.0f) { frgba.fG = 0.0f; } - if (frgba.fG > 1.0f) { frgba.fG = 1.0f; } - if (frgba.fB < 0.0f) { frgba.fB = 0.0f; } - if (frgba.fB > 1.0f) { frgba.fB = 1.0f; } - if (frgba.fA < 0.0f) { frgba.fA = 0.0f; } - if (frgba.fA > 1.0f) { frgba.fA = 1.0f; } - - return frgba; - } - - inline int IntRed(float a_fScale) - { - return (int)roundf(fR * a_fScale); - } - - inline int IntGreen(float a_fScale) - { - return (int)roundf(fG * a_fScale); - } - - inline int IntBlue(float a_fScale) - { - return (int)roundf(fB * a_fScale); - } - - inline int IntAlpha(float a_fScale) - { - return (int)roundf(fA * a_fScale); - } - - float fR, fG, fB, fA; - }; - -} - diff --git a/thirdparty/etc2comp/EtcConfig.h b/thirdparty/etc2comp/EtcConfig.h deleted 
file mode 100644 index 3bfe1d99a8..0000000000 --- a/thirdparty/etc2comp/EtcConfig.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#ifdef _WIN32 -#define ETC_WINDOWS (1) -#else -#define ETC_WINDOWS (0) -#endif - -#if __APPLE__ -#define ETC_OSX (1) -#else -#define ETC_OSX (0) -#endif - -#if __unix__ -#define ETC_UNIX (1) -#else -#define ETC_UNIX (0) -#endif - - -// short names for common types -#include <stdint.h> -typedef int8_t i8; -typedef int16_t i16; -typedef int32_t i32; -typedef int64_t i64; - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -typedef float f32; -typedef double f64; - -// Keep asserts enabled in release builds during development -#undef NDEBUG - -// 0=disable. stb_image can be used if you need to compress -//other image formats like jpg -#define USE_STB_IMAGE_LOAD 0 - -#if ETC_WINDOWS -#include <sdkddkver.h> -#define _CRT_SECURE_NO_WARNINGS (1) -#include <tchar.h> -#endif - -#include <stdio.h> - diff --git a/thirdparty/etc2comp/EtcDifferentialTrys.cpp b/thirdparty/etc2comp/EtcDifferentialTrys.cpp deleted file mode 100644 index ef4cd103d9..0000000000 --- a/thirdparty/etc2comp/EtcDifferentialTrys.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* -EtcDifferentialTrys.cpp - -Gathers the results of the various encoding trys for both halves of a 4x4 block for Differential mode - -*/ - -#include "EtcConfig.h" -#include "EtcDifferentialTrys.h" - -#include <assert.h> - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // construct a list of trys (encoding attempts) - // - // a_frgbaColor1 is the basecolor for the first half - // a_frgbaColor2 is the basecolor for the second half - // a_pauiPixelMapping1 is the pixel order for the first half - // a_pauiPixelMapping2 is the pixel order for the second half - // a_uiRadius is the amount to vary the base colors - // - DifferentialTrys::DifferentialTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2) - { - assert(a_uiRadius <= MAX_RADIUS); - - m_boolSeverelyBentColors = false; - - ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR5G5B5(); - ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR5G5B5(); - - // quantize base colors - // ensure that trys with a_uiRadius don't overflow - int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(31.0f)+a_iGrayOffset1, a_uiRadius); - int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(31.0f) + a_iGrayOffset1, a_uiRadius); - int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(31.0f) + a_iGrayOffset1, a_uiRadius); - int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(31.0f) + a_iGrayOffset2, a_uiRadius); - int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(31.0f) + a_iGrayOffset2, a_uiRadius); - int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(31.0f) + a_iGrayOffset2, a_uiRadius); - - int iDeltaRed = iRed2 - iRed1; - int iDeltaGreen = iGreen2 - iGreen1; - int iDeltaBlue = iBlue2 - iBlue1; - - // make sure components are within range - { - if (iDeltaRed > 3) - { - if (iDeltaRed > 7) - { - m_boolSeverelyBentColors = true; - } - - iRed1 += (iDeltaRed - 3) / 2; - iRed2 = iRed1 + 3; - iDeltaRed = 3; - } - else if (iDeltaRed < -4) - { - if (iDeltaRed < -8) - { - m_boolSeverelyBentColors = true; - } - - iRed1 += (iDeltaRed + 4) / 2; - iRed2 = iRed1 - 4; - iDeltaRed = -4; - } - assert(iRed1 >= (signed)(0 + a_uiRadius) && iRed1 <= (signed)(31 - a_uiRadius)); - assert(iRed2 >= (signed)(0 + a_uiRadius) && iRed2 <= (signed)(31 - a_uiRadius)); - assert(iDeltaRed >= -4 && iDeltaRed <= 3); - - if (iDeltaGreen > 3) - { - if (iDeltaGreen > 7) - { - m_boolSeverelyBentColors = true; - } - - iGreen1 += (iDeltaGreen - 3) / 2; - iGreen2 = iGreen1 + 3; - iDeltaGreen = 3; - } - else if (iDeltaGreen < -4) - { - if (iDeltaGreen < -8) - { - m_boolSeverelyBentColors = true; - } - - iGreen1 += (iDeltaGreen + 4) / 2; - iGreen2 = iGreen1 - 4; - iDeltaGreen = -4; - } - assert(iGreen1 >= (signed)(0 + a_uiRadius) && iGreen1 <= (signed)(31 - a_uiRadius)); - assert(iGreen2 >= (signed)(0 + a_uiRadius) && iGreen2 <= (signed)(31 - a_uiRadius)); - assert(iDeltaGreen >= -4 && iDeltaGreen <= 3); - - if (iDeltaBlue > 3) - { - if (iDeltaBlue > 7) - { - m_boolSeverelyBentColors = true; - } - - iBlue1 += (iDeltaBlue - 3) / 2; - iBlue2 = iBlue1 + 3; - iDeltaBlue = 3; - } - else if (iDeltaBlue < -4) - { - if (iDeltaBlue < -8) - { - m_boolSeverelyBentColors = true; - } - - iBlue1 += (iDeltaBlue + 4) / 2; - iBlue2 = iBlue1 - 4; - iDeltaBlue = -4; - } - assert(iBlue1 >= (signed)(0+a_uiRadius) 
&& iBlue1 <= (signed)(31 - a_uiRadius)); - assert(iBlue2 >= (signed)(0 + a_uiRadius) && iBlue2 <= (signed)(31 - a_uiRadius)); - assert(iDeltaBlue >= -4 && iDeltaBlue <= 3); - } - - m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius); - m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius); - - } - - // ---------------------------------------------------------------------------------------------------- - // - void DifferentialTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius) - { - - m_iRed = a_iRed; - m_iGreen = a_iGreen; - m_iBlue = a_iBlue; - - m_pauiPixelMapping = a_pauiPixelMapping; - m_uiRadius = a_uiRadius; - - m_uiTrys = 0; - - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcDifferentialTrys.h b/thirdparty/etc2comp/EtcDifferentialTrys.h deleted file mode 100644 index 71860908ff..0000000000 --- a/thirdparty/etc2comp/EtcDifferentialTrys.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcColorFloatRGBA.h" - -namespace Etc -{ - - class DifferentialTrys - { - public: - - static const unsigned int MAX_RADIUS = 2; - - DifferentialTrys(ColorFloatRGBA a_frgbaColor1, - ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2); - - inline static int MoveAwayFromEdge(int a_i, int a_iDistance) - { - if (a_i < (0+ a_iDistance)) - { - return (0 + a_iDistance); - } - else if (a_i > (31- a_iDistance)) - { - return (31 - a_iDistance); - } - - return a_i; - } - - class Try - { - public : - static const unsigned int SELECTORS = 8; // per half - - int m_iRed; - int m_iGreen; - int m_iBlue; - unsigned int m_uiCW; - unsigned int m_auiSelectors[SELECTORS]; - float m_fError; - }; - - class Half - { - public: - - static const unsigned int MAX_TRYS = 125; - - void Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, - unsigned int a_uiRadius); - - // center of trys - int m_iRed; - int m_iGreen; - int m_iBlue; - - const unsigned int *m_pauiPixelMapping; - unsigned int m_uiRadius; - - unsigned int m_uiTrys; - Try m_atry[MAX_TRYS]; - - Try *m_ptryBest; - }; - - Half m_half1; - Half m_half2; - - bool m_boolSeverelyBentColors; - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcErrorMetric.h b/thirdparty/etc2comp/EtcErrorMetric.h deleted file mode 100644 index df4dcab4fb..0000000000 --- a/thirdparty/etc2comp/EtcErrorMetric.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. 
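The clamping logic in the deleted `DifferentialTrys` constructor above exists because ETC1/ETC2 differential mode stores the second base colour as a per-channel 3-bit two's-complement delta, so only deltas in [-4, 3] are representable; anything larger is "bent" back into range by moving both 5-bit endpoints toward each other, and deltas roughly twice that size set `m_boolSeverelyBentColors` as a hint that differential mode is a poor fit for the block. A minimal per-channel sketch of that bending, with illustrative names:

```cpp
// Bend a 5-bit endpoint pair so that c2 - c1 fits the 3-bit signed delta range.
struct BentChannel { int c1, c2, delta; bool severelyBent; };

BentChannel BendDelta(int c1, int c2) {
    BentChannel r{c1, c2, c2 - c1, false};
    if (r.delta > 3) {
        r.severelyBent = r.delta > 7;        // same thresholds as the deleted code
        r.c1 += (r.delta - 3) / 2;           // split the excess between both ends
        r.c2 = r.c1 + 3;
        r.delta = 3;
    } else if (r.delta < -4) {
        r.severelyBent = r.delta < -8;
        r.c1 += (r.delta + 4) / 2;
        r.c2 = r.c1 - 4;
        r.delta = -4;
    }
    return r;
}
```

`MoveAwayFromEdge()` in the same file plays a complementary role: it keeps each endpoint at least `a_uiRadius` away from 0 and 31 so that the radius search around it cannot step outside the 5-bit range.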
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace Etc -{ - - enum ErrorMetric - { - RGBA, - RGBX, - REC709, - NUMERIC, - NORMALXYZ, - // - ERROR_METRICS, - // - BT709 = REC709 - }; - - inline const char *ErrorMetricToString(ErrorMetric errorMetric) - { - switch (errorMetric) - { - case RGBA: - return "RGBA"; - case RGBX: - return "RGBX"; - case REC709: - return "REC709"; - case NUMERIC: - return "NUMERIC"; - case NORMALXYZ: - return "NORMALXYZ"; - case ERROR_METRICS: - default: - return "UNKNOWN"; - } - } -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcFile.cpp b/thirdparty/etc2comp/EtcFile.cpp deleted file mode 100644 index 831a3aac45..0000000000 --- a/thirdparty/etc2comp/EtcFile.cpp +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef _WIN32 -#define _CRT_SECURE_NO_WARNINGS (1) -#endif - -#include "EtcConfig.h" - - -#include "EtcFile.h" - -#include "EtcFileHeader.h" -#include "EtcColor.h" -#include "Etc.h" -#include "EtcBlock4x4EncodingBits.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <stdlib.h> - -using namespace Etc; - -// ---------------------------------------------------------------------------------------------------- -// -File::File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat, - unsigned char *a_paucEncodingBits, unsigned int a_uiEncodingBitsBytes, - unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight, - unsigned int a_uiExtendedWidth, unsigned int a_uiExtendedHeight) -{ - if (a_pstrFilename == nullptr) - { - m_pstrFilename = const_cast<char *>(""); - } - else - { - m_pstrFilename = new char[strlen(a_pstrFilename) + 1]; - strcpy(m_pstrFilename, a_pstrFilename); - } - - m_fileformat = a_fileformat; - if (m_fileformat == Format::INFER_FROM_FILE_EXTENSION) - { - // ***** TODO: add this later ***** - m_fileformat = Format::KTX; - } - - m_imageformat = a_imageformat; - - m_uiNumMipmaps = 1; - m_pMipmapImages = new RawImage[m_uiNumMipmaps]; - m_pMipmapImages[0].paucEncodingBits = std::shared_ptr<unsigned char>(a_paucEncodingBits, [](unsigned char *p) { delete[] p; } ); - m_pMipmapImages[0].uiEncodingBitsBytes = a_uiEncodingBitsBytes; - m_pMipmapImages[0].uiExtendedWidth = a_uiExtendedWidth; - m_pMipmapImages[0].uiExtendedHeight = a_uiExtendedHeight; - - m_uiSourceWidth = a_uiSourceWidth; - m_uiSourceHeight = a_uiSourceHeight; - - switch (m_fileformat) - { - case Format::PKM: - m_pheader = new FileHeader_Pkm(this); - break; - - case Format::KTX: - m_pheader = new FileHeader_Ktx(this); - break; - - default: - assert(0); - break; - } - -} - -// ---------------------------------------------------------------------------------------------------- -// -File::File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat, - unsigned int a_uiNumMipmaps, RawImage *a_pMipmapImages, - unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight) -{ - if (a_pstrFilename == nullptr) - { - m_pstrFilename = const_cast<char *>(""); - } - else - { - m_pstrFilename = new char[strlen(a_pstrFilename) + 1]; - strcpy(m_pstrFilename, a_pstrFilename); - } - - m_fileformat = a_fileformat; - if (m_fileformat == Format::INFER_FROM_FILE_EXTENSION) - { - // ***** TODO: add this later ***** - m_fileformat = Format::KTX; - } - - m_imageformat = a_imageformat; - - m_uiNumMipmaps = a_uiNumMipmaps; - m_pMipmapImages = new RawImage[m_uiNumMipmaps]; - - for(unsigned int mip = 0; mip < m_uiNumMipmaps; mip++) - { - m_pMipmapImages[mip] = a_pMipmapImages[mip]; - } - - m_uiSourceWidth = a_uiSourceWidth; - m_uiSourceHeight = a_uiSourceHeight; - - switch (m_fileformat) - { - case Format::PKM: - m_pheader = new FileHeader_Pkm(this); - break; - - case Format::KTX: - m_pheader = new FileHeader_Ktx(this); - break; - - default: - assert(0); - break; - } - -} - -// ---------------------------------------------------------------------------------------------------- -// -File::File(const char *a_pstrFilename, Format a_fileformat) -{ - if (a_pstrFilename == nullptr) - { - return; - } - else - { - m_pstrFilename = new char[strlen(a_pstrFilename) + 1]; - strcpy(m_pstrFilename, a_pstrFilename); - } - - m_fileformat = a_fileformat; - if (m_fileformat == Format::INFER_FROM_FILE_EXTENSION) - { - // ***** TODO: add this later ***** - m_fileformat = Format::KTX; - } - - 
FILE *pfile = fopen(m_pstrFilename, "rb"); - if (pfile == nullptr) - { - printf("ERROR: Couldn't open %s", m_pstrFilename); - exit(1); - } - fseek(pfile, 0, SEEK_END); - unsigned int fileSize = ftell(pfile); - fseek(pfile, 0, SEEK_SET); - size_t szResult; - - m_pheader = new FileHeader_Ktx(this); - szResult = fread( ((FileHeader_Ktx*)m_pheader)->GetData(), 1, sizeof(FileHeader_Ktx::Data), pfile); - assert(szResult > 0); - - m_uiNumMipmaps = 1; - m_pMipmapImages = new RawImage[m_uiNumMipmaps]; - - if (((FileHeader_Ktx*)m_pheader)->GetData()->m_u32BytesOfKeyValueData > 0) - fseek(pfile, ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32BytesOfKeyValueData, SEEK_CUR); - szResult = fread(&m_pMipmapImages->uiEncodingBitsBytes, 1, sizeof(unsigned int), pfile); - assert(szResult > 0); - - m_pMipmapImages->paucEncodingBits = std::shared_ptr<unsigned char>(new unsigned char[m_pMipmapImages->uiEncodingBitsBytes], [](unsigned char *p) { delete[] p; } ); - assert(ftell(pfile) + m_pMipmapImages->uiEncodingBitsBytes <= fileSize); - szResult = fread(m_pMipmapImages->paucEncodingBits.get(), 1, m_pMipmapImages->uiEncodingBitsBytes, pfile); - assert(szResult == m_pMipmapImages->uiEncodingBitsBytes); - - uint32_t uiInternalFormat = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32GlInternalFormat; - uint32_t uiBaseInternalFormat = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32GlBaseInternalFormat; - - if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC1_RGB8 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC1_RGB8) - { - m_imageformat = Image::Format::ETC1; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RGB8 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RGB8) - { - m_imageformat = Image::Format::RGB8; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RGB8A1 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RGB8A1) - { - m_imageformat = Image::Format::RGB8A1; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RGBA8 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RGBA8) - { - m_imageformat = Image::Format::RGBA8; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_R11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_R11) - { - m_imageformat = Image::Format::R11; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_SIGNED_R11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_R11) - { - m_imageformat = Image::Format::SIGNED_R11; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RG11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RG11) - { - m_imageformat = Image::Format::RG11; - } - else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_SIGNED_RG11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RG11) - { - m_imageformat = Image::Format::SIGNED_RG11; - } - else - { - m_imageformat = Image::Format::UNKNOWN; - } - - m_uiSourceWidth = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32PixelWidth; - m_uiSourceHeight = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32PixelHeight; - m_pMipmapImages->uiExtendedWidth = Image::CalcExtendedDimension((unsigned short)m_uiSourceWidth); - m_pMipmapImages->uiExtendedHeight = Image::CalcExtendedDimension((unsigned short)m_uiSourceHeight); 
- - unsigned int uiBlocks = m_pMipmapImages->uiExtendedWidth * m_pMipmapImages->uiExtendedHeight / 16; - Block4x4EncodingBits::Format encodingbitsformat = Image::DetermineEncodingBitsFormat(m_imageformat); - unsigned int expectedbytes = uiBlocks * Block4x4EncodingBits::GetBytesPerBlock(encodingbitsformat); - assert(expectedbytes == m_pMipmapImages->uiEncodingBitsBytes); - - fclose(pfile); -} - -File::~File() -{ - if (m_pMipmapImages != nullptr) - { - delete [] m_pMipmapImages; - } - - if(m_pstrFilename != nullptr) - { - delete[] m_pstrFilename; - m_pstrFilename = nullptr; - } - if (m_pheader != nullptr) - { - delete m_pheader; - m_pheader = nullptr; - } -} - -void File::UseSingleBlock(int a_iPixelX, int a_iPixelY) -{ - if (a_iPixelX <= -1 || a_iPixelY <= -1) - return; - if (a_iPixelX >(int) m_uiSourceWidth) - { - //if we are using a ktx thats the size of a single block or less - //then make sure we use the 4x4 image as the single block - if (m_uiSourceWidth <= 4) - { - a_iPixelX = 0; - } - else - { - printf("blockAtHV: H coordinate out of range, capped to image width\n"); - a_iPixelX = m_uiSourceWidth - 1; - } - } - if (a_iPixelY >(int) m_uiSourceHeight) - { - //if we are using a ktx thats the size of a single block or less - //then make sure we use the 4x4 image as the single block - if (m_uiSourceHeight <= 4) - { - a_iPixelY= 0; - } - else - { - printf("blockAtHV: V coordinate out of range, capped to image height\n"); - a_iPixelY = m_uiSourceHeight - 1; - } - } - - unsigned int origWidth = m_uiSourceWidth; - unsigned int origHeight = m_uiSourceHeight; - - m_uiSourceWidth = 4; - m_uiSourceHeight = 4; - - Block4x4EncodingBits::Format encodingbitsformat = Image::DetermineEncodingBitsFormat(m_imageformat); - unsigned int uiEncodingBitsBytesPerBlock = Block4x4EncodingBits::GetBytesPerBlock(encodingbitsformat); - - int numMipmaps = 1; - RawImage* pMipmapImages = new RawImage[numMipmaps]; - pMipmapImages[0].uiExtendedWidth = Image::CalcExtendedDimension((unsigned short)m_uiSourceWidth); - pMipmapImages[0].uiExtendedHeight = Image::CalcExtendedDimension((unsigned short)m_uiSourceHeight); - pMipmapImages[0].uiEncodingBitsBytes = 0; - pMipmapImages[0].paucEncodingBits = std::shared_ptr<unsigned char>(new unsigned char[uiEncodingBitsBytesPerBlock], [](unsigned char *p) { delete[] p; }); - - //block position in pixels - // remove the bottom 2 bits to get the block coordinates - unsigned int iBlockPosX = (a_iPixelX & 0xFFFFFFFC); - unsigned int iBlockPosY = (a_iPixelY & 0xFFFFFFFC); - - int numXBlocks = (origWidth / 4); - int numYBlocks = (origHeight / 4); - - - // block location - //int iBlockX = (a_iPixelX % 4) == 0 ? a_iPixelX / 4.0f : (a_iPixelX / 4) + 1; - //int iBlockY = (a_iPixelY % 4) == 0 ? 
a_iPixelY / 4.0f : (a_iPixelY / 4) + 1; - //m_paucEncodingBits += ((iBlockY * numXBlocks) + iBlockX) * uiEncodingBitsBytesPerBlock; - - - unsigned int num = numXBlocks*numYBlocks; - unsigned int uiH = 0, uiV = 0; - unsigned char* pEncodingBits = m_pMipmapImages[0].paucEncodingBits.get(); - for (unsigned int uiBlock = 0; uiBlock < num; uiBlock++) - { - if (uiH == iBlockPosX && uiV == iBlockPosY) - { - memcpy(pMipmapImages[0].paucEncodingBits.get(),pEncodingBits, uiEncodingBitsBytesPerBlock); - break; - } - pEncodingBits += uiEncodingBitsBytesPerBlock; - uiH += 4; - - if (uiH >= origWidth) - { - uiH = 0; - uiV += 4; - } - } - - delete [] m_pMipmapImages; - m_pMipmapImages = pMipmapImages; -} -// ---------------------------------------------------------------------------------------------------- -// -void File::Write() -{ - - FILE *pfile = fopen(m_pstrFilename, "wb"); - if (pfile == nullptr) - { - printf("Error: couldn't open Etc file (%s)\n", m_pstrFilename); - exit(1); - } - - m_pheader->Write(pfile); - - for(unsigned int mip = 0; mip < m_uiNumMipmaps; mip++) - { - if(m_fileformat == Format::KTX) - { - // Write u32 image size - uint32_t u32ImageSize = m_pMipmapImages[mip].uiEncodingBitsBytes; - uint32_t szBytesWritten = fwrite(&u32ImageSize, 1, sizeof(u32ImageSize), pfile); - assert(szBytesWritten == sizeof(u32ImageSize)); - } - - unsigned int iResult = (int)fwrite(m_pMipmapImages[mip].paucEncodingBits.get(), 1, m_pMipmapImages[mip].uiEncodingBitsBytes, pfile); - if (iResult != m_pMipmapImages[mip].uiEncodingBitsBytes) - { - printf("Error: couldn't write Etc file (%s)\n", m_pstrFilename); - exit(1); - } - } - - fclose(pfile); - -} - -// ---------------------------------------------------------------------------------------------------- -// - diff --git a/thirdparty/etc2comp/EtcFile.h b/thirdparty/etc2comp/EtcFile.h deleted file mode 100644 index 69bf3b2d3a..0000000000 --- a/thirdparty/etc2comp/EtcFile.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcColorFloatRGBA.h" -#include "EtcImage.h" -#include "Etc.h" - -namespace Etc -{ - class FileHeader; - class SourceImage; - - class File - { - public: - - enum class Format - { - INFER_FROM_FILE_EXTENSION, - PKM, - KTX, - }; - - File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat, - unsigned char *a_paucEncodingBits, unsigned int a_uiEncodingBitsBytes, - unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight, - unsigned int a_uiExtendedWidth, unsigned int a_uiExtendedHeight); - - File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat, - unsigned int a_uiNumMipmaps, RawImage *pMipmapImages, - unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight ); - - File(const char *a_pstrFilename, Format a_fileformat); - ~File(); - const char *GetFilename(void) { return m_pstrFilename; } - - void Read(const char *a_pstrFilename); - void Write(void); - - inline unsigned int GetSourceWidth(void) - { - return m_uiSourceWidth; - } - - inline unsigned int GetSourceHeight(void) - { - return m_uiSourceHeight; - } - - inline unsigned int GetExtendedWidth(unsigned int mipmapIndex = 0) - { - if (mipmapIndex < m_uiNumMipmaps) - { - return m_pMipmapImages[mipmapIndex].uiExtendedWidth; - } - else - { - return 0; - } - } - - inline unsigned int GetExtendedHeight(unsigned int mipmapIndex = 0) - { - if (mipmapIndex < m_uiNumMipmaps) - { - return m_pMipmapImages[mipmapIndex].uiExtendedHeight; - } - else - { - return 0; - } - } - - inline Image::Format GetImageFormat() - { - return m_imageformat; - } - - inline unsigned int GetEncodingBitsBytes(unsigned int mipmapIndex = 0) - { - if (mipmapIndex < m_uiNumMipmaps) - { - return m_pMipmapImages[mipmapIndex].uiEncodingBitsBytes; - } - else - { - return 0; - } - } - - inline unsigned char* GetEncodingBits(unsigned int mipmapIndex = 0) - { - if( mipmapIndex < m_uiNumMipmaps) - { - return m_pMipmapImages[mipmapIndex].paucEncodingBits.get(); - } - else - { - return nullptr; - } - } - - inline unsigned int GetNumMipmaps() - { - return m_uiNumMipmaps; - } - - void UseSingleBlock(int a_iPixelX = -1, int a_iPixelY = -1); - private: - - char *m_pstrFilename; // includes directory path and file extension - Format m_fileformat; - Image::Format m_imageformat; - FileHeader *m_pheader; - unsigned int m_uiNumMipmaps; - RawImage* m_pMipmapImages; - unsigned int m_uiSourceWidth; - unsigned int m_uiSourceHeight; - }; - -} diff --git a/thirdparty/etc2comp/EtcFileHeader.cpp b/thirdparty/etc2comp/EtcFileHeader.cpp deleted file mode 100644 index f02fcab011..0000000000 --- a/thirdparty/etc2comp/EtcFileHeader.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "EtcFileHeader.h" - -#include "EtcBlock4x4EncodingBits.h" - -#include <assert.h> - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // - FileHeader_Pkm::FileHeader_Pkm(File *a_pfile) - { - m_pfile = a_pfile; - - static const char s_acMagicNumberData[4] = { 'P', 'K', 'M', ' ' }; - static const char s_acVersionData[2] = { '1', '0' }; - - for (unsigned int ui = 0; ui < sizeof(s_acMagicNumberData); ui++) - { - m_data.m_acMagicNumber[ui] = s_acMagicNumberData[ui]; - } - - for (unsigned int ui = 0; ui < sizeof(s_acVersionData); ui++) - { - m_data.m_acVersion[ui] = s_acVersionData[ui]; - } - - m_data.m_ucDataType_msb = 0; // ETC1_RGB_NO_MIPMAPS - m_data.m_ucDataType_lsb = 0; - - m_data.m_ucOriginalWidth_msb = (unsigned char)(m_pfile->GetSourceWidth() >> 8); - m_data.m_ucOriginalWidth_lsb = m_pfile->GetSourceWidth() & 0xFF; - m_data.m_ucOriginalHeight_msb = (unsigned char)(m_pfile->GetSourceHeight() >> 8); - m_data.m_ucOriginalHeight_lsb = m_pfile->GetSourceHeight() & 0xFF; - - m_data.m_ucExtendedWidth_msb = (unsigned char)(m_pfile->GetExtendedWidth() >> 8); - m_data.m_ucExtendedWidth_lsb = m_pfile->GetExtendedWidth() & 0xFF; - m_data.m_ucExtendedHeight_msb = (unsigned char)(m_pfile->GetExtendedHeight() >> 8); - m_data.m_ucExtendedHeight_lsb = m_pfile->GetExtendedHeight() & 0xFF; - - } - - // ---------------------------------------------------------------------------------------------------- - // - void FileHeader_Pkm::Write(FILE *a_pfile) - { - - fwrite(&m_data, sizeof(Data), 1, a_pfile); - - } - - // ---------------------------------------------------------------------------------------------------- - // - FileHeader_Ktx::FileHeader_Ktx(File *a_pfile) - { - m_pfile = a_pfile; - - static const uint8_t s_au8Itentfier[12] = - { - 0xAB, 0x4B, 0x54, 0x58, // first four bytes of Byte[12] identifier - 0x20, 0x31, 0x31, 0xBB, // next four bytes of Byte[12] identifier - 0x0D, 0x0A, 0x1A, 0x0A // final four bytes of Byte[12] identifier - }; - - for (unsigned int ui = 0; ui < sizeof(s_au8Itentfier); ui++) - { - m_data.m_au8Identifier[ui] = s_au8Itentfier[ui]; - } - - m_data.m_u32Endianness = 0x04030201; - m_data.m_u32GlType = 0; - m_data.m_u32GlTypeSize = 1; - m_data.m_u32GlFormat = 0; - - switch (m_pfile->GetImageFormat()) - { - case Image::Format::RGB8: - case Image::Format::SRGB8: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RGB8; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RGB8; - break; - - case Image::Format::RGBA8: - case Image::Format::SRGBA8: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RGBA8; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RGBA8; - break; - - case Image::Format::RGB8A1: - case Image::Format::SRGB8A1: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RGB8A1; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RGB8A1; - break; - - case Image::Format::R11: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_R11; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_R11; - break; - - case Image::Format::SIGNED_R11: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_SIGNED_R11; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_R11; - break; - - case Image::Format::RG11: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RG11; - 
m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RG11; - break; - - case Image::Format::SIGNED_RG11: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_SIGNED_RG11; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RG11; - break; - - default: - m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC1_RGB8; - m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC1_RGB8; - break; - } - - m_data.m_u32PixelWidth = 0; - m_data.m_u32PixelHeight = 0; - m_data.m_u32PixelDepth = 0; - m_data.m_u32NumberOfArrayElements = 0; - m_data.m_u32NumberOfFaces = 0; - m_data.m_u32BytesOfKeyValueData = 0; - - m_pkeyvaluepair = nullptr; - - m_u32Images = 0; - m_u32KeyValuePairs = 0; - - m_data.m_u32PixelWidth = m_pfile->GetSourceWidth(); - m_data.m_u32PixelHeight = m_pfile->GetSourceHeight(); - m_data.m_u32PixelDepth = 0; - m_data.m_u32NumberOfArrayElements = 0; - m_data.m_u32NumberOfFaces = 1; - m_data.m_u32NumberOfMipmapLevels = m_pfile->GetNumMipmaps(); - - } - - // ---------------------------------------------------------------------------------------------------- - // - void FileHeader_Ktx::Write(FILE *a_pfile) - { - size_t szBytesWritten; - - // Write header - szBytesWritten = fwrite(&m_data, 1, sizeof(Data), a_pfile); - assert(szBytesWritten == sizeof(Data)); - - // Write KeyAndValuePairs - if (m_u32KeyValuePairs) - { - fwrite(m_pkeyvaluepair, m_pkeyvaluepair->u32KeyAndValueByteSize, 1, a_pfile); - } - } - - // ---------------------------------------------------------------------------------------------------- - // - FileHeader_Ktx::Data *FileHeader_Ktx::GetData() - { - return &m_data; - } - - // ---------------------------------------------------------------------------------------------------- - // -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcFileHeader.h b/thirdparty/etc2comp/EtcFileHeader.h deleted file mode 100644 index 55a9cb5d9d..0000000000 --- a/thirdparty/etc2comp/EtcFileHeader.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcFile.h" -#include <stdio.h> -#include <inttypes.h> - -namespace Etc -{ - - class Image; - - class FileHeader - { - public: - - virtual void Write(FILE *a_pfile) = 0; - File GetFile(); - virtual ~FileHeader(void) {} - protected: - - File *m_pfile; - }; - - // ---------------------------------------------------------------------------------------------------- - // - class FileHeader_Pkm : public FileHeader - { - public: - - FileHeader_Pkm(File *a_pfile); - - virtual void Write(FILE *a_pfile); - virtual ~FileHeader_Pkm(void) {} - private: - - typedef struct - { - char m_acMagicNumber[4]; - char m_acVersion[2]; - unsigned char m_ucDataType_msb; // e.g. 
ETC1_RGB_NO_MIPMAPS - unsigned char m_ucDataType_lsb; - unsigned char m_ucExtendedWidth_msb; // padded to 4x4 blocks - unsigned char m_ucExtendedWidth_lsb; - unsigned char m_ucExtendedHeight_msb; // padded to 4x4 blocks - unsigned char m_ucExtendedHeight_lsb; - unsigned char m_ucOriginalWidth_msb; - unsigned char m_ucOriginalWidth_lsb; - unsigned char m_ucOriginalHeight_msb; - unsigned char m_ucOriginalHeight_lsb; - } Data; - - Data m_data; - }; - - // ---------------------------------------------------------------------------------------------------- - // - class FileHeader_Ktx : public FileHeader - { - public: - - typedef struct - { - uint32_t u32KeyAndValueByteSize; - } KeyValuePair; - - typedef struct - { - uint8_t m_au8Identifier[12]; - uint32_t m_u32Endianness; - uint32_t m_u32GlType; - uint32_t m_u32GlTypeSize; - uint32_t m_u32GlFormat; - uint32_t m_u32GlInternalFormat; - uint32_t m_u32GlBaseInternalFormat; - uint32_t m_u32PixelWidth; - uint32_t m_u32PixelHeight; - uint32_t m_u32PixelDepth; - uint32_t m_u32NumberOfArrayElements; - uint32_t m_u32NumberOfFaces; - uint32_t m_u32NumberOfMipmapLevels; - uint32_t m_u32BytesOfKeyValueData; - } Data; - - enum class InternalFormat - { - ETC1_RGB8 = 0x8D64, - ETC1_ALPHA8 = ETC1_RGB8, - // - ETC2_R11 = 0x9270, - ETC2_SIGNED_R11 = 0x9271, - ETC2_RG11 = 0x9272, - ETC2_SIGNED_RG11 = 0x9273, - ETC2_RGB8 = 0x9274, - ETC2_SRGB8 = 0x9275, - ETC2_RGB8A1 = 0x9276, - ETC2_SRGB8_PUNCHTHROUGH_ALPHA1 = 0x9277, - ETC2_RGBA8 = 0x9278 - }; - - enum class BaseInternalFormat - { - ETC2_R11 = 0x1903, - ETC2_RG11 = 0x8227, - ETC1_RGB8 = 0x1907, - ETC1_ALPHA8 = ETC1_RGB8, - // - ETC2_RGB8 = 0x1907, - ETC2_RGB8A1 = 0x1908, - ETC2_RGBA8 = 0x1908, - }; - - FileHeader_Ktx(File *a_pfile); - - virtual void Write(FILE *a_pfile); - virtual ~FileHeader_Ktx(void) {} - - void AddKeyAndValue(KeyValuePair *a_pkeyvaluepair); - - Data* GetData(); - - private: - - Data m_data; - KeyValuePair *m_pkeyvaluepair; - - uint32_t m_u32Images; - uint32_t m_u32KeyValuePairs; - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcFilter.cpp b/thirdparty/etc2comp/EtcFilter.cpp deleted file mode 100644 index 1ec8acdf3f..0000000000 --- a/thirdparty/etc2comp/EtcFilter.cpp +++ /dev/null @@ -1,404 +0,0 @@ -#include <stdlib.h> -#include <math.h> -#include "EtcFilter.h" - - -namespace Etc -{ - -static const double PiConst = 3.14159265358979323846; - -inline double sinc(double x) -{ - if ( x == 0.0 ) - { - return 1.0; - } - - return sin(PiConst * x) / (PiConst * x); -} - -//inline float sincf( float x ) -//{ -// x *= F_PI; -// if (x < 0.01f && x > -0.01f) -// { -// return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f); -// } -// -// return sinf(x)/x; -//} -// -//double bessel0(double x) -//{ -// const double EPSILON_RATIO = 1E-16; -// double xh, sum, pow, ds; -// int k; -// -// xh = 0.5 * x; -// sum = 1.0; -// pow = 1.0; -// k = 0; -// ds = 1.0; -// while (ds > sum * EPSILON_RATIO) -// { -// ++k; -// pow = pow * (xh / k); -// ds = pow * pow; -// sum = sum + ds; -// } -// -// return sum; -//} - -//**-------------------------------------------------------------------------- -//** Name: kaiser(double alpha, double half_width, double x) -//** Returns: -//** Description: Alpha controls shape of filter. We are using 4. 
-//**-------------------------------------------------------------------------- -//inline double kaiser(double alpha, double half_width, double x) -//{ -// double ratio = (x / half_width); -// return bessel0(alpha * sqrt(1 - ratio * ratio)) / bessel0(alpha); -//} -// -//float Filter_Lanczos4Sinc(float x) -//{ -// if (x <= -4.0f || x >= 4.0f) // half-width of 4 -// { -// return 0.0; -// } -// -// return sinc(0.875f * x) * sinc(0.25f * x); -//} -// -//double Filter_Kaiser4( double t ) -//{ -// return kaiser( 4.0, 3.0, t); -//} -// -//double Filter_KaiserOptimal( double t ) -//{ -// return kaiser( 8.93, 3.0f, t); -//} - -double FilterLanczos3( double t ) -{ - if ( t <= -3.0 || t >= 3.0 ) - { - return 0.0; - } - - return sinc( t ) * sinc( t / 3.0 ); -} - -double FilterBox( double t ) -{ - return ( t > -0.5 && t < 0.5) ? 1.0 : 0.0; -} - -double FilterLinear( double t ) -{ - if (t < 0.0) t = -t; - - return (t < 1.0) ? (1.0 - t) : 0.0; -} - - -//**-------------------------------------------------------------------------- -//** Name: CalcContributions( int srcSize, -//** int destSize, -//** double filterSize, -//** bool wrap, -//** double (*FilterProc)(double), -//** FilterWeights contrib[] ) -//** Returns: void -//** Description: -//**-------------------------------------------------------------------------- -void CalcContributions( int srcSize, int destSize, double filterSize, bool wrap, double (*FilterProc)(double), FilterWeights contrib[] ) -{ - double scale; - double filterScale; - double center; - double totalWeight; - double weight; - int iRight; - int iLeft; - int iDest; - - scale = (double)destSize / srcSize; - if ( scale < 1.0 ) - { - filterSize = filterSize / scale; - filterScale = scale; - } - else - { - filterScale = 1.0; - } - - if ( filterSize > (double)MaxFilterSize ) - { - filterSize = (double)MaxFilterSize; - } - - for ( iDest = 0; iDest < destSize; ++iDest ) - { - center = (double)iDest / scale; - - iLeft = (int)ceil(center - filterSize); - iRight = (int)floor(center + filterSize); - - if ( !wrap ) - { - if ( iLeft < 0 ) - { - iLeft = 0; - } - - if ( iRight >= srcSize ) - { - iRight = srcSize - 1; - } - } - - int numWeights = iRight - iLeft + 1; - - contrib[iDest].first = iLeft; - contrib[iDest].numWeights = numWeights; - - totalWeight = 0; - double t = ((double)iLeft - center) * filterScale; - for (int i = 0; i < numWeights; i++) - { - weight = (*FilterProc)(t) * filterScale; - totalWeight += weight; - contrib[iDest].weight[i] = weight; - t += filterScale; - } - - //**-------------------------------------------------------- - //** Normalize weights by dividing by the sum of the weights - //**-------------------------------------------------------- - if ( totalWeight > 0.0 ) - { - for ( int i = 0; i < numWeights; i++) - { - contrib[iDest].weight[i] /= totalWeight; - } - } - } -} - -//**------------------------------------------------------------------------- -//** Name: Filter_TwoPass( RGBCOLOR *pSrcImage, -//** int srcWidth, int srcHeight, -//** RGBCOLOR *pDestImage, -//** int destWidth, int destHeight, -//** double (*FilterProc)(double) ) -//** Returns: 0 on failure and 1 on success -//** Description: Filters a 2d image with a two pass filter by averaging the -//** weighted contributions of the pixels within the filter region. The -//** contributions are determined by a weighting function parameter. 
-//**------------------------------------------------------------------------- -int FilterTwoPass( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, - RGBCOLOR *pDestImage, int destWidth, int destHeight, unsigned int wrapFlags, double (*FilterProc)(double) ) -{ - FilterWeights *contrib; - RGBCOLOR *pPixel; - RGBCOLOR *pSrcPixel; - RGBCOLOR *pTempImage; - int iRow; - int iCol; - int iSrcCol; - int iSrcRow; - int iWeight; - double dRed; - double dGreen; - double dBlue; - double dAlpha; - double filterSize = 3.0; - - int maxDim = (srcWidth>srcHeight)?srcWidth:srcHeight; - contrib = (FilterWeights*)malloc(maxDim * sizeof(FilterWeights)); - - //**------------------------------------------------------------------------ - //** Need to create a temporary image to stuff the horizontally scaled image - //**------------------------------------------------------------------------ - pTempImage = (RGBCOLOR *)malloc( destWidth * srcHeight * sizeof(RGBCOLOR) ); - if ( pTempImage == NULL ) - { - // -- GODOT start -- - free( contrib ); - // -- GODOT end -- - return 0; - } - - //**------------------------------------------------------- - //** Horizontally filter the image into the temporary image - //**------------------------------------------------------- - bool bWrapHorizontal = !!(wrapFlags&FILTER_WRAP_X); - CalcContributions( srcWidth, destWidth, filterSize, bWrapHorizontal, FilterProc, contrib ); - for ( iRow = 0; iRow < srcHeight; iRow++ ) - { - for ( iCol = 0; iCol < destWidth; iCol++ ) - { - dRed = 0; - dGreen = 0; - dBlue = 0; - dAlpha = 0; - - for ( iWeight = 0; iWeight < contrib[iCol].numWeights; iWeight++ ) - { - iSrcCol = iWeight + contrib[iCol].first; - if (bWrapHorizontal) - { - iSrcCol = (iSrcCol < 0) ? (srcWidth + iSrcCol) : (iSrcCol >= srcWidth) ? (iSrcCol - srcWidth) : iSrcCol; - } - pSrcPixel = pSrcImage + (iRow * srcWidth) + iSrcCol; - dRed += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[0]; - dGreen += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[1]; - dBlue += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[2]; - dAlpha += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[3]; - } - - pPixel = pTempImage + (iRow * destWidth) + iCol; - pPixel->rgba[0] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dRed))); - pPixel->rgba[1] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dGreen))); - pPixel->rgba[2] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dBlue))); - pPixel->rgba[3] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dAlpha))); - } - } - - //**------------------------------------------------------- - //** Vertically filter the image into the destination image - //**------------------------------------------------------- - bool bWrapVertical = !!(wrapFlags&FILTER_WRAP_Y); - CalcContributions(srcHeight, destHeight, filterSize, bWrapVertical, FilterProc, contrib); - for ( iCol = 0; iCol < destWidth; iCol++ ) - { - for ( iRow = 0; iRow < destHeight; iRow++ ) - { - dRed = 0; - dGreen = 0; - dBlue = 0; - dAlpha = 0; - - for ( iWeight = 0; iWeight < contrib[iRow].numWeights; iWeight++ ) - { - iSrcRow = iWeight + contrib[iRow].first; - if (bWrapVertical) - { - iSrcRow = (iSrcRow < 0) ? (srcHeight + iSrcRow) : (iSrcRow >= srcHeight) ? 
(iSrcRow - srcHeight) : iSrcRow; - } - pSrcPixel = pTempImage + (iSrcRow * destWidth) + iCol; - dRed += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[0]; - dGreen += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[1]; - dBlue += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[2]; - dAlpha += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[3]; - } - - pPixel = pDestImage + (iRow * destWidth) + iCol; - pPixel->rgba[0] = (unsigned char)(std::max( 0.0, std::min( 255.0, dRed))); - pPixel->rgba[1] = (unsigned char)(std::max( 0.0, std::min( 255.0, dGreen))); - pPixel->rgba[2] = (unsigned char)(std::max( 0.0, std::min( 255.0, dBlue))); - pPixel->rgba[3] = (unsigned char)(std::max( 0.0, std::min( 255.0, dAlpha))); - } - } - - free( pTempImage ); - free( contrib ); - - return 1; -} - -//**------------------------------------------------------------------------- -//** Name: FilterResample(RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, -//** RGBCOLOR *pDstImage, int dstWidth, int dstHeight) -//** Returns: 1 -//** Description: This function runs a 2d box filter over the srouce image -//** to produce the destination image. -//**------------------------------------------------------------------------- -void FilterResample( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, - RGBCOLOR *pDstImage, int dstWidth, int dstHeight ) -{ - int iRow; - int iCol; - int iSampleRow; - int iSampleCol; - int iFirstSampleRow; - int iFirstSampleCol; - int iLastSampleRow; - int iLastSampleCol; - int red; - int green; - int blue; - int alpha; - int samples; - float xScale; - float yScale; - - RGBCOLOR *pSrcPixel; - RGBCOLOR *pDstPixel; - - xScale = (float)srcWidth / dstWidth; - yScale = (float)srcHeight / dstHeight; - - for ( iRow = 0; iRow < dstHeight; iRow++ ) - { - for ( iCol = 0; iCol < dstWidth; iCol++ ) - { - iFirstSampleRow = (int)(iRow * yScale); - iLastSampleRow = (int)ceil(iFirstSampleRow + yScale - 1); - if ( iLastSampleRow >= srcHeight ) - { - iLastSampleRow = srcHeight - 1; - } - - iFirstSampleCol = (int)(iCol * xScale); - iLastSampleCol = (int)ceil(iFirstSampleCol + xScale - 1); - if ( iLastSampleCol >= srcWidth ) - { - iLastSampleCol = srcWidth - 1; - } - - samples = 0; - red = 0; - green = 0; - blue = 0; - alpha = 0; - for ( iSampleRow = iFirstSampleRow; iSampleRow <= iLastSampleRow; iSampleRow++ ) - { - for ( iSampleCol = iFirstSampleCol; iSampleCol <= iLastSampleCol; iSampleCol++ ) - { - pSrcPixel = pSrcImage + iSampleRow * srcWidth + iSampleCol; - red += pSrcPixel->rgba[0]; - green += pSrcPixel->rgba[1]; - blue += pSrcPixel->rgba[2]; - alpha += pSrcPixel->rgba[3]; - - samples++; - } - } - - pDstPixel = pDstImage + iRow * dstWidth + iCol; - if ( samples > 0 ) - { - pDstPixel->rgba[0] = static_cast<uint8_t>(red / samples); - pDstPixel->rgba[1] = static_cast<uint8_t>(green / samples); - pDstPixel->rgba[2] = static_cast<uint8_t>(blue / samples); - pDstPixel->rgba[3] = static_cast<uint8_t>(alpha / samples); - } - else - { - pDstPixel->rgba[0] = static_cast<uint8_t>(red); - pDstPixel->rgba[1] = static_cast<uint8_t>(green); - pDstPixel->rgba[2] = static_cast<uint8_t>(blue); - pDstPixel->rgba[3] = static_cast<uint8_t>(alpha); - } - } - } -} - - -}
\ No newline at end of file diff --git a/thirdparty/etc2comp/EtcFilter.h b/thirdparty/etc2comp/EtcFilter.h deleted file mode 100644 index fcf125c6df..0000000000 --- a/thirdparty/etc2comp/EtcFilter.h +++ /dev/null @@ -1,244 +0,0 @@ -#pragma once -#include <stdint.h> -#include <algorithm> - -namespace Etc -{ - -enum FilterEnums -{ - MaxFilterSize = 32 -}; - -enum WrapFlags -{ - FILTER_WRAP_NONE = 0, - FILTER_WRAP_X = 0x1, - FILTER_WRAP_Y = 0x2 -}; - -typedef struct tagFilterWeights -{ - int first; - int numWeights; - double weight[MaxFilterSize * 2 + 1]; -} FilterWeights; - -typedef struct tagRGBCOLOR -{ - union - { - uint32_t ulColor; - uint8_t rgba[4]; - }; -} RGBCOLOR; - - -double FilterBox( double t ); -double FilterLinear( double t ); -double FilterLanczos3( double t ); - -int FilterTwoPass( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, - RGBCOLOR *pDestImage, int destWidth, int destHeight, unsigned int wrapFlags, double (*FilterProc)(double) ); -void FilterResample( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, - RGBCOLOR *pDstImage, int dstWidth, int dstHeight ); - - -void CalcContributions(int srcSize, int destSize, double filterSize, bool wrap, double(*FilterProc)(double), FilterWeights contrib[]); - -template <typename T> -void FilterResample(T *pSrcImage, int srcWidth, int srcHeight, T *pDstImage, int dstWidth, int dstHeight) -{ - float xScale; - float yScale; - - T *pSrcPixel; - T *pDstPixel; - - xScale = (float)srcWidth / dstWidth; - yScale = (float)srcHeight / dstHeight; - - for (int iRow = 0; iRow < dstHeight; iRow++) - { - for (int iCol = 0; iCol < dstWidth; iCol++) - { - int samples; - int iFirstSampleRow; - int iFirstSampleCol; - int iLastSampleRow; - int iLastSampleCol; - float red; - float green; - float blue; - float alpha; - - iFirstSampleRow = (int)(iRow * yScale); - iLastSampleRow = (int)ceil(iFirstSampleRow + yScale - 1); - if (iLastSampleRow >= srcHeight) - { - iLastSampleRow = srcHeight - 1; - } - - iFirstSampleCol = (int)(iCol * xScale); - iLastSampleCol = (int)ceil(iFirstSampleCol + xScale - 1); - if (iLastSampleCol >= srcWidth) - { - iLastSampleCol = srcWidth - 1; - } - - samples = 0; - red = 0.f; - green = 0.f; - blue = 0.f; - alpha = 0.f; - for (int iSampleRow = iFirstSampleRow; iSampleRow <= iLastSampleRow; iSampleRow++) - { - for (int iSampleCol = iFirstSampleCol; iSampleCol <= iLastSampleCol; iSampleCol++) - { - pSrcPixel = pSrcImage + (iSampleRow * srcWidth + iSampleCol) * 4; - red += static_cast<float>(pSrcPixel[0]); - green += static_cast<float>(pSrcPixel[1]); - blue += static_cast<float>(pSrcPixel[2]); - alpha += static_cast<float>(pSrcPixel[3]); - - samples++; - } - } - - pDstPixel = pDstImage + (iRow * dstWidth + iCol) * 4; - if (samples > 0) - { - pDstPixel[0] = static_cast<T>(red / samples); - pDstPixel[1] = static_cast<T>(green / samples); - pDstPixel[2] = static_cast<T>(blue / samples); - pDstPixel[3] = static_cast<T>(alpha / samples); - } - else - { - pDstPixel[0] = static_cast<T>(red); - pDstPixel[1] = static_cast<T>(green); - pDstPixel[2] = static_cast<T>(blue); - pDstPixel[3] = static_cast<T>(alpha); - } - } - } - -} - -//**------------------------------------------------------------------------- -//** Name: Filter_TwoPass( RGBCOLOR *pSrcImage, -//** int srcWidth, int srcHeight, -//** RGBCOLOR *pDestImage, -//** int destWidth, int destHeight, -//** double (*FilterProc)(double) ) -//** Returns: 0 on failure and 1 on success -//** Description: Filters a 2d image with a two pass filter by averaging the -//** weighted contributions of the 
pixels within the filter region. The -//** contributions are determined by a weighting function parameter. -//**------------------------------------------------------------------------- -template <typename T> -int FilterTwoPass(T *pSrcImage, int srcWidth, int srcHeight, - T *pDestImage, int destWidth, int destHeight, unsigned int wrapFlags, double(*FilterProc)(double)) -{ - const int numComponents = 4; - FilterWeights *contrib; - T *pPixel; - T *pTempImage; - double dRed; - double dGreen; - double dBlue; - double dAlpha; - double filterSize = 3.0; - - int maxDim = (srcWidth>srcHeight) ? srcWidth : srcHeight; - contrib = new FilterWeights[maxDim]; - - //**------------------------------------------------------------------------ - //** Need to create a temporary image to stuff the horizontally scaled image - //**------------------------------------------------------------------------ - pTempImage = new T[destWidth * srcHeight * numComponents]; - if (pTempImage == NULL) - { - return 0; - } - - //**------------------------------------------------------- - //** Horizontally filter the image into the temporary image - //**------------------------------------------------------- - bool bWrapHorizontal = !!(wrapFlags&FILTER_WRAP_X); - CalcContributions(srcWidth, destWidth, filterSize, bWrapHorizontal, FilterProc, contrib); - for (int iRow = 0; iRow < srcHeight; iRow++) - { - for (int iCol = 0; iCol < destWidth; iCol++) - { - dRed = 0; - dGreen = 0; - dBlue = 0; - dAlpha = 0; - - for (int iWeight = 0; iWeight < contrib[iCol].numWeights; iWeight++) - { - int iSrcCol = iWeight + contrib[iCol].first; - if(bWrapHorizontal) - { - iSrcCol = (iSrcCol < 0)?(srcWidth+iSrcCol):(iSrcCol >= srcWidth)?(iSrcCol-srcWidth):iSrcCol; - } - T* pSrcPixel = pSrcImage + ((iRow * srcWidth) + iSrcCol)*numComponents; - dRed += contrib[iCol].weight[iWeight] * pSrcPixel[0]; - dGreen += contrib[iCol].weight[iWeight] * pSrcPixel[1]; - dBlue += contrib[iCol].weight[iWeight] * pSrcPixel[2]; - dAlpha += contrib[iCol].weight[iWeight] * pSrcPixel[3]; - } - - pPixel = pTempImage + ((iRow * destWidth) + iCol)*numComponents; - pPixel[0] = static_cast<T>(std::max(0.0, std::min(255.0, dRed))); - pPixel[1] = static_cast<T>(std::max(0.0, std::min(255.0, dGreen))); - pPixel[2] = static_cast<T>(std::max(0.0, std::min(255.0, dBlue))); - pPixel[3] = static_cast<T>(std::max(0.0, std::min(255.0, dAlpha))); - } - } - - //**------------------------------------------------------- - //** Vertically filter the image into the destination image - //**------------------------------------------------------- - bool bWrapVertical = !!(wrapFlags&FILTER_WRAP_Y); - CalcContributions(srcHeight, destHeight, filterSize, bWrapVertical, FilterProc, contrib); - for (int iCol = 0; iCol < destWidth; iCol++) - { - for (int iRow = 0; iRow < destHeight; iRow++) - { - dRed = 0; - dGreen = 0; - dBlue = 0; - dAlpha = 0; - - for (int iWeight = 0; iWeight < contrib[iRow].numWeights; iWeight++) - { - int iSrcRow = iWeight + contrib[iRow].first; - if (bWrapVertical) - { - iSrcRow = (iSrcRow < 0) ? (srcHeight + iSrcRow) : (iSrcRow >= srcHeight) ? 
(iSrcRow - srcHeight) : iSrcRow; - } - T* pSrcPixel = pTempImage + ((iSrcRow * destWidth) + iCol)*numComponents; - dRed += contrib[iRow].weight[iWeight] * pSrcPixel[0]; - dGreen += contrib[iRow].weight[iWeight] * pSrcPixel[1]; - dBlue += contrib[iRow].weight[iWeight] * pSrcPixel[2]; - dAlpha += contrib[iRow].weight[iWeight] * pSrcPixel[3]; - } - - pPixel = pDestImage + ((iRow * destWidth) + iCol)*numComponents; - pPixel[0] = static_cast<T>(std::max(0.0, std::min(255.0, dRed))); - pPixel[1] = static_cast<T>(std::max(0.0, std::min(255.0, dGreen))); - pPixel[2] = static_cast<T>(std::max(0.0, std::min(255.0, dBlue))); - pPixel[3] = static_cast<T>(std::max(0.0, std::min(255.0, dAlpha))); - } - } - - delete[] pTempImage; - delete[] contrib; - - return 1; -} - - -}
\ No newline at end of file diff --git a/thirdparty/etc2comp/EtcImage.cpp b/thirdparty/etc2comp/EtcImage.cpp deleted file mode 100644 index 7a1058844d..0000000000 --- a/thirdparty/etc2comp/EtcImage.cpp +++ /dev/null @@ -1,685 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcImage.cpp - -Image is an array of 4x4 blocks that represent the encoding of the source image - -*/ - -#include "EtcConfig.h" - -#include <stdlib.h> - -#include "EtcImage.h" - -#include "Etc.h" -#include "EtcBlock4x4.h" -#include "EtcBlock4x4EncodingBits.h" -#include "EtcSortedBlockList.h" - -#if ETC_WINDOWS -#include <windows.h> -#endif -#include <ctime> -#include <chrono> -#include <future> -#include <stdio.h> -#include <string.h> -#include <assert.h> - -// fix conflict with Block4x4::AlphaMix -#ifdef OPAQUE -#undef OPAQUE -#endif -#ifdef TRANSPARENT -#undef TRANSPARENT -#endif - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // - Image::Image(void) - { - m_encodingStatus = EncodingStatus::SUCCESS; - m_warningsToCapture = EncodingStatus::SUCCESS; - m_pafrgbaSource = nullptr; - - m_pablock = nullptr; - - m_encodingbitsformat = Block4x4EncodingBits::Format::UNKNOWN; - m_uiEncodingBitsBytes = 0; - m_paucEncodingBits = nullptr; - - m_format = Format::UNKNOWN; - m_iNumOpaquePixels = 0; - m_iNumTranslucentPixels = 0; - m_iNumTransparentPixels = 0; - } - - // ---------------------------------------------------------------------------------------------------- - // constructor using source image - // used to set state before Encode() is called - // - Image::Image(float *a_pafSourceRGBA, unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - ErrorMetric a_errormetric) - { - m_encodingStatus = EncodingStatus::SUCCESS; - m_warningsToCapture = EncodingStatus::SUCCESS; - m_pafrgbaSource = (ColorFloatRGBA *) a_pafSourceRGBA; - m_uiSourceWidth = a_uiSourceWidth; - m_uiSourceHeight = a_uiSourceHeight; - - m_uiExtendedWidth = CalcExtendedDimension((unsigned short)m_uiSourceWidth); - m_uiExtendedHeight = CalcExtendedDimension((unsigned short)m_uiSourceHeight); - - m_uiBlockColumns = m_uiExtendedWidth >> 2; - m_uiBlockRows = m_uiExtendedHeight >> 2; - - m_pablock = new Block4x4[GetNumberOfBlocks()]; - assert(m_pablock); - - m_format = Format::UNKNOWN; - - m_encodingbitsformat = Block4x4EncodingBits::Format::UNKNOWN; - m_uiEncodingBitsBytes = 0; - m_paucEncodingBits = nullptr; - - m_errormetric = a_errormetric; - m_fEffort = 0.0f; - - m_iEncodeTime_ms = -1; - - m_iNumOpaquePixels = 0; - m_iNumTranslucentPixels = 0; - m_iNumTransparentPixels = 0; - m_bVerboseOutput = false; - - } - - // ---------------------------------------------------------------------------------------------------- - // constructor using encoding bits - // recreates encoding state using a previously encoded image - // - Image::Image(Format a_format, - unsigned int a_uiSourceWidth, unsigned int 
a_uiSourceHeight, - unsigned char *a_paucEncidingBits, unsigned int a_uiEncodingBitsBytes, - Image *a_pimageSource, ErrorMetric a_errormetric) - { - m_encodingStatus = EncodingStatus::SUCCESS; - m_pafrgbaSource = nullptr; - m_uiSourceWidth = a_uiSourceWidth; - m_uiSourceHeight = a_uiSourceHeight; - - m_uiExtendedWidth = CalcExtendedDimension((unsigned short)m_uiSourceWidth); - m_uiExtendedHeight = CalcExtendedDimension((unsigned short)m_uiSourceHeight); - - m_uiBlockColumns = m_uiExtendedWidth >> 2; - m_uiBlockRows = m_uiExtendedHeight >> 2; - - unsigned int uiBlocks = GetNumberOfBlocks(); - - m_pablock = new Block4x4[uiBlocks]; - assert(m_pablock); - - m_format = a_format; - - m_iNumOpaquePixels = 0; - m_iNumTranslucentPixels = 0; - m_iNumTransparentPixels = 0; - - m_encodingbitsformat = DetermineEncodingBitsFormat(m_format); - if (m_encodingbitsformat == Block4x4EncodingBits::Format::UNKNOWN) - { - AddToEncodingStatus(ERROR_UNKNOWN_FORMAT); - return; - } - m_uiEncodingBitsBytes = a_uiEncodingBitsBytes; - m_paucEncodingBits = a_paucEncidingBits; - - m_errormetric = a_errormetric; - m_fEffort = 0.0f; - m_bVerboseOutput = false; - m_iEncodeTime_ms = -1; - - unsigned char *paucEncodingBits = m_paucEncodingBits; - unsigned int uiEncodingBitsBytesPerBlock = Block4x4EncodingBits::GetBytesPerBlock(m_encodingbitsformat); - - unsigned int uiH = 0; - unsigned int uiV = 0; - for (unsigned int uiBlock = 0; uiBlock < uiBlocks; uiBlock++) - { - m_pablock[uiBlock].InitFromEtcEncodingBits(a_format, uiH, uiV, paucEncodingBits, - a_pimageSource, a_errormetric); - paucEncodingBits += uiEncodingBitsBytesPerBlock; - uiH += 4; - if (uiH >= m_uiSourceWidth) - { - uiH = 0; - uiV += 4; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // - Image::~Image(void) - { - if (m_pablock != nullptr) - { - delete[] m_pablock; - m_pablock = nullptr; - } - - /*if (m_paucEncodingBits != nullptr) - { - delete[] m_paucEncodingBits; - m_paucEncodingBits = nullptr; - }*/ - } - - // ---------------------------------------------------------------------------------------------------- - // encode an image - // create a set of encoding bits that conforms to a_format - // find best fit using a_errormetric - // explore a range of possible encodings based on a_fEffort (range = [0:100]) - // speed up process using a_uiJobs as the number of process threads (a_uiJobs must not excede a_uiMaxJobs) - // - Image::EncodingStatus Image::Encode(Format a_format, ErrorMetric a_errormetric, float a_fEffort, unsigned int a_uiJobs, unsigned int a_uiMaxJobs) - { - - auto start = std::chrono::steady_clock::now(); - - m_encodingStatus = EncodingStatus::SUCCESS; - - m_format = a_format; - m_errormetric = a_errormetric; - m_fEffort = a_fEffort; - - if (m_errormetric < 0 || m_errormetric > ERROR_METRICS) - { - AddToEncodingStatus(ERROR_UNKNOWN_ERROR_METRIC); - return m_encodingStatus; - } - - if (m_fEffort < ETCCOMP_MIN_EFFORT_LEVEL) - { - AddToEncodingStatus(WARNING_EFFORT_OUT_OF_RANGE); - m_fEffort = ETCCOMP_MIN_EFFORT_LEVEL; - } - else if (m_fEffort > ETCCOMP_MAX_EFFORT_LEVEL) - { - AddToEncodingStatus(WARNING_EFFORT_OUT_OF_RANGE); - m_fEffort = ETCCOMP_MAX_EFFORT_LEVEL; - } - if (a_uiJobs < 1) - { - a_uiJobs = 1; - AddToEncodingStatus(WARNING_JOBS_OUT_OF_RANGE); - } - else if (a_uiJobs > a_uiMaxJobs) - { - a_uiJobs = a_uiMaxJobs; - AddToEncodingStatus(WARNING_JOBS_OUT_OF_RANGE); - } - - m_encodingbitsformat = DetermineEncodingBitsFormat(m_format); - - if (m_encodingbitsformat == 
Block4x4EncodingBits::Format::UNKNOWN) - { - AddToEncodingStatus(ERROR_UNKNOWN_FORMAT); - return m_encodingStatus; - } - - assert(m_paucEncodingBits == nullptr); - m_uiEncodingBitsBytes = GetNumberOfBlocks() * Block4x4EncodingBits::GetBytesPerBlock(m_encodingbitsformat); - m_paucEncodingBits = new unsigned char[m_uiEncodingBitsBytes]; - - InitBlocksAndBlockSorter(); - - - std::future<void> *handle = new std::future<void>[a_uiMaxJobs]; - - unsigned int uiNumThreadsNeeded = 0; - unsigned int uiUnfinishedBlocks = GetNumberOfBlocks(); - - uiNumThreadsNeeded = (uiUnfinishedBlocks < a_uiJobs) ? uiUnfinishedBlocks : a_uiJobs; - - for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++) - { - handle[i] = async(std::launch::async, &Image::RunFirstPass, this, i, uiNumThreadsNeeded); - } - - RunFirstPass(uiNumThreadsNeeded - 1, uiNumThreadsNeeded); - - for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++) - { - handle[i].get(); - } - - // perform effort-based encoding - if (m_fEffort > ETCCOMP_MIN_EFFORT_LEVEL) - { - unsigned int uiFinishedBlocks = 0; - unsigned int uiTotalEffortBlocks = static_cast<unsigned int>(roundf(0.01f * m_fEffort * GetNumberOfBlocks())); - - if (m_bVerboseOutput) - { - printf("effortblocks = %d\n", uiTotalEffortBlocks); - } - unsigned int uiPass = 0; - while (1) - { - if (m_bVerboseOutput) - { - uiPass++; - printf("pass %u\n", uiPass); - } - m_psortedblocklist->Sort(); - uiUnfinishedBlocks = m_psortedblocklist->GetNumberOfSortedBlocks(); - uiFinishedBlocks = GetNumberOfBlocks() - uiUnfinishedBlocks; - if (m_bVerboseOutput) - { - printf(" %u unfinished blocks\n", uiUnfinishedBlocks); - // m_psortedblocklist->Print(); - } - - - - //stop enocding when we did enough to satify the effort percentage - if (uiFinishedBlocks >= uiTotalEffortBlocks) - { - if (m_bVerboseOutput) - { - printf("Finished %d Blocks out of %d\n", uiFinishedBlocks, uiTotalEffortBlocks); - } - break; - } - - unsigned int uiIteratedBlocks = 0; - unsigned int blocksToIterateThisPass = (uiTotalEffortBlocks - uiFinishedBlocks); - uiNumThreadsNeeded = (uiUnfinishedBlocks < a_uiJobs) ? 
uiUnfinishedBlocks : a_uiJobs; - - if (uiNumThreadsNeeded <= 1) - { - //since we already how many blocks each thread will process - //cap the thread limit to do the proper amount of work, and not more - uiIteratedBlocks = IterateThroughWorstBlocks(blocksToIterateThisPass, 0, 1); - } - else - { - //we have a lot of work to do, so lets multi thread it - std::future<unsigned int> *handleToBlockEncoders = new std::future<unsigned int>[uiNumThreadsNeeded-1]; - - for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++) - { - handleToBlockEncoders[i] = async(std::launch::async, &Image::IterateThroughWorstBlocks, this, blocksToIterateThisPass, i, uiNumThreadsNeeded); - } - uiIteratedBlocks = IterateThroughWorstBlocks(blocksToIterateThisPass, uiNumThreadsNeeded - 1, uiNumThreadsNeeded); - - for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++) - { - uiIteratedBlocks += handleToBlockEncoders[i].get(); - } - - delete[] handleToBlockEncoders; - } - - if (m_bVerboseOutput) - { - printf(" %u iterated blocks\n", uiIteratedBlocks); - } - } - } - - // generate Etc2-compatible bit-format 4x4 blocks - for (int i = 0; i < (int)a_uiJobs - 1; i++) - { - handle[i] = async(std::launch::async, &Image::SetEncodingBits, this, i, a_uiJobs); - } - SetEncodingBits(a_uiJobs - 1, a_uiJobs); - - for (int i = 0; i < (int)a_uiJobs - 1; i++) - { - handle[i].get(); - } - - auto end = std::chrono::steady_clock::now(); - std::chrono::milliseconds elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start); - m_iEncodeTime_ms = (int)elapsed.count(); - - delete[] handle; - delete m_psortedblocklist; - return m_encodingStatus; - } - - // ---------------------------------------------------------------------------------------------------- - // iterate the encoding thru the blocks with the worst error - // stop when a_uiMaxBlocks blocks have been iterated - // split the blocks between the process threads using a_uiMultithreadingOffset and a_uiMultithreadingStride - // - unsigned int Image::IterateThroughWorstBlocks(unsigned int a_uiMaxBlocks, - unsigned int a_uiMultithreadingOffset, - unsigned int a_uiMultithreadingStride) - { - assert(a_uiMultithreadingStride > 0); - unsigned int uiIteratedBlocks = a_uiMultithreadingOffset; - - SortedBlockList::Link *plink = m_psortedblocklist->GetLinkToFirstBlock(); - for (plink = plink->Advance(a_uiMultithreadingOffset); - plink != nullptr; - plink = plink->Advance(a_uiMultithreadingStride) ) - { - if (uiIteratedBlocks >= a_uiMaxBlocks) - { - break; - } - - plink->GetBlock()->PerformEncodingIteration(m_fEffort); - - uiIteratedBlocks += a_uiMultithreadingStride; - } - - return uiIteratedBlocks; - } - - // ---------------------------------------------------------------------------------------------------- - // determine which warnings to check for during Encode() based on encoding format - // - void Image::FindEncodingWarningTypesForCurFormat() - { - TrackEncodingWarning(WARNING_ALL_TRANSPARENT_PIXELS); - TrackEncodingWarning(WARNING_SOME_RGBA_NOT_0_TO_1); - switch (m_format) - { - case Image::Format::ETC1: - case Image::Format::RGB8: - case Image::Format::SRGB8: - TrackEncodingWarning(WARNING_SOME_NON_OPAQUE_PIXELS); - TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS); - break; - - case Image::Format::RGB8A1: - case Image::Format::SRGB8A1: - TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS); - TrackEncodingWarning(WARNING_ALL_OPAQUE_PIXELS); - break; - case Image::Format::RGBA8: - case Image::Format::SRGBA8: - TrackEncodingWarning(WARNING_ALL_OPAQUE_PIXELS); - break; - - case 
Image::Format::R11: - case Image::Format::SIGNED_R11: - TrackEncodingWarning(WARNING_SOME_NON_OPAQUE_PIXELS); - TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS); - TrackEncodingWarning(WARNING_SOME_GREEN_VALUES_ARE_NOT_ZERO); - TrackEncodingWarning(WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO); - break; - - case Image::Format::RG11: - case Image::Format::SIGNED_RG11: - TrackEncodingWarning(WARNING_SOME_NON_OPAQUE_PIXELS); - TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS); - TrackEncodingWarning(WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO); - break; - case Image::Format::FORMATS: - case Image::Format::UNKNOWN: - default: - assert(0); - break; - } - } - - // ---------------------------------------------------------------------------------------------------- - // examine source pixels to check for warnings - // - void Image::FindAndSetEncodingWarnings() - { - int numPixels = (m_uiBlockRows * 4) * (m_uiBlockColumns * 4); - if (m_iNumOpaquePixels == numPixels) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_ALL_OPAQUE_PIXELS); - } - if (m_iNumOpaquePixels < numPixels) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_NON_OPAQUE_PIXELS); - } - if (m_iNumTranslucentPixels > 0) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_TRANSLUCENT_PIXELS); - } - if (m_iNumTransparentPixels == numPixels) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_ALL_TRANSPARENT_PIXELS); - } - if (m_numColorValues.fB > 0.0f) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO); - } - if (m_numColorValues.fG > 0.0f) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_GREEN_VALUES_ARE_NOT_ZERO); - } - - if (m_numOutOfRangeValues.fR > 0.0f || m_numOutOfRangeValues.fG > 0.0f) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_RGBA_NOT_0_TO_1); - } - if (m_numOutOfRangeValues.fB > 0.0f || m_numOutOfRangeValues.fA > 0.0f) - { - AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_RGBA_NOT_0_TO_1); - } - } - - // ---------------------------------------------------------------------------------------------------- - // return a string name for a given image format - // - const char * Image::EncodingFormatToString(Image::Format a_format) - { - switch (a_format) - { - case Image::Format::ETC1: - return "ETC1"; - case Image::Format::RGB8: - return "RGB8"; - case Image::Format::SRGB8: - return "SRGB8"; - - case Image::Format::RGB8A1: - return "RGB8A1"; - case Image::Format::SRGB8A1: - return "SRGB8A1"; - case Image::Format::RGBA8: - return "RGBA8"; - case Image::Format::SRGBA8: - return "SRGBA8"; - - case Image::Format::R11: - return "R11"; - case Image::Format::SIGNED_R11: - return "SIGNED_R11"; - - case Image::Format::RG11: - return "RG11"; - case Image::Format::SIGNED_RG11: - return "SIGNED_RG11"; - case Image::Format::FORMATS: - case Image::Format::UNKNOWN: - default: - return "UNKNOWN"; - } - } - - // ---------------------------------------------------------------------------------------------------- - // return a string name for the image's format - // - const char * Image::EncodingFormatToString(void) - { - return EncodingFormatToString(m_format); - } - - // ---------------------------------------------------------------------------------------------------- - // init image blocks prior to encoding - // init block sorter for subsequent sortings - // check for encoding warnings - // - void Image::InitBlocksAndBlockSorter(void) - { - - 
FindEncodingWarningTypesForCurFormat(); - - // init each block - Block4x4 *pblock = m_pablock; - unsigned char *paucEncodingBits = m_paucEncodingBits; - for (unsigned int uiBlockRow = 0; uiBlockRow < m_uiBlockRows; uiBlockRow++) - { - unsigned int uiBlockV = uiBlockRow * 4; - - for (unsigned int uiBlockColumn = 0; uiBlockColumn < m_uiBlockColumns; uiBlockColumn++) - { - unsigned int uiBlockH = uiBlockColumn * 4; - - pblock->InitFromSource(this, uiBlockH, uiBlockV, paucEncodingBits, m_errormetric); - - paucEncodingBits += Block4x4EncodingBits::GetBytesPerBlock(m_encodingbitsformat); - - pblock++; - } - } - - FindAndSetEncodingWarnings(); - - // init block sorter - { - m_psortedblocklist = new SortedBlockList(GetNumberOfBlocks(), 100); - - for (unsigned int uiBlock = 0; uiBlock < GetNumberOfBlocks(); uiBlock++) - { - pblock = &m_pablock[uiBlock]; - m_psortedblocklist->AddBlock(pblock); - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // run the first pass of the encoder - // the encoder generally finds a reasonable, fast encoding - // this is run on all blocks regardless of effort to ensure that all blocks have a valid encoding - // - void Image::RunFirstPass(unsigned int a_uiMultithreadingOffset, unsigned int a_uiMultithreadingStride) - { - assert(a_uiMultithreadingStride > 0); - - for (unsigned int uiBlock = a_uiMultithreadingOffset; - uiBlock < GetNumberOfBlocks(); - uiBlock += a_uiMultithreadingStride) - { - Block4x4 *pblock = &m_pablock[uiBlock]; - pblock->PerformEncodingIteration(m_fEffort); - } - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits (for the output file) based on the best encoding for each block - // - void Image::SetEncodingBits(unsigned int a_uiMultithreadingOffset, - unsigned int a_uiMultithreadingStride) - { - assert(a_uiMultithreadingStride > 0); - - for (unsigned int uiBlock = a_uiMultithreadingOffset; - uiBlock < GetNumberOfBlocks(); - uiBlock += a_uiMultithreadingStride) - { - Block4x4 *pblock = &m_pablock[uiBlock]; - pblock->SetEncodingBitsFromEncoding(); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // return the image error - // image error is the sum of all block errors - // - float Image::GetError(void) - { - float fError = 0.0f; - - for (unsigned int uiBlock = 0; uiBlock < GetNumberOfBlocks(); uiBlock++) - { - Block4x4 *pblock = &m_pablock[uiBlock]; - fError += pblock->GetError(); - } - - return fError; - } - - // ---------------------------------------------------------------------------------------------------- - // determine the encoding bits format based on the encoding format - // the encoding bits format is a family of bit encodings that are shared across various encoding formats - // - Block4x4EncodingBits::Format Image::DetermineEncodingBitsFormat(Format a_format) - { - Block4x4EncodingBits::Format encodingbitsformat; - - // determine encoding bits format from image format - switch (a_format) - { - case Format::ETC1: - case Format::RGB8: - case Format::SRGB8: - encodingbitsformat = Block4x4EncodingBits::Format::RGB8; - break; - - case Format::RGBA8: - case Format::SRGBA8: - encodingbitsformat = Block4x4EncodingBits::Format::RGBA8; - break; - - case Format::R11: - case Format::SIGNED_R11: - encodingbitsformat = Block4x4EncodingBits::Format::R11; - break; - - case Format::RG11: - case Format::SIGNED_RG11: - 
encodingbitsformat = Block4x4EncodingBits::Format::RG11; - break; - - case Format::RGB8A1: - case Format::SRGB8A1: - encodingbitsformat = Block4x4EncodingBits::Format::RGB8A1; - break; - - default: - encodingbitsformat = Block4x4EncodingBits::Format::UNKNOWN; - break; - } - - return encodingbitsformat; - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcImage.h b/thirdparty/etc2comp/EtcImage.h deleted file mode 100644 index bd807ac32e..0000000000 --- a/thirdparty/etc2comp/EtcImage.h +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -//#include "Etc.h" -#include "EtcColorFloatRGBA.h" -#include "EtcBlock4x4EncodingBits.h" -#include "EtcErrorMetric.h" - - -namespace Etc -{ - class Block4x4; - class EncoderSpec; - class SortedBlockList; - - class Image - { - public: - - //the differnt warning and errors that can come up during encoding - enum EncodingStatus - { - SUCCESS = 0, - // - WARNING_THRESHOLD = 1 << 0, - // - WARNING_EFFORT_OUT_OF_RANGE = 1 << 1, - WARNING_JOBS_OUT_OF_RANGE = 1 << 2, - WARNING_SOME_NON_OPAQUE_PIXELS = 1 << 3,//just for opaque formats, etc1, rgb8, r11, rg11 - WARNING_ALL_OPAQUE_PIXELS = 1 << 4, - WARNING_ALL_TRANSPARENT_PIXELS = 1 << 5, - WARNING_SOME_TRANSLUCENT_PIXELS = 1 << 6,//just for rgb8A1 - WARNING_SOME_RGBA_NOT_0_TO_1 = 1 << 7, - WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO = 1 << 8, - WARNING_SOME_GREEN_VALUES_ARE_NOT_ZERO = 1 << 9, - // - ERROR_THRESHOLD = 1 << 16, - // - ERROR_UNKNOWN_FORMAT = 1 << 17, - ERROR_UNKNOWN_ERROR_METRIC = 1 << 18, - ERROR_ZERO_WIDTH_OR_HEIGHT = 1 << 19, - // - }; - - enum class Format - { - UNKNOWN, - // - ETC1, - // - // ETC2 formats - RGB8, - SRGB8, - RGBA8, - SRGBA8, - R11, - SIGNED_R11, - RG11, - SIGNED_RG11, - RGB8A1, - SRGB8A1, - // - FORMATS, - // - DEFAULT = SRGB8 - }; - - // constructor using source image - Image(float *a_pafSourceRGBA, unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - ErrorMetric a_errormetric); - - // constructor using encoding bits - Image(Format a_format, - unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight, - unsigned char *a_paucEncidingBits, unsigned int a_uiEncodingBitsBytes, - Image *a_pimageSource, - ErrorMetric a_errormetric); - - ~Image(void); - - EncodingStatus Encode(Format a_format, ErrorMetric a_errormetric, float a_fEffort, - unsigned int a_uiJobs, unsigned int a_uiMaxJobs); - - inline void AddToEncodingStatus(EncodingStatus a_encStatus) - { - m_encodingStatus = (EncodingStatus)((unsigned int)m_encodingStatus | (unsigned int)a_encStatus); - } - - inline unsigned int GetSourceWidth(void) - { - return m_uiSourceWidth; - } - - inline unsigned int GetSourceHeight(void) - { - return m_uiSourceHeight; - } - - inline unsigned int GetExtendedWidth(void) - { - return m_uiExtendedWidth; - } - - inline unsigned int GetExtendedHeight(void) - { - return 
m_uiExtendedHeight; - } - - inline unsigned int GetNumberOfBlocks() - { - return m_uiBlockColumns * m_uiBlockRows; - } - - inline Block4x4 * GetBlocks() - { - return m_pablock; - } - - inline unsigned char * GetEncodingBits(void) - { - return m_paucEncodingBits; - } - - inline unsigned int GetEncodingBitsBytes(void) - { - return m_uiEncodingBitsBytes; - } - - inline int GetEncodingTimeMs(void) - { - return m_iEncodeTime_ms; - } - - float GetError(void); - - inline ColorFloatRGBA * GetSourcePixel(unsigned int a_uiH, unsigned int a_uiV) - { - if (a_uiH >= m_uiSourceWidth || a_uiV >= m_uiSourceHeight) - { - return nullptr; - } - - return &m_pafrgbaSource[a_uiV*m_uiSourceWidth + a_uiH]; - } - - inline Format GetFormat(void) - { - return m_format; - } - - static Block4x4EncodingBits::Format DetermineEncodingBitsFormat(Format a_format); - - inline static unsigned short CalcExtendedDimension(unsigned short a_ushOriginalDimension) - { - return (unsigned short)((a_ushOriginalDimension + 3) & ~3); - } - - inline ErrorMetric GetErrorMetric(void) - { - return m_errormetric; - } - - static const char * EncodingFormatToString(Image::Format a_format); - const char * EncodingFormatToString(void); - //used to get basic information about the image data - int m_iNumOpaquePixels; - int m_iNumTranslucentPixels; - int m_iNumTransparentPixels; - - ColorFloatRGBA m_numColorValues; - ColorFloatRGBA m_numOutOfRangeValues; - - bool m_bVerboseOutput; - private: - //add a warning or error to check for while encoding - inline void TrackEncodingWarning(EncodingStatus a_encStatus) - { - m_warningsToCapture = (EncodingStatus)((unsigned int)m_warningsToCapture | (unsigned int)a_encStatus); - } - - //report the warning if it is something we care about for this encoding - inline void AddToEncodingStatusIfSignfigant(EncodingStatus a_encStatus) - { - if ((EncodingStatus)((unsigned int)m_warningsToCapture & (unsigned int)a_encStatus) == a_encStatus) - { - AddToEncodingStatus(a_encStatus); - } - } - - Image(void); - void FindEncodingWarningTypesForCurFormat(); - void FindAndSetEncodingWarnings(); - - void InitBlocksAndBlockSorter(void); - - void RunFirstPass(unsigned int a_uiMultithreadingOffset, - unsigned int a_uiMultithreadingStride); - - void SetEncodingBits(unsigned int a_uiMultithreadingOffset, - unsigned int a_uiMultithreadingStride); - - unsigned int IterateThroughWorstBlocks(unsigned int a_uiMaxBlocks, - unsigned int a_uiMultithreadingOffset, - unsigned int a_uiMultithreadingStride); - - // inputs - ColorFloatRGBA *m_pafrgbaSource; - unsigned int m_uiSourceWidth; - unsigned int m_uiSourceHeight; - unsigned int m_uiExtendedWidth; - unsigned int m_uiExtendedHeight; - unsigned int m_uiBlockColumns; - unsigned int m_uiBlockRows; - // intermediate data - Block4x4 *m_pablock; - // encoding - Format m_format; - Block4x4EncodingBits::Format m_encodingbitsformat; - unsigned int m_uiEncodingBitsBytes; // for entire image - unsigned char *m_paucEncodingBits; - ErrorMetric m_errormetric; - float m_fEffort; - // stats - int m_iEncodeTime_ms; - - SortedBlockList *m_psortedblocklist; - //this will hold any warning or errors that happen during encoding - EncodingStatus m_encodingStatus; - //these will be the warnings we are tracking - EncodingStatus m_warningsToCapture; - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcIndividualTrys.cpp b/thirdparty/etc2comp/EtcIndividualTrys.cpp deleted file mode 100644 index 56ff4c65ec..0000000000 --- a/thirdparty/etc2comp/EtcIndividualTrys.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * 
Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcIndividualTrys.cpp - -Gathers the results of the various encoding trys for both halves of a 4x4 block for Individual mode - -*/ - -#include "EtcConfig.h" -#include "EtcIndividualTrys.h" - -#include <assert.h> - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // construct a list of trys (encoding attempts) - // - // a_frgbaColor1 is the basecolor for the first half - // a_frgbaColor2 is the basecolor for the second half - // a_pauiPixelMapping1 is the pixel order for the first half - // a_pauiPixelMapping2 is the pixel order for the second half - // a_uiRadius is the amount to vary the base colors - // - IndividualTrys::IndividualTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius) - { - assert(a_uiRadius <= MAX_RADIUS); - - ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR4G4B4(); - ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR4G4B4(); - - // quantize base colors - // ensure that trys with a_uiRadius don't overflow - int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(15.0f), a_uiRadius); - int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(15.0f), a_uiRadius); - int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(15.0f), a_uiRadius); - int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(15.0f), a_uiRadius); - int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(15.0f), a_uiRadius); - int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(15.0f), a_uiRadius); - - m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius); - m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius); - - } - - // ---------------------------------------------------------------------------------------------------- - // - void IndividualTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius) - { - - m_iRed = a_iRed; - m_iGreen = a_iGreen; - m_iBlue = a_iBlue; - - m_pauiPixelMapping = a_pauiPixelMapping; - m_uiRadius = a_uiRadius; - - m_uiTrys = 0; - - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcIndividualTrys.h b/thirdparty/etc2comp/EtcIndividualTrys.h deleted file mode 100644 index 5fb12fbcf4..0000000000 --- a/thirdparty/etc2comp/EtcIndividualTrys.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcColorFloatRGBA.h" - -namespace Etc -{ - - class IndividualTrys - { - public: - - static const unsigned int MAX_RADIUS = 1; - - IndividualTrys(ColorFloatRGBA a_frgbaColor1, - ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius); - - inline static int MoveAwayFromEdge(int a_i, int a_iDistance) - { - if (a_i < (0+ a_iDistance)) - { - return (0 + a_iDistance); - } - else if (a_i > (15- a_iDistance)) - { - return (15 - a_iDistance); - } - - return a_i; - } - - class Try - { - public : - static const unsigned int SELECTORS = 8; // per half - - int m_iRed; - int m_iGreen; - int m_iBlue; - unsigned int m_uiCW; - unsigned int m_auiSelectors[SELECTORS]; - float m_fError; - }; - - class Half - { - public: - - static const unsigned int MAX_TRYS = 27; - - void Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, - unsigned int a_uiRadius); - - // center of trys - int m_iRed; - int m_iGreen; - int m_iBlue; - - const unsigned int *m_pauiPixelMapping; - unsigned int m_uiRadius; - - unsigned int m_uiTrys; - Try m_atry[MAX_TRYS]; - - Try *m_ptryBest; - }; - - Half m_half1; - Half m_half2; - - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcMath.cpp b/thirdparty/etc2comp/EtcMath.cpp deleted file mode 100644 index 096d5f7ab9..0000000000 --- a/thirdparty/etc2comp/EtcMath.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "EtcConfig.h" -#include "EtcMath.h" - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // calculate the line that best fits the set of XY points contained in a_afX[] and a_afY[] - // use a_fSlope and a_fOffset to define that line - // - bool Regression(float a_afX[], float a_afY[], unsigned int a_Points, - float *a_fSlope, float *a_fOffset) - { - float fPoints = (float)a_Points; - - float fSumX = 0.0f; - float fSumY = 0.0f; - float fSumXY = 0.0f; - float fSumX2 = 0.0f; - - for (unsigned int uiPoint = 0; uiPoint < a_Points; uiPoint++) - { - fSumX += a_afX[uiPoint]; - fSumY += a_afY[uiPoint]; - fSumXY += a_afX[uiPoint] * a_afY[uiPoint]; - fSumX2 += a_afX[uiPoint] * a_afX[uiPoint]; - } - - float fDivisor = fPoints*fSumX2 - fSumX*fSumX; - - // if vertical line - if (fDivisor == 0.0f) - { - *a_fSlope = 0.0f; - *a_fOffset = 0.0f; - return true; - } - - *a_fSlope = (fPoints*fSumXY - fSumX*fSumY) / fDivisor; - *a_fOffset = (fSumY - (*a_fSlope)*fSumX) / fPoints; - - return false; - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcMath.h b/thirdparty/etc2comp/EtcMath.h deleted file mode 100644 index c58c9a91bc..0000000000 --- a/thirdparty/etc2comp/EtcMath.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include <math.h> - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // return true if vertical line - bool Regression(float a_afX[], float a_afY[], unsigned int a_Points, - float *a_fSlope, float *a_fOffset); - - inline float ConvertMSEToPSNR(float a_fMSE) - { - if (a_fMSE == 0.0f) - { - return INFINITY; - } - - return 10.0f * log10f(1.0f / a_fMSE); - } - - -} diff --git a/thirdparty/etc2comp/EtcSortedBlockList.cpp b/thirdparty/etc2comp/EtcSortedBlockList.cpp deleted file mode 100644 index bfa6b7b3fa..0000000000 --- a/thirdparty/etc2comp/EtcSortedBlockList.cpp +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcSortedBlockList.cpp - -SortedBlockList is a list of 4x4 blocks that can be used by the "effort" system to prioritize -the encoding of the 4x4 blocks. 
- -The sorting is done with buckets, where each bucket is an indication of how much error each 4x4 block has - -*/ - -#include "EtcConfig.h" -#include "EtcSortedBlockList.h" - -#include "EtcBlock4x4.h" - -#include <stdio.h> -#include <string.h> -#include <assert.h> - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // construct an empty list - // - // allocate enough memory to add all of the image's 4x4 blocks later - // allocate enough buckets to sort the blocks - // - SortedBlockList::SortedBlockList(unsigned int a_uiImageBlocks, unsigned int a_uiBuckets) - { - m_uiImageBlocks = a_uiImageBlocks; - m_iBuckets = (int)a_uiBuckets; - - m_uiAddedBlocks = 0; - m_uiSortedBlocks = 0; - m_palinkPool = new Link[m_uiImageBlocks]; - m_pabucket = new Bucket[m_iBuckets]; - m_fMaxError = 0.0f; - - InitBuckets(); - - } - - // ---------------------------------------------------------------------------------------------------- - // - SortedBlockList::~SortedBlockList(void) - { - delete[] m_palinkPool; - delete[] m_pabucket; - } - - // ---------------------------------------------------------------------------------------------------- - // add a 4x4 block to the list - // the 4x4 block will be sorted later - // - void SortedBlockList::AddBlock(Block4x4 *a_pblock) - { - assert(m_uiAddedBlocks < m_uiImageBlocks); - Link *plink = &m_palinkPool[m_uiAddedBlocks++]; - plink->Init(a_pblock); - } - - // ---------------------------------------------------------------------------------------------------- - // sort all of the 4x4 blocks that have been added to the list - // - // first, determine the maximum error, then assign an error range to each bucket - // next, determine which bucket each 4x4 block belongs to based on the 4x4 block's error - // add the 4x4 block to the appropriate bucket - // lastly, walk thru the buckets and add each bucket to a sorted linked list - // - // the resultant sorting is an approximate sorting from most to least error - // - void SortedBlockList::Sort(void) - { - assert(m_uiAddedBlocks == m_uiImageBlocks); - InitBuckets(); - - // find max block error - m_fMaxError = -1.0f; - - for (unsigned int uiLink = 0; uiLink < m_uiAddedBlocks; uiLink++) - { - Link *plinkBlock = &m_palinkPool[uiLink]; - - float fBlockError = plinkBlock->GetBlock()->GetError(); - if (fBlockError > m_fMaxError) - { - m_fMaxError = fBlockError; - } - } - // prevent divide by zero or divide by negative - if (m_fMaxError <= 0.0f) - { - m_fMaxError = 1.0f; - } - //used for debugging - //int numDone = 0; - // put all of the blocks with unfinished encodings into the appropriate bucket - m_uiSortedBlocks = 0; - for (unsigned int uiLink = 0; uiLink < m_uiAddedBlocks; uiLink++) - { - Link *plinkBlock = &m_palinkPool[uiLink]; - - // if the encoding is done, don't add it to the list - if (plinkBlock->GetBlock()->GetEncoding()->IsDone()) - { - //numDone++; - continue; - } - - // calculate the appropriate sort bucket - float fBlockError = plinkBlock->GetBlock()->GetError(); - int iBucket = (int) floorf(m_iBuckets * fBlockError / m_fMaxError); - // clamp to bucket index - iBucket = iBucket < 0 ? 0 : iBucket >= m_iBuckets ? 
m_iBuckets - 1 : iBucket; - - // add block to bucket - { - Bucket *pbucket = &m_pabucket[iBucket]; - if (pbucket->plinkLast) - { - pbucket->plinkLast->SetNext(plinkBlock); - pbucket->plinkLast = plinkBlock; - } - else - { - pbucket->plinkFirst = pbucket->plinkLast = plinkBlock; - } - plinkBlock->SetNext(nullptr); - } - - m_uiSortedBlocks++; - - if (0) - { - printf("%u: e=%.3f\n", uiLink, fBlockError); - Print(); - printf("\n\n\n"); - } - } - //printf("num blocks already done: %d\n",numDone); - //link the blocks together across buckets - m_plinkFirst = nullptr; - m_plinkLast = nullptr; - for (int iBucket = m_iBuckets - 1; iBucket >= 0; iBucket--) - { - Bucket *pbucket = &m_pabucket[iBucket]; - - if (pbucket->plinkFirst) - { - if (m_plinkFirst == nullptr) - { - m_plinkFirst = pbucket->plinkFirst; - } - else - { - assert(pbucket->plinkLast->GetNext() == nullptr); - m_plinkLast->SetNext(pbucket->plinkFirst); - } - - m_plinkLast = pbucket->plinkLast; - } - } - - - } - - // ---------------------------------------------------------------------------------------------------- - // clear all of the buckets. normally done in preparation for a sort - // - void SortedBlockList::InitBuckets(void) - { - for (int iBucket = 0; iBucket < m_iBuckets; iBucket++) - { - Bucket *pbucket = &m_pabucket[iBucket]; - - pbucket->plinkFirst = 0; - pbucket->plinkLast = 0; - } - } - - // ---------------------------------------------------------------------------------------------------- - // print out the list of sorted 4x4 blocks - // normally used for debugging - // - void SortedBlockList::Print(void) - { - for (int iBucket = m_iBuckets-1; iBucket >= 0; iBucket--) - { - Bucket *pbucket = &m_pabucket[iBucket]; - - unsigned int uiBlocks = 0; - for (Link *plink = pbucket->plinkFirst; plink != nullptr; plink = plink->GetNext() ) - { - uiBlocks++; - - if (plink == pbucket->plinkLast) - { - break; - } - } - - float fBucketError = m_fMaxError * iBucket / m_iBuckets; - float fBucketRMS = sqrtf(fBucketError / (4.0f*16.0f) ); - printf("%3d: e=%.3f rms=%.6f %u\n", iBucket, fBucketError, fBucketRMS, uiBlocks); - } - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc diff --git a/thirdparty/etc2comp/EtcSortedBlockList.h b/thirdparty/etc2comp/EtcSortedBlockList.h deleted file mode 100644 index 960e8adc34..0000000000 --- a/thirdparty/etc2comp/EtcSortedBlockList.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -namespace Etc -{ - class Block4x4; - - class SortedBlockList - { - public: - - class Link - { - public: - - inline void Init(Block4x4 *a_pblock) - { - m_pblock = a_pblock; - m_plinkNext = nullptr; - } - - inline Block4x4 * GetBlock(void) - { - return m_pblock; - } - - inline void SetNext(Link *a_plinkNext) - { - m_plinkNext = a_plinkNext; - } - - inline Link * GetNext(void) - { - return m_plinkNext; - } - - inline Link * Advance(unsigned int a_uiSteps = 1) - { - Link *plink = this; - - for (unsigned int uiStep = 0; uiStep < a_uiSteps; uiStep++) - { - if (plink == nullptr) - { - break; - } - - plink = plink->m_plinkNext; - } - - return plink; - } - - private: - - Block4x4 *m_pblock; - Link *m_plinkNext; - }; - - SortedBlockList(unsigned int a_uiImageBlocks, unsigned int a_uiBuckets); - ~SortedBlockList(void); - - void AddBlock(Block4x4 *a_pblock); - - void Sort(void); - - inline Link * GetLinkToFirstBlock(void) - { - return m_plinkFirst; - } - - inline unsigned int GetNumberOfAddedBlocks(void) - { - return m_uiAddedBlocks; - } - - inline unsigned int GetNumberOfSortedBlocks(void) - { - return m_uiSortedBlocks; - } - - void Print(void); - - private: - - void InitBuckets(void); - - class Bucket - { - public: - Link *plinkFirst; - Link *plinkLast; - }; - - unsigned int m_uiImageBlocks; - int m_iBuckets; - - unsigned int m_uiAddedBlocks; - unsigned int m_uiSortedBlocks; - Link *m_palinkPool; - Bucket *m_pabucket; - float m_fMaxError; - - Link *m_plinkFirst; - Link *m_plinkLast; - - }; - -} // namespace Etc diff --git a/thirdparty/etc2comp/LICENSE b/thirdparty/etc2comp/LICENSE deleted file mode 100644 index d645695673..0000000000 --- a/thirdparty/etc2comp/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/thirdparty/etc2comp/README.md b/thirdparty/etc2comp/README.md deleted file mode 100644 index 2f4363d042..0000000000 --- a/thirdparty/etc2comp/README.md +++ /dev/null @@ -1,197 +0,0 @@ -# Etc2Comp - Texture to ETC2 compressor - -Etc2Comp is a command line tool that converts textures (e.g. bitmaps) -into the [ETC2](https://en.wikipedia.org/wiki/Ericsson_Texture_Compression) -format. The tool is built with a focus on encoding performance -to reduce the amount of time required to compile asset heavy applications as -well as reduce overall application size. - -This repo provides source code that can be compiled into a binary. The -binary can then be used to convert textures to the ETC2 format. - -Important: This is not an official Google product. It is an experimental -library published as-is. Please see the CONTRIBUTORS.md file for information -about questions or issues. 
- -## Setup -This project uses [CMake](https://cmake.org/) to generate platform-specific -build files: - - Linux: make files - - OS X: Xcode workspace files - - Microsoft Windows: Visual Studio solution files - - Note: CMake supports other formats, but this doc only provides steps for - one of each platform for brevity. - -Refer to each platform's setup section to setup your environment and build -an Etc2Comp binary. Then skip to the usage section of this page for examples -of how to use the library. - -### Setup for OS X - build tested on this config: - OS X 10.9.5 i7 16GB RAM - Xcode 5.1.1 - cmake 3.2.3 - -Start by downloading and installing the following components if they are not -already installed on your development machine. - - *Xcode* version 5.1.1, or greater - - [CMake](https://cmake.org/download/) version 3.2.3, or greater - -To build the Etc2Comp binary: - 1. Open a *Terminal* window and navigate to the project directory. - 1. Run `mkdir build_xcode` - 1. Run `cd build_xcode` - 1. Run `cmake -G Xcode ../` - 1. Open *Xcode* and import the `build_xcode/EtcTest.xcodeproj` file. - 1. Open the Product menu and choose Build For -> Running. - 1. Once the build succeeds the binary located at `build_xcode/EtcTool/Debug/EtcTool` -can be executed. - -Optional -Xcode EtcTool ‘Run’ preferences -note: if the build_xcode/EtcTest.xcodeproj is manually deleted then some Xcode preferences -will need to be set by hand after cmake is run (these prefs are retained across -cmake updates if the .xcodeproj is not deleted/removed) - -1. Set the active scheme to ‘EtcTool’ -1. Edit the scheme -1. Select option ‘Run EtcTool’, then tab ‘Arguments’. -Add this launch argument: ‘-argfile ../../EtcTool/args.txt’ -1. Select tab ‘Options’ and set a custom working directory to: ‘$(SRCROOT)/Build_Xcode/EtcTool’ - -### SetUp for Windows - -1. Open a *Terminal* window and navigate to the project directory. -1. Run `mkdir build_vs` -1. Run `cd build_vs` -1. Run CMAKE, noting what build version you need, and pointing to the parent directory as the source root; - For VS 2013 : `cmake -G "Visual Studio 12 2013 Win64" ../` - For VS 2015 : `cmake -G "Visual Studio 14 2015 Win64" ../` - NOTE: To see what supported Visual Studio outputs there are, run `cmake -G` -1. open the 'EtcTest' solution -1. make the 'EtcTool' project the start up project -1. (optional) in the project properties, under 'Debugging ->command arguments' -add the argfile textfile thats included in the EtcTool directory. -example: -argfile C:\etc2\EtcTool\Args.txt - -### Setup For Linux -The Linux build was tested on this config: - Ubuntu desktop 14.04 - gcc/g++ 4.8 - cmake 2.8.12.2 - -1. Verify linux has cmake and C++-11 capable g++ installed -1. Open shell -1. Run `mkdir build_linux` -1. Run `cd build_linux` -1. Run `cmake ../` -1. Run `make` -1. navigate to the newly created EtcTool directory `cd EtcTool` -1. run the executable: `./EtcTool -argfile ../../EtcTool/args.txt` - -Skip to the <a href="#usage">Usage</a> section for more information about using the -tool. - -## Usage - -### Command Line Usage -EtcTool can be run from the command line with the following usage: - etctool.exe source_image [options ...] -output encoded_image - -The encoder will use an array of RGBA floats read from the source_image to create -an ETC1 or ETC2 encoded image in encoded_image. The RGBA floats should be in the -range [0:1]. 
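For illustration, a typical invocation combining several of the options documented in this usage section might look like the sketch below; the input and output file names are placeholders and the option values are purely illustrative:

```
etctool.exe source.png -format RGBA8 -errormetric rgba -effort 60 -jobs 4 -mipmaps 4 -verbose -output encoded.ktx
```

Higher `-effort` values trade encoding time for quality, and `-jobs` enables multi-threaded encoding.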
- -Options: - - -analyze <analysis_folder> - -argfile <arg_file> additional command line arguments read from a file - -blockAtHV <H V> encodes a single block that contains the - pixel specified by the H V coordinates - -compare <comparison_image> compares source_image to comparison_image - -effort <amount> number between 0 and 100 to specify the encoding quality - (100 is the highest quality) - -errormetric <error_metric> specify the error metric, the options are - rgba, rgbx, rec709, numeric and normalxyz - -format <etc_format> ETC1, RGB8, SRGB8, RGBA8, SRGB8, RGB8A1, - SRGB8A1 or R11 - -help prints this message - -jobs or -j <thread_count> specifies the number of threads (default=1) - -normalizexyz normalize RGB to have a length of 1 - -verbose or -v shows status information during the encoding - process - -mipmaps or -m <mip_count> sets the maximum number of mipaps to generate (default=1) - -mipwrap or -w <x|y|xy> sets the mipmap filter wrap mode (default=clamp) - -* -analyze will run an analysis of the encoding and place it in folder -"analysis_folder" (e.g. ../analysis/kodim05). within the analysis_folder, a folder -will be created with a name of the current date/time (e.g. 20151204_153306). this -date/time folder is used to compare encodings of the same texture over time. -within the date/time folder is a text file with several encoding stats and a 2x png -image showing the encoding mode for each 4x4 block. - -* -argfile allows additional command line arguments to be placed in a text file - -* -blockAtHV selects the 4x4 pixel subset of the source image at position (H,V). -This is mainly used for debugging - -* -compare compares the source image to the created encoded image. The encoding -will dictate what error analysis is used in the comparison. - -* -effort uses an "amount" between 0 and 100 to determine how much additional effort -to apply during the encoding. - -* -errormetric selects the fitting algorithm used by the encoder. "rgba" calculates -RMS error using RGB components that are weighted by A. "rgbx" calculates RMS error -using RGBA components, where A is treated as an additional data channel, instead of -as alpha. "rec709" is similar to "rgba", except the RGB components are also weighted -according to Rec709. "numeric" calculates RMS error using unweighted RGBA components. -"normalize" calculates error based on dot product and vector length for RGB and RMS -error for A. - -* -help prints out the usage message - -* -jobs enables multi-threading to speed up image encoding - -* -normalizexyz normalizes the source RGB to have a length of 1. - -* -verbose shows information on the current encoding process. It will then display the -PSNR and time time it took to encode the image. - -* -mipmaps takes an argument that specifies how many mipmaps to generate from the -source image. The mipmaps are generated with a lanczos3 filter using edge clamping. -If the mipmaps option is not specified no mipmaps are created. - -* -mipwrap takes an argument that specifies the mipmap filter wrap mode. The options -are "x", "y" and "xy" which specify wrapping in x only, y only or x and y respectively. -The default options are clamping in both x and y. - -Note: Path names can use slashes or backslashes. The tool will convert the -slashes to the appropriate polarity for the current platform. - - -## API - -The library supports two different APIs - a C-like API that is not heavily -class-based and a class-based API. - -main() in EtcTool.cpp contains an example of both APIs. 
- -The Encode() method now returns an EncodingStatus that contains bit flags for -reporting various warnings and flags encountered when encoding. - - -## Copyright -Copyright 2015 Etc2Comp Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/thirdparty/etc2comp/patches/fix-rgba8-max-channels.patch b/thirdparty/etc2comp/patches/fix-rgba8-max-channels.patch deleted file mode 100644 index ea9b5640b6..0000000000 --- a/thirdparty/etc2comp/patches/fix-rgba8-max-channels.patch +++ /dev/null @@ -1,224 +0,0 @@ -diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp -index 5656556db9..5c7ebed788 100644 ---- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp -+++ b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp -@@ -508,7 +508,7 @@ namespace Etc - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { -- iMinRed1 = 15; -+ iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; -@@ -519,7 +519,7 @@ namespace Etc - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { -- iMinGreen1 = 15; -+ iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; -@@ -530,7 +530,7 @@ namespace Etc - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { -- iMinBlue1 = 15; -+ iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); -@@ -545,7 +545,7 @@ namespace Etc - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { -- iMinRed2 = 15; -+ iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; -@@ -556,7 +556,7 @@ namespace Etc - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { -- iMinGreen2 = 15; -+ iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; -@@ -567,7 +567,7 @@ namespace Etc - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { -- iMinBlue2 = 15; -+ iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) -@@ -761,7 +761,7 @@ namespace Etc - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { -- iMinRed1 = 15; -+ iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; -@@ -772,7 +772,7 @@ namespace Etc - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { -- iMinGreen1 = 15; -+ iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; -@@ -783,7 +783,7 @@ namespace Etc - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { -- iMinBlue1 = 15; -+ iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); -@@ -798,7 +798,7 @@ namespace Etc - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { -- iMinRed2 = 15; -+ iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; -@@ -809,7 +809,7 @@ namespace Etc - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { -- iMinGreen2 = 15; -+ iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - 
(int)a_uiRadius; -@@ -820,7 +820,7 @@ namespace Etc - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { -- iMinBlue2 = 15; -+ iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) -diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp -index ba2b42fb05..b94b64e68c 100644 ---- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp -+++ b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp -@@ -847,7 +847,7 @@ namespace Etc - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { -- iMinRed1 = 15; -+ iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; -@@ -858,7 +858,7 @@ namespace Etc - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { -- iMinGreen1 = 15; -+ iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; -@@ -869,7 +869,7 @@ namespace Etc - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { -- iMinBlue1 = 15; -+ iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); -@@ -884,7 +884,7 @@ namespace Etc - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { -- iMinRed2 = 15; -+ iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; -@@ -895,7 +895,7 @@ namespace Etc - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { -- iMinGreen2 = 15; -+ iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; -@@ -906,7 +906,7 @@ namespace Etc - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { -- iMinBlue2 = 15; -+ iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) -@@ -1108,7 +1108,7 @@ namespace Etc - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { -- iMinRed1 = 15; -+ iMaxRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; -@@ -1119,7 +1119,7 @@ namespace Etc - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { -- iMinGreen1 = 15; -+ iMaxGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; -@@ -1130,7 +1130,7 @@ namespace Etc - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { -- iMinBlue1 = 15; -+ iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); -@@ -1145,7 +1145,7 @@ namespace Etc - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { -- iMinRed2 = 15; -+ iMaxRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; -@@ -1156,7 +1156,7 @@ namespace Etc - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { -- iMinGreen2 = 15; -+ iMaxGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; -@@ -1167,7 +1167,7 @@ namespace Etc - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { -- iMinBlue2 = 15; -+ iMaxBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) diff --git a/thirdparty/etcpak/AUTHORS.txt b/thirdparty/etcpak/AUTHORS.txt new file mode 100644 index 0000000000..e7bae62c85 --- /dev/null +++ b/thirdparty/etcpak/AUTHORS.txt @@ -0,0 +1,3 @@ +Bartosz Taudul <wolf@nereid.pl> +Daniel Jungmann <el.3d.source@gmail.com> +Florian Penzkofer <fp@nullptr.de> diff --git a/thirdparty/etcpak/Dither.cpp b/thirdparty/etcpak/Dither.cpp new file mode 100644 index 0000000000..355686f26b --- /dev/null +++ b/thirdparty/etcpak/Dither.cpp @@ -0,0 +1,120 @@ 
+#include <algorithm> +#include <string.h> + +#include "Dither.hpp" +#include "Math.hpp" +#ifdef __SSE4_1__ +# ifdef _MSC_VER +# include <intrin.h> +# include <Windows.h> +# else +# include <x86intrin.h> +# endif +#endif + +#ifdef __AVX2__ +void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 ) +{ + static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 }; + static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 }; + static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 }; + static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 }; + + const __m256i BayerAdd0 = _mm256_setr_epi8( + a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0, + a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0 + ); + const __m256i BayerAdd1 = _mm256_setr_epi8( + a31[8], a63[8], a31[8], 0, a31[9], a63[9], a31[9], 0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0, + a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0 + ); + const __m256i BayerSub0 = _mm256_setr_epi8( + s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0, + s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0 + ); + const __m256i BayerSub1 = _mm256_setr_epi8( + s31[8], s63[8], s31[8], 0, s31[9], s63[9], s31[9], 0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0, + s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0 + ); + + __m256i l0 = _mm256_inserti128_si256( _mm256_castsi128_si256( px0 ), px1, 1 ); + __m256i l1 = _mm256_inserti128_si256( _mm256_castsi128_si256( px2 ), px3, 1 ); + + __m256i a0 = _mm256_adds_epu8( l0, BayerAdd0 ); + __m256i a1 = _mm256_adds_epu8( l1, BayerAdd1 ); + __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 ); + __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 ); + + _mm256_storeu_si256( (__m256i*)(data ), s0 ); + _mm256_storeu_si256( (__m256i*)(data+32), s1 ); + +} +#endif + +void Dither( uint8_t* data ) +{ +#ifdef __AVX2__ + static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 }; + static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 }; + static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 }; + static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 }; + + const __m256i BayerAdd0 = _mm256_setr_epi8( + a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0, + a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0 + ); + const __m256i BayerAdd1 = _mm256_setr_epi8( + a31[8], a63[8], a31[8], 0, a31[9], a63[9], a31[9], 0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0, + a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0 + ); + const __m256i BayerSub0 = _mm256_setr_epi8( + s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0, + s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0 + ); + const __m256i BayerSub1 = _mm256_setr_epi8( + 
s31[8], s63[8], s31[8], 0, s31[9], s63[9], s31[9], 0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0, + s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0 + ); + + __m256i px0 = _mm256_loadu_si256( (__m256i*)(data ) ); + __m256i px1 = _mm256_loadu_si256( (__m256i*)(data+32) ); + + __m256i a0 = _mm256_adds_epu8( px0, BayerAdd0 ); + __m256i a1 = _mm256_adds_epu8( px1, BayerAdd1 ); + __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 ); + __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 ); + + _mm256_storeu_si256( (__m256i*)(data ), s0 ); + _mm256_storeu_si256( (__m256i*)(data+32), s1 ); +#else + static constexpr int8_t Bayer31[16] = { + ( 0-8)*2/3, ( 8-8)*2/3, ( 2-8)*2/3, (10-8)*2/3, + (12-8)*2/3, ( 4-8)*2/3, (14-8)*2/3, ( 6-8)*2/3, + ( 3-8)*2/3, (11-8)*2/3, ( 1-8)*2/3, ( 9-8)*2/3, + (15-8)*2/3, ( 7-8)*2/3, (13-8)*2/3, ( 5-8)*2/3 + }; + static constexpr int8_t Bayer63[16] = { + ( 0-8)*2/6, ( 8-8)*2/6, ( 2-8)*2/6, (10-8)*2/6, + (12-8)*2/6, ( 4-8)*2/6, (14-8)*2/6, ( 6-8)*2/6, + ( 3-8)*2/6, (11-8)*2/6, ( 1-8)*2/6, ( 9-8)*2/6, + (15-8)*2/6, ( 7-8)*2/6, (13-8)*2/6, ( 5-8)*2/6 + }; + + for( int i=0; i<16; i++ ) + { + uint32_t col; + memcpy( &col, data, 4 ); + uint8_t r = col & 0xFF; + uint8_t g = ( col >> 8 ) & 0xFF; + uint8_t b = ( col >> 16 ) & 0xFF; + + r = clampu8( r + Bayer31[i] ); + g = clampu8( g + Bayer63[i] ); + b = clampu8( b + Bayer31[i] ); + + col = r | ( g << 8 ) | ( b << 16 ); + memcpy( data, &col, 4 ); + data += 4; + } +#endif +} diff --git a/thirdparty/etcpak/Dither.hpp b/thirdparty/etcpak/Dither.hpp new file mode 100644 index 0000000000..e43ce5676d --- /dev/null +++ b/thirdparty/etcpak/Dither.hpp @@ -0,0 +1,21 @@ +#ifndef __DITHER_HPP__ +#define __DITHER_HPP__ + +#include <stddef.h> +#include <stdint.h> + +#ifdef __AVX2__ +# ifdef _MSC_VER +# include <intrin.h> +# else +# include <x86intrin.h> +# endif +#endif + +void Dither( uint8_t* data ); + +#ifdef __AVX2__ +void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 ); +#endif + +#endif diff --git a/thirdparty/etcpak/ForceInline.hpp b/thirdparty/etcpak/ForceInline.hpp new file mode 100644 index 0000000000..b6f012841b --- /dev/null +++ b/thirdparty/etcpak/ForceInline.hpp @@ -0,0 +1,20 @@ +#ifndef __FORCEINLINE_HPP__ +#define __FORCEINLINE_HPP__ + +#if defined(__GNUC__) +# define etcpak_force_inline __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +# define etcpak_force_inline __forceinline +#else +# define etcpak_force_inline inline +#endif + +#if defined(__GNUC__) +# define etcpak_no_inline __attribute__((noinline)) +#elif defined(_MSC_VER) +# define etcpak_no_inline __declspec(noinline) +#else +# define etcpak_no_inline +#endif + +#endif diff --git a/thirdparty/etcpak/LICENSE.txt b/thirdparty/etcpak/LICENSE.txt new file mode 100644 index 0000000000..59e85d6ea5 --- /dev/null +++ b/thirdparty/etcpak/LICENSE.txt @@ -0,0 +1,26 @@ +etcpak, an extremely fast ETC compression utility (https://github.com/wolfpld/etcpak) + +Copyright (c) 2013-2021, Bartosz Taudul <wolf@nereid.pl> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the <organization> nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/thirdparty/etcpak/Math.hpp b/thirdparty/etcpak/Math.hpp new file mode 100644 index 0000000000..994e1ac4ea --- /dev/null +++ b/thirdparty/etcpak/Math.hpp @@ -0,0 +1,92 @@ +#ifndef __DARKRL__MATH_HPP__ +#define __DARKRL__MATH_HPP__ + +#include <algorithm> +#include <cmath> +#include <stdint.h> + +#include "ForceInline.hpp" + +template<typename T> +static etcpak_force_inline T AlignPOT( T val ) +{ + if( val == 0 ) return 1; + val--; + for( unsigned int i=1; i<sizeof( T ) * 8; i <<= 1 ) + { + val |= val >> i; + } + return val + 1; +} + +static etcpak_force_inline int CountSetBits( uint32_t val ) +{ + val -= ( val >> 1 ) & 0x55555555; + val = ( ( val >> 2 ) & 0x33333333 ) + ( val & 0x33333333 ); + val = ( ( val >> 4 ) + val ) & 0x0f0f0f0f; + val += val >> 8; + val += val >> 16; + return val & 0x0000003f; +} + +static etcpak_force_inline int CountLeadingZeros( uint32_t val ) +{ + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 32 - CountSetBits( val ); +} + +static etcpak_force_inline float sRGB2linear( float v ) +{ + const float a = 0.055f; + if( v <= 0.04045f ) + { + return v / 12.92f; + } + else + { + return pow( ( v + a ) / ( 1 + a ), 2.4f ); + } +} + +static etcpak_force_inline float linear2sRGB( float v ) +{ + const float a = 0.055f; + if( v <= 0.0031308f ) + { + return 12.92f * v; + } + else + { + return ( 1 + a ) * pow( v, 1/2.4f ) - a; + } +} + +template<class T> +static etcpak_force_inline T SmoothStep( T x ) +{ + return x*x*(3-2*x); +} + +static etcpak_force_inline uint8_t clampu8( int32_t val ) +{ + if( ( val & ~0xFF ) == 0 ) return val; + return ( ( ~val ) >> 31 ) & 0xFF; +} + +template<class T> +static etcpak_force_inline T sq( T val ) +{ + return val * val; +} + +static etcpak_force_inline int mul8bit( int a, int b ) +{ + int t = a*b + 128; + return ( t + ( t >> 8 ) ) >> 8; +} + +#endif diff --git a/thirdparty/etcpak/ProcessCommon.hpp b/thirdparty/etcpak/ProcessCommon.hpp new file mode 100644 index 0000000000..657d68888f --- /dev/null +++ b/thirdparty/etcpak/ProcessCommon.hpp @@ -0,0 +1,50 @@ +#ifndef __PROCESSCOMMON_HPP__ +#define __PROCESSCOMMON_HPP__ + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +template<class T> +static size_t GetLeastError( const T* err, size_t num ) +{ + size_t idx = 0; + for( size_t i=1; i<num; i++ ) + { + if( err[i] < err[idx] ) + { + 
idx = i; + } + } + return idx; +} + +static uint64_t FixByteOrder( uint64_t d ) +{ + return ( ( d & 0x00000000FFFFFFFF ) ) | + ( ( d & 0xFF00000000000000 ) >> 24 ) | + ( ( d & 0x000000FF00000000 ) << 24 ) | + ( ( d & 0x00FF000000000000 ) >> 8 ) | + ( ( d & 0x0000FF0000000000 ) << 8 ); +} + +template<class T, class S> +static uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id ) +{ + size_t tidx[2]; + tidx[0] = GetLeastError( terr[0], 8 ); + tidx[1] = GetLeastError( terr[1], 8 ); + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + for( int i=0; i<16; i++ ) + { + uint64_t t = tsel[i][tidx[id[i]%2]]; + d |= ( t & 0x1 ) << ( i + 32 ); + d |= ( t & 0x2 ) << ( i + 47 ); + } + + return d; +} + +#endif diff --git a/thirdparty/etcpak/ProcessDxtc.cpp b/thirdparty/etcpak/ProcessDxtc.cpp new file mode 100644 index 0000000000..508d55fd75 --- /dev/null +++ b/thirdparty/etcpak/ProcessDxtc.cpp @@ -0,0 +1,956 @@ +#include "Dither.hpp" +#include "ForceInline.hpp" +#include "ProcessDxtc.hpp" + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#ifdef __ARM_NEON +# include <arm_neon.h> +#endif + +#if defined __AVX__ && !defined __SSE4_1__ +# define __SSE4_1__ +#endif + +#if defined __SSE4_1__ || defined __AVX2__ +# ifdef _MSC_VER +# include <intrin.h> +# else +# include <x86intrin.h> +# ifndef _mm256_cvtsi256_si32 +# define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) ) +# endif +# endif +#endif + + +static etcpak_force_inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b ) +{ + return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 ); +} + +static etcpak_force_inline uint16_t to565( uint32_t c ) +{ + return + ( ( c & 0xF80000 ) >> 19 ) | + ( ( c & 0x00FC00 ) >> 5 ) | + ( ( c & 0x0000F8 ) << 8 ); +} + +static const uint8_t DxtcIndexTable[256] = { + 85, 87, 86, 84, 93, 95, 94, 92, 89, 91, 90, 88, 81, 83, 82, 80, + 117, 119, 118, 116, 125, 127, 126, 124, 121, 123, 122, 120, 113, 115, 114, 112, + 101, 103, 102, 100, 109, 111, 110, 108, 105, 107, 106, 104, 97, 99, 98, 96, + 69, 71, 70, 68, 77, 79, 78, 76, 73, 75, 74, 72, 65, 67, 66, 64, + 213, 215, 214, 212, 221, 223, 222, 220, 217, 219, 218, 216, 209, 211, 210, 208, + 245, 247, 246, 244, 253, 255, 254, 252, 249, 251, 250, 248, 241, 243, 242, 240, + 229, 231, 230, 228, 237, 239, 238, 236, 233, 235, 234, 232, 225, 227, 226, 224, + 197, 199, 198, 196, 205, 207, 206, 204, 201, 203, 202, 200, 193, 195, 194, 192, + 149, 151, 150, 148, 157, 159, 158, 156, 153, 155, 154, 152, 145, 147, 146, 144, + 181, 183, 182, 180, 189, 191, 190, 188, 185, 187, 186, 184, 177, 179, 178, 176, + 165, 167, 166, 164, 173, 175, 174, 172, 169, 171, 170, 168, 161, 163, 162, 160, + 133, 135, 134, 132, 141, 143, 142, 140, 137, 139, 138, 136, 129, 131, 130, 128, + 21, 23, 22, 20, 29, 31, 30, 28, 25, 27, 26, 24, 17, 19, 18, 16, + 53, 55, 54, 52, 61, 63, 62, 60, 57, 59, 58, 56, 49, 51, 50, 48, + 37, 39, 38, 36, 45, 47, 46, 44, 41, 43, 42, 40, 33, 35, 34, 32, + 5, 7, 6, 4, 13, 15, 14, 12, 9, 11, 10, 8, 1, 3, 2, 0 +}; + +static const uint8_t AlphaIndexTable_SSE[64] = { + 9, 15, 14, 13, 12, 11, 10, 8, 57, 63, 62, 61, 60, 59, 58, 56, + 49, 55, 54, 53, 52, 51, 50, 48, 41, 47, 46, 45, 44, 43, 42, 40, + 33, 39, 38, 37, 36, 35, 34, 32, 25, 31, 30, 29, 28, 27, 26, 24, + 17, 23, 22, 21, 20, 19, 18, 16, 1, 7, 6, 5, 4, 3, 2, 0, +}; + +static const uint16_t DivTable[255*3+1] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000, + 
0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000, + 0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555, + 0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000, + 0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc, + 0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa, + 0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924, + 0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800, + 0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c, + 0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666, + 0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1, + 0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555, + 0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec, + 0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492, + 0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444, + 0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400, + 0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3, + 0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e, + 0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e, + 0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333, + 0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c, + 0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8, + 0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8, + 0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa, + 0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f, + 0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276, + 0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e, + 0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249, + 0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 
0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234, + 0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222, + 0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210, + 0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200, + 0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0, + 0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1, + 0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4, + 0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7, + 0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba, + 0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af, + 0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4, + 0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199, + 0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f, + 0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186, + 0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d, + 0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174, + 0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c, + 0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164, + 0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c, + 0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156 +}; +static const uint16_t DivTableNEON[255*3+1] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000, + 0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa, + 0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800, + 0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666, + 0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555, + 0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492, + 0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 
0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400, + 0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e, + 0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333, + 0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8, + 0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa, + 0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276, + 0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249, + 0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222, + 0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200, + 0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1, + 0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7, + 0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af, + 0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199, + 0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186, + 0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174, + 0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164, + 0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155, + 0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147, + 0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b, + 0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f, + 0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124, + 0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a, + 0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111, + 0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108, + 0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100, + 0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8, + 0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0, + 0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 
0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea, + 0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3, + 0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd, + 0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7, + 0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2, + 0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc, + 0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7, + 0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3, + 0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be, + 0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba, + 0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6, + 0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2, + 0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae, + 0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab, +}; + +static const uint16_t DivTableAlpha[256] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xe38e, 0xcccc, 0xba2e, 0xaaaa, 0x9d89, 0x9249, 0x8888, 0x8000, + 0x7878, 0x71c7, 0x6bca, 0x6666, 0x6186, 0x5d17, 0x590b, 0x5555, 0x51eb, 0x4ec4, 0x4bda, 0x4924, 0x469e, 0x4444, 0x4210, 0x4000, + 0x3e0f, 0x3c3c, 0x3a83, 0x38e3, 0x3759, 0x35e5, 0x3483, 0x3333, 0x31f3, 0x30c3, 0x2fa0, 0x2e8b, 0x2d82, 0x2c85, 0x2b93, 0x2aaa, + 0x29cb, 0x28f5, 0x2828, 0x2762, 0x26a4, 0x25ed, 0x253c, 0x2492, 0x23ee, 0x234f, 0x22b6, 0x2222, 0x2192, 0x2108, 0x2082, 0x2000, + 0x1f81, 0x1f07, 0x1e91, 0x1e1e, 0x1dae, 0x1d41, 0x1cd8, 0x1c71, 0x1c0e, 0x1bac, 0x1b4e, 0x1af2, 0x1a98, 0x1a41, 0x19ec, 0x1999, + 0x1948, 0x18f9, 0x18ac, 0x1861, 0x1818, 0x17d0, 0x178a, 0x1745, 0x1702, 0x16c1, 0x1681, 0x1642, 0x1605, 0x15c9, 0x158e, 0x1555, + 0x151d, 0x14e5, 0x14af, 0x147a, 0x1446, 0x1414, 0x13e2, 0x13b1, 0x1381, 0x1352, 0x1323, 0x12f6, 0x12c9, 0x129e, 0x1273, 0x1249, + 0x121f, 0x11f7, 0x11cf, 0x11a7, 0x1181, 0x115b, 0x1135, 0x1111, 0x10ec, 0x10c9, 0x10a6, 0x1084, 0x1062, 0x1041, 0x1020, 0x1000, + 0x0fe0, 0x0fc0, 0x0fa2, 0x0f83, 0x0f66, 0x0f48, 0x0f2b, 0x0f0f, 0x0ef2, 0x0ed7, 0x0ebb, 0x0ea0, 0x0e86, 0x0e6c, 0x0e52, 0x0e38, + 0x0e1f, 0x0e07, 0x0dee, 0x0dd6, 0x0dbe, 0x0da7, 0x0d90, 0x0d79, 0x0d62, 0x0d4c, 0x0d36, 0x0d20, 0x0d0b, 0x0cf6, 0x0ce1, 0x0ccc, + 0x0cb8, 0x0ca4, 0x0c90, 0x0c7c, 0x0c69, 0x0c56, 0x0c43, 0x0c30, 0x0c1e, 0x0c0c, 0x0bfa, 0x0be8, 0x0bd6, 0x0bc5, 0x0bb3, 0x0ba2, + 0x0b92, 0x0b81, 0x0b70, 0x0b60, 0x0b50, 0x0b40, 0x0b30, 0x0b21, 0x0b11, 0x0b02, 0x0af3, 0x0ae4, 0x0ad6, 0x0ac7, 0x0ab8, 0x0aaa, + 0x0a9c, 0x0a8e, 0x0a80, 0x0a72, 0x0a65, 0x0a57, 0x0a4a, 0x0a3d, 0x0a30, 0x0a23, 0x0a16, 0x0a0a, 0x09fd, 0x09f1, 0x09e4, 0x09d8, + 0x09cc, 0x09c0, 0x09b4, 0x09a9, 0x099d, 0x0991, 0x0986, 0x097b, 0x0970, 0x0964, 0x095a, 0x094f, 
0x0944, 0x0939, 0x092f, 0x0924, + 0x091a, 0x090f, 0x0905, 0x08fb, 0x08f1, 0x08e7, 0x08dd, 0x08d3, 0x08ca, 0x08c0, 0x08b7, 0x08ad, 0x08a4, 0x089a, 0x0891, 0x0888, + 0x087f, 0x0876, 0x086d, 0x0864, 0x085b, 0x0853, 0x084a, 0x0842, 0x0839, 0x0831, 0x0828, 0x0820, 0x0818, 0x0810, 0x0808, 0x0800, +}; + +static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) +{ +#ifdef __SSE4_1__ + __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0); + __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1); + __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2); + __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3); + + __m128i smask = _mm_set1_epi32( 0xF8FCF8 ); + __m128i sd0 = _mm_and_si128( px0, smask ); + __m128i sd1 = _mm_and_si128( px1, smask ); + __m128i sd2 = _mm_and_si128( px2, smask ); + __m128i sd3 = _mm_and_si128( px3, smask ); + + __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128i sc0 = _mm_cmpeq_epi8(sd0, sc); + __m128i sc1 = _mm_cmpeq_epi8(sd1, sc); + __m128i sc2 = _mm_cmpeq_epi8(sd2, sc); + __m128i sc3 = _mm_cmpeq_epi8(sd3, sc); + + __m128i sm0 = _mm_and_si128(sc0, sc1); + __m128i sm1 = _mm_and_si128(sc2, sc3); + __m128i sm = _mm_and_si128(sm0, sm1); + + if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) ) + { + uint32_t c; + memcpy( &c, src, 4 ); + return uint64_t( to565( c ) ) << 16; + } + + __m128i min0 = _mm_min_epu8( px0, px1 ); + __m128i min1 = _mm_min_epu8( px2, px3 ); + __m128i min2 = _mm_min_epu8( min0, min1 ); + + __m128i max0 = _mm_max_epu8( px0, px1 ); + __m128i max1 = _mm_max_epu8( px2, px3 ); + __m128i max2 = _mm_max_epu8( max0, max1 ); + + __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i min4 = _mm_min_epu8( min2, min3 ); + __m128i max4 = _mm_max_epu8( max2, max3 ); + + __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i rmin = _mm_min_epu8( min4, min5 ); + __m128i rmax = _mm_max_epu8( max4, max5 ); + + __m128i range1 = _mm_subs_epu8( rmax, rmin ); + __m128i range2 = _mm_sad_epu8( rmax, rmin ); + + uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1; + __m128i range = _mm_set1_epi16( DivTable[vrange] ); + + __m128i inset1 = _mm_srli_epi16( range1, 4 ); + __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) ); + __m128i min = _mm_adds_epu8( rmin, inset ); + __m128i max = _mm_subs_epu8( rmax, inset ); + + __m128i c0 = _mm_subs_epu8( px0, rmin ); + __m128i c1 = _mm_subs_epu8( px1, rmin ); + __m128i c2 = _mm_subs_epu8( px2, rmin ); + __m128i c3 = _mm_subs_epu8( px3, rmin ); + + __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) ); + __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) ); + __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) ); + __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) ); + + __m128i s0 = _mm_hadd_epi16( is0, is1 ); + __m128i s1 = _mm_hadd_epi16( is2, is3 ); + + __m128i m0 = _mm_mulhi_epu16( s0, range ); + __m128i m1 = _mm_mulhi_epu16( s1, range ); + + __m128i p0 = _mm_packus_epi16( m0, m1 ); + + __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) ); + __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 ); + __m128i p3 = _mm_or_si128( p1, p2 ); + __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) ); + + uint32_t vmin = _mm_cvtsi128_si32( min ); + uint32_t vmax = _mm_cvtsi128_si32( max ); + uint32_t vp = _mm_cvtsi128_si32( p ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 
) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +#elif defined __ARM_NEON +# ifdef __aarch64__ + uint8x16x4_t px = vld4q_u8( src ); + + uint8x16_t lr = px.val[0]; + uint8x16_t lg = px.val[1]; + uint8x16_t lb = px.val[2]; + + uint8_t rmaxr = vmaxvq_u8( lr ); + uint8_t rmaxg = vmaxvq_u8( lg ); + uint8_t rmaxb = vmaxvq_u8( lb ); + + uint8_t rminr = vminvq_u8( lr ); + uint8_t rming = vminvq_u8( lg ); + uint8_t rminb = vminvq_u8( lb ); + + int rr = rmaxr - rminr; + int rg = rmaxg - rming; + int rb = rmaxb - rminb; + + int vrange1 = rr + rg + rb; + uint16_t vrange2 = DivTableNEON[vrange1]; + + uint8_t insetr = rr >> 4; + uint8_t insetg = rg >> 4; + uint8_t insetb = rb >> 4; + + uint8_t minr = rminr + insetr; + uint8_t ming = rming + insetg; + uint8_t minb = rminb + insetb; + + uint8_t maxr = rmaxr - insetr; + uint8_t maxg = rmaxg - insetg; + uint8_t maxb = rmaxb - insetb; + + uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) ); + uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) ); + uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) ); + + uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) ); + uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) ); + uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) ); + uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) ); + + int16x8_t range = vdupq_n_s16( vrange2 ); + uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) ); + uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) ); + + uint8x8_t p00 = vmovn_u16( m0 ); + uint8x8_t p01 = vmovn_u16( m1 ); + uint8x16_t p0 = vcombine_u8( p00, p01 ); + + uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) ); + uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) ); + uint32x4_t p3 = vaddq_u32( p1, p2 ); + + uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) ); + uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); + + uint32_t vp; + vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 ); + + return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) ); +# else + uint32x4_t px0 = vld1q_u32( (uint32_t*)src ); + uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 ); + uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 ); + uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 ); + + uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 ); + uint32x4_t sd0 = vandq_u32( smask, px0 ); + uint32x4_t sd1 = vandq_u32( smask, px1 ); + uint32x4_t sd2 = vandq_u32( smask, px2 ); + uint32x4_t sd3 = vandq_u32( smask, px3 ); + + uint32x4_t sc = vdupq_n_u32( sd0[0] ); + + uint32x4_t sc0 = vceqq_u32( sd0, sc ); + uint32x4_t sc1 = vceqq_u32( sd1, sc ); + uint32x4_t sc2 = vceqq_u32( sd2, sc ); + uint32x4_t sc3 = vceqq_u32( sd3, sc ); + + uint32x4_t sm0 = vandq_u32( sc0, sc1 ); + uint32x4_t sm1 = vandq_u32( sc2, sc3 ); + int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) ); + + if( sm[0] == -1 && sm[1] == -1 ) + { + return uint64_t( to565( src[0], src[1], src[2] ) ) << 16; + } + + uint32x4_t mask = vdupq_n_u32( 0xFFFFFF ); + uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) ); + uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) ); + uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) ); + uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( 
mask, px3 ) ); + + uint8x16_t min0 = vminq_u8( l0, l1 ); + uint8x16_t min1 = vminq_u8( l2, l3 ); + uint8x16_t min2 = vminq_u8( min0, min1 ); + + uint8x16_t max0 = vmaxq_u8( l0, l1 ); + uint8x16_t max1 = vmaxq_u8( l2, l3 ); + uint8x16_t max2 = vmaxq_u8( max0, max1 ); + + uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) ); + uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) ); + + uint8x16_t min4 = vminq_u8( min2, min3 ); + uint8x16_t max4 = vmaxq_u8( max2, max3 ); + + uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) ); + uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) ); + + uint8x16_t rmin = vminq_u8( min4, min5 ); + uint8x16_t rmax = vmaxq_u8( max4, max5 ); + + uint8x16_t range1 = vsubq_u8( rmax, rmin ); + uint8x8_t range2 = vget_low_u8( range1 ); + uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) ); + uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] ); + + uint16_t vrange1; + uint16x4_t range5 = vpadd_u16( range4, range4 ); + uint16x4_t range6 = vpadd_u16( range5, range5 ); + vst1_lane_u16( &vrange1, range6, 0 ); + + uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 ); + uint16x8_t range = vdupq_n_u16( vrange2 ); + + uint8x16_t inset = vshrq_n_u8( range1, 4 ); + uint8x16_t min = vaddq_u8( rmin, inset ); + uint8x16_t max = vsubq_u8( rmax, inset ); + + uint8x16_t c0 = vsubq_u8( l0, rmin ); + uint8x16_t c1 = vsubq_u8( l1, rmin ); + uint8x16_t c2 = vsubq_u8( l2, rmin ); + uint8x16_t c3 = vsubq_u8( l3, rmin ); + + uint16x8_t is0 = vpaddlq_u8( c0 ); + uint16x8_t is1 = vpaddlq_u8( c1 ); + uint16x8_t is2 = vpaddlq_u8( c2 ); + uint16x8_t is3 = vpaddlq_u8( c3 ); + + uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) ); + uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) ); + uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) ); + uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) ); + + uint16x8_t s0 = vcombine_u16( is4, is5 ); + uint16x8_t s1 = vcombine_u16( is6, is7 ); + + uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) ); + uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) ); + + uint8x8_t p00 = vmovn_u16( m0 ); + uint8x8_t p01 = vmovn_u16( m1 ); + uint8x16_t p0 = vcombine_u8( p00, p01 ); + + uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) ); + uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) ); + uint32x4_t p3 = vaddq_u32( p1, p2 ); + + uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) ); + uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); + + uint32_t vmin, vmax, vp; + vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 ); + vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 ); + vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +# endif +#else + uint32_t ref; + memcpy( &ref, src, 4 ); + uint32_t refMask = ref & 0xF8FCF8; + auto stmp = src + 4; + for( int i=1; i<16; i++ ) + { + uint32_t px; + memcpy( &px, stmp, 4 ); + if( ( px & 0xF8FCF8 ) != refMask ) break; + stmp += 4; + } + if( stmp == src + 64 ) + { + 
return uint64_t( to565( ref ) ) << 16; + } + + uint8_t min[3] = { src[0], src[1], src[2] }; + uint8_t max[3] = { src[0], src[1], src[2] }; + auto tmp = src + 4; + for( int i=1; i<16; i++ ) + { + for( int j=0; j<3; j++ ) + { + if( tmp[j] < min[j] ) min[j] = tmp[j]; + else if( tmp[j] > max[j] ) max[j] = tmp[j]; + } + tmp += 4; + } + + const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]]; + const uint32_t rmin = min[0] + min[1] + min[2]; + for( int i=0; i<3; i++ ) + { + const uint8_t inset = ( max[i] - min[i] ) >> 4; + min[i] += inset; + max[i] -= inset; + } + + uint32_t data = 0; + for( int i=0; i<16; i++ ) + { + const uint32_t c = src[0] + src[1] + src[2] - rmin; + const uint8_t idx = ( c * range ) >> 16; + data |= idx << (i*2); + src += 4; + } + + return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) ); +#endif +} + +#ifdef __AVX2__ +static etcpak_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst ) +{ + __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0); + __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1); + __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2); + __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3); + + __m256i smask = _mm256_set1_epi32( 0xF8FCF8 ); + __m256i sd0 = _mm256_and_si256( px0, smask ); + __m256i sd1 = _mm256_and_si256( px1, smask ); + __m256i sd2 = _mm256_and_si256( px2, smask ); + __m256i sd3 = _mm256_and_si256( px3, smask ); + + __m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256i sc0 = _mm256_cmpeq_epi8(sd0, sc); + __m256i sc1 = _mm256_cmpeq_epi8(sd1, sc); + __m256i sc2 = _mm256_cmpeq_epi8(sd2, sc); + __m256i sc3 = _mm256_cmpeq_epi8(sd3, sc); + + __m256i sm0 = _mm256_and_si256(sc0, sc1); + __m256i sm1 = _mm256_and_si256(sc2, sc3); + __m256i sm = _mm256_and_si256(sm0, sm1); + + const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) ); + const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) ); + + if( solid0 + solid1 == 0 ) + { + const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) ); + const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) ); + memcpy( dst, &c0, 8 ); + memcpy( dst+8, &c1, 8 ); + dst += 16; + return; + } + + __m256i min0 = _mm256_min_epu8( px0, px1 ); + __m256i min1 = _mm256_min_epu8( px2, px3 ); + __m256i min2 = _mm256_min_epu8( min0, min1 ); + + __m256i max0 = _mm256_max_epu8( px0, px1 ); + __m256i max1 = _mm256_max_epu8( px2, px3 ); + __m256i max2 = _mm256_max_epu8( max0, max1 ); + + __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m256i min4 = _mm256_min_epu8( min2, min3 ); + __m256i max4 = _mm256_max_epu8( max2, max3 ); + + __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m256i rmin = _mm256_min_epu8( min4, min5 ); + __m256i rmax = _mm256_max_epu8( max4, max5 ); + + __m256i range1 = _mm256_subs_epu8( rmax, rmin ); + __m256i range2 = _mm256_sad_epu8( rmax, rmin ); + + uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1]; + uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1]; + __m256i range00 = _mm256_set1_epi16( vrange0 ); + __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 ); + + __m256i inset1 = _mm256_srli_epi16( range1, 4 ); + 
__m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) ); + __m256i min = _mm256_adds_epu8( rmin, inset ); + __m256i max = _mm256_subs_epu8( rmax, inset ); + + __m256i c0 = _mm256_subs_epu8( px0, rmin ); + __m256i c1 = _mm256_subs_epu8( px1, rmin ); + __m256i c2 = _mm256_subs_epu8( px2, rmin ); + __m256i c3 = _mm256_subs_epu8( px3, rmin ); + + __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) ); + __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) ); + __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) ); + __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) ); + + __m256i s0 = _mm256_hadd_epi16( is0, is1 ); + __m256i s1 = _mm256_hadd_epi16( is2, is3 ); + + __m256i m0 = _mm256_mulhi_epu16( s0, range ); + __m256i m1 = _mm256_mulhi_epu16( s1, range ); + + __m256i p0 = _mm256_packus_epi16( m0, m1 ); + + __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) ); + __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 ); + __m256i p3 = _mm256_or_si256( p1, p2 ); + __m256i p =_mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) ); + + __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min ); + __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max ); + __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 ); + __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 ); + __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 ); + __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 ); + __m256i mm3 = _mm256_or_si256( mmr, mmg ); + __m256i mm4 = _mm256_or_si256( mm3, mmb ); + __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) ); + + __m256i d0 = _mm256_unpacklo_epi32( mm5, p ); + __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) ); + __m128i d2 = _mm256_castsi256_si128( d1 ); + + __m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 ); + __m128i d3 = _mm_and_si128( d2, mask ); + _mm_storeu_si128( (__m128i*)dst, d3 ); + + for( int j=4; j<8; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]]; + for( int j=12; j<16; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]]; + + dst += 16; +} +#endif + +static const uint8_t AlphaIndexTable[8] = { 1, 7, 6, 5, 4, 3, 2, 0 }; + +static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src ) +{ + uint8_t solid8 = *src; + uint16_t solid16 = uint16_t( solid8 ) | ( uint16_t( solid8 ) << 8 ); + uint32_t solid32 = uint32_t( solid16 ) | ( uint32_t( solid16 ) << 16 ); + uint64_t solid64 = uint64_t( solid32 ) | ( uint64_t( solid32 ) << 32 ); + if( memcmp( src, &solid64, 8 ) == 0 && memcmp( src+8, &solid64, 8 ) == 0 ) + { + return solid8; + } + + uint8_t min = src[0]; + uint8_t max = min; + for( int i=1; i<16; i++ ) + { + const auto v = src[i]; + if( v > max ) max = v; + else if( v < min ) min = v; + } + + uint32_t range = ( 8 << 13 ) / ( 1 + max - min ); + uint64_t data = 0; + for( int i=0; i<16; i++ ) + { + uint8_t a = src[i] - min; + uint64_t idx = AlphaIndexTable[( a * range ) >> 13]; + data |= idx << (i*3); + } + + return max | ( min << 8 ) | ( data << 16 ); +} + +#ifdef __SSE4_1__ +static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 ) +{ + __m128i smask = _mm_set1_epi32( 0xF8FCF8 ); + __m128i sd0 = _mm_and_si128( px0, smask ); + __m128i sd1 = _mm_and_si128( px1, smask ); + __m128i sd2 = _mm_and_si128( px2, smask ); + __m128i sd3 = _mm_and_si128( px3, smask ); + + __m128i sc = _mm_shuffle_epi32(sd0, 
_MM_SHUFFLE(0, 0, 0, 0)); + + __m128i sc0 = _mm_cmpeq_epi8(sd0, sc); + __m128i sc1 = _mm_cmpeq_epi8(sd1, sc); + __m128i sc2 = _mm_cmpeq_epi8(sd2, sc); + __m128i sc3 = _mm_cmpeq_epi8(sd3, sc); + + __m128i sm0 = _mm_and_si128(sc0, sc1); + __m128i sm1 = _mm_and_si128(sc2, sc3); + __m128i sm = _mm_and_si128(sm0, sm1); + + if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) ) + { + return uint64_t( to565( _mm_cvtsi128_si32( px0 ) ) ) << 16; + } + + px0 = _mm_and_si128( px0, _mm_set1_epi32( 0xFFFFFF ) ); + px1 = _mm_and_si128( px1, _mm_set1_epi32( 0xFFFFFF ) ); + px2 = _mm_and_si128( px2, _mm_set1_epi32( 0xFFFFFF ) ); + px3 = _mm_and_si128( px3, _mm_set1_epi32( 0xFFFFFF ) ); + + __m128i min0 = _mm_min_epu8( px0, px1 ); + __m128i min1 = _mm_min_epu8( px2, px3 ); + __m128i min2 = _mm_min_epu8( min0, min1 ); + + __m128i max0 = _mm_max_epu8( px0, px1 ); + __m128i max1 = _mm_max_epu8( px2, px3 ); + __m128i max2 = _mm_max_epu8( max0, max1 ); + + __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i min4 = _mm_min_epu8( min2, min3 ); + __m128i max4 = _mm_max_epu8( max2, max3 ); + + __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i rmin = _mm_min_epu8( min4, min5 ); + __m128i rmax = _mm_max_epu8( max4, max5 ); + + __m128i range1 = _mm_subs_epu8( rmax, rmin ); + __m128i range2 = _mm_sad_epu8( rmax, rmin ); + + uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1; + __m128i range = _mm_set1_epi16( DivTable[vrange] ); + + __m128i inset1 = _mm_srli_epi16( range1, 4 ); + __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) ); + __m128i min = _mm_adds_epu8( rmin, inset ); + __m128i max = _mm_subs_epu8( rmax, inset ); + + __m128i c0 = _mm_subs_epu8( px0, rmin ); + __m128i c1 = _mm_subs_epu8( px1, rmin ); + __m128i c2 = _mm_subs_epu8( px2, rmin ); + __m128i c3 = _mm_subs_epu8( px3, rmin ); + + __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) ); + __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) ); + __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) ); + __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) ); + + __m128i s0 = _mm_hadd_epi16( is0, is1 ); + __m128i s1 = _mm_hadd_epi16( is2, is3 ); + + __m128i m0 = _mm_mulhi_epu16( s0, range ); + __m128i m1 = _mm_mulhi_epu16( s1, range ); + + __m128i p0 = _mm_packus_epi16( m0, m1 ); + + __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) ); + __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 ); + __m128i p3 = _mm_or_si128( p1, p2 ); + __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) ); + + uint32_t vmin = _mm_cvtsi128_si32( min ); + uint32_t vmax = _mm_cvtsi128_si32( max ); + uint32_t vp = _mm_cvtsi128_si32( p ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +} + +static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 ) +{ + __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 ); + + __m128i m0 = _mm_shuffle_epi8( px0, mask ); + __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); + __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); + __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); + __m128i m4 = _mm_or_si128( m0, m1 ); + __m128i m5 = _mm_or_si128( m2, m3 ); + __m128i a = 
_mm_or_si128( m4, m5 ); + + __m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() ); + __m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp ); + if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) ) + { + return _mm_cvtsi128_si32( a ) & 0xFF; + } + + __m128i a1 = _mm_shuffle_epi32( a, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max1 = _mm_max_epu8( a, a1 ); + __m128i min1 = _mm_min_epu8( a, a1 ); + __m128i amax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i amin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max2 = _mm_max_epu8( max1, amax2 ); + __m128i min2 = _mm_min_epu8( min1, amin2 ); + __m128i amax3 = _mm_alignr_epi8( max2, max2, 2 ); + __m128i amin3 = _mm_alignr_epi8( min2, min2, 2 ); + __m128i max3 = _mm_max_epu8( max2, amax3 ); + __m128i min3 = _mm_min_epu8( min2, amin3 ); + __m128i amax4 = _mm_alignr_epi8( max3, max3, 1 ); + __m128i amin4 = _mm_alignr_epi8( min3, min3, 1 ); + __m128i max = _mm_max_epu8( max3, amax4 ); + __m128i min = _mm_min_epu8( min3, amin4 ); + __m128i minmax = _mm_unpacklo_epi8( max, min ); + + __m128i r = _mm_sub_epi8( max, min ); + int range = _mm_cvtsi128_si32( r ) & 0xFF; + __m128i rv = _mm_set1_epi16( DivTableAlpha[range] ); + + __m128i v = _mm_sub_epi8( a, min ); + + __m128i lo16 = _mm_unpacklo_epi8( v, _mm_setzero_si128() ); + __m128i hi16 = _mm_unpackhi_epi8( v, _mm_setzero_si128() ); + + __m128i lomul = _mm_mulhi_epu16( lo16, rv ); + __m128i himul = _mm_mulhi_epu16( hi16, rv ); + + __m128i p0 = _mm_packus_epi16( lomul, himul ); + __m128i p1 = _mm_or_si128( _mm_and_si128( p0, _mm_set1_epi16( 0x3F ) ), _mm_srai_epi16( _mm_and_si128( p0, _mm_set1_epi16( 0x3F00 ) ), 5 ) ); + __m128i p2 = _mm_packus_epi16( p1, p1 ); + + uint64_t pi = _mm_cvtsi128_si64( p2 ); + uint64_t data = 0; + for( int i=0; i<8; i++ ) + { + uint64_t idx = AlphaIndexTable_SSE[(pi>>(i*8)) & 0x3F]; + data |= idx << (i*6); + } + return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 ); +} +#endif + +void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ +#ifdef __AVX2__ + if( width%8 == 0 ) + { + blocks /= 2; + uint32_t buf[8*4]; + int i = 0; + char* dst8 = (char*)dst; + + do + { + auto tmp = (char*)buf; + memcpy( tmp, src + width * 0, 8*4 ); + memcpy( tmp + 8*4, src + width * 1, 8*4 ); + memcpy( tmp + 16*4, src + width * 2, 8*4 ); + memcpy( tmp + 24*4, src + width * 3, 8*4 ); + src += 8; + if( ++i == width/8 ) + { + src += width * 3; + i = 0; + } + + ProcessRGB_AVX( (uint8_t*)buf, dst8 ); + } + while( --blocks ); + } + else +#endif + { + uint32_t buf[4*4]; + int i = 0; + + auto ptr = dst; + do + { + auto tmp = (char*)buf; + memcpy( tmp, src + width * 0, 4*4 ); + memcpy( tmp + 4*4, src + width * 1, 4*4 ); + memcpy( tmp + 8*4, src + width * 2, 4*4 ); + memcpy( tmp + 12*4, src + width * 3, 4*4 ); + src += 4; + if( ++i == width/4 ) + { + src += width * 3; + i = 0; + } + + const auto c = ProcessRGB( (uint8_t*)buf ); + uint8_t fix[8]; + memcpy( fix, &c, 8 ); + for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]]; + memcpy( ptr, fix, sizeof( uint64_t ) ); + ptr++; + } + while( --blocks ); + } +} + +void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + uint32_t buf[4*4]; + int i = 0; + + auto ptr = dst; + do + { + auto tmp = (char*)buf; + memcpy( tmp, src + width * 0, 4*4 ); + memcpy( tmp + 4*4, src + width * 1, 4*4 ); + memcpy( tmp + 8*4, src + width * 2, 4*4 ); + memcpy( tmp + 12*4, src + width * 3, 4*4 ); + src += 4; + if( ++i == width/4 ) + { + src += width * 3; + 
i = 0; + } + + Dither( (uint8_t*)buf ); + + const auto c = ProcessRGB( (uint8_t*)buf ); + uint8_t fix[8]; + memcpy( fix, &c, 8 ); + for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]]; + memcpy( ptr, fix, sizeof( uint64_t ) ); + ptr++; + } + while( --blocks ); +} + +void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int i = 0; + auto ptr = dst; + do + { +#ifdef __SSE4_1__ + __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) ); + __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) ); + __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) ); + __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) ); + + src += 4; + if( ++i == width/4 ) + { + src += width * 3; + i = 0; + } + + *ptr++ = ProcessAlpha_SSE( px0, px1, px2, px3 ); + + const auto c = ProcessRGB_SSE( px0, px1, px2, px3 ); + uint8_t fix[8]; + memcpy( fix, &c, 8 ); + for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]]; + memcpy( ptr, fix, sizeof( uint64_t ) ); + ptr++; +#else + uint32_t rgba[4*4]; + uint8_t alpha[4*4]; + + auto tmp = (char*)rgba; + memcpy( tmp, src + width * 0, 4*4 ); + memcpy( tmp + 4*4, src + width * 1, 4*4 ); + memcpy( tmp + 8*4, src + width * 2, 4*4 ); + memcpy( tmp + 12*4, src + width * 3, 4*4 ); + src += 4; + if( ++i == width/4 ) + { + src += width * 3; + i = 0; + } + + for( int i=0; i<16; i++ ) + { + alpha[i] = rgba[i] >> 24; + rgba[i] &= 0xFFFFFF; + } + *ptr++ = ProcessAlpha( alpha ); + + const auto c = ProcessRGB( (uint8_t*)rgba ); + uint8_t fix[8]; + memcpy( fix, &c, 8 ); + for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]]; + memcpy( ptr, fix, sizeof( uint64_t ) ); + ptr++; +#endif + } + while( --blocks ); +} diff --git a/thirdparty/etcpak/ProcessDxtc.hpp b/thirdparty/etcpak/ProcessDxtc.hpp new file mode 100644 index 0000000000..8e0b12e4bd --- /dev/null +++ b/thirdparty/etcpak/ProcessDxtc.hpp @@ -0,0 +1,11 @@ +#ifndef __PROCESSDXT1_HPP__ +#define __PROCESSDXT1_HPP__ + +#include <stddef.h> +#include <stdint.h> + +void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); + +#endif diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp new file mode 100644 index 0000000000..7f4524d105 --- /dev/null +++ b/thirdparty/etcpak/ProcessRGB.cpp @@ -0,0 +1,3100 @@ +#include <array> +#include <string.h> +#include <limits> + +#ifdef __ARM_NEON +# include <arm_neon.h> +#endif + +#include "Dither.hpp" +#include "ForceInline.hpp" +#include "Math.hpp" +#include "ProcessCommon.hpp" +#include "ProcessRGB.hpp" +#include "Tables.hpp" +#include "Vector.hpp" +#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER +# ifdef _MSC_VER +# include <intrin.h> +# include <Windows.h> +# define _bswap(x) _byteswap_ulong(x) +# define _bswap64(x) _byteswap_uint64(x) +# else +# include <x86intrin.h> +# endif +#endif + +#ifndef _bswap +# define _bswap(x) __builtin_bswap32(x) +# define _bswap64(x) __builtin_bswap64(x) +#endif + +namespace +{ + +#if defined _MSC_VER && !defined __clang__ +static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask ) +{ + unsigned long ret; + _BitScanForward( &ret, mask ); + return ret; +} +#endif + +typedef std::array<uint16_t, 4> v4i; + +#ifdef __AVX2__ +static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept +{ + __m128i d0 = 
_mm_loadu_si128(((__m128i*)data) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); + + __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); + + __m256i t0 = _mm256_cvtepu8_epi16(dm0); + __m256i t1 = _mm256_cvtepu8_epi16(dm1); + __m256i t2 = _mm256_cvtepu8_epi16(dm2); + __m256i t3 = _mm256_cvtepu8_epi16(dm3); + + __m256i sum0 = _mm256_add_epi16(t0, t1); + __m256i sum1 = _mm256_add_epi16(t2, t3); + + __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3 + __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2 + + __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2)); + __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3)); + __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2)); + __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3)); + + __m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0 + __m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2 + return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 3+2 +} + +static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept +{ + __m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4)); + + return _mm256_srli_epi16(a, 3); +} + +static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept +{ + // + __m256i a0 = _mm256_load_si256((__m256i*)a[0].data()); + __m256i a1 = _mm256_load_si256((__m256i*)a[4].data()); + + // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); + __m256i a4 = _mm256_madd_epi16(a0, a0); + __m256i a5 = _mm256_madd_epi16(a1, a1); + + __m256i a6 = _mm256_hadd_epi32(a4, a5); + __m256i a7 = _mm256_slli_epi32(a6, 3); + + __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow + + // average is not swapped + // err -= block[0] * 2 * average[0]; + // err -= block[1] * 2 * average[1]; + // err -= block[2] * 2 * average[2]; + __m256i a2 = _mm256_slli_epi16(a0, 1); + __m256i a3 = _mm256_slli_epi16(a1, 1); + __m256i b0 = _mm256_madd_epi16(a2, data); + __m256i b1 = _mm256_madd_epi16(a3, data); + + __m256i b2 = _mm256_hadd_epi32(b0, b1); + __m256i b3 = _mm256_sub_epi32(a8, b2); + __m256i b4 = _mm256_hadd_epi32(b3, b3); + + __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0)); + + return _mm256_castsi256_si128(b5); +} + +static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept +{ + __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128)); + + __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8); + + __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); + __m256i diff = _mm256_sub_epi16(c, c1); + diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4)); + diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3)); + + __m256i co = _mm256_add_epi16(c1, diff); + + c = _mm256_blend_epi16(co, c, 0xF0); + + __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2)); + + _mm256_store_si256((__m256i*)a[4].data(), a0); + + __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128)); 
+ __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8); + + __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4)); + + _mm256_store_si256((__m256i*)a[0].data(), t2); +} + +static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept +{ + uint64_t d = ( idx << 24 ); + size_t base = idx << 1; + + __m128i a0 = _mm_load_si128((const __m128i*)a[base].data()); + + __m128i r0, r1; + + if( ( idx & 0x2 ) == 0 ) + { + r0 = _mm_srli_epi16(a0, 4); + + __m128i a1 = _mm_unpackhi_epi64(r0, r0); + r1 = _mm_slli_epi16(a1, 4); + } + else + { + __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8)); + + r0 = _mm_unpackhi_epi64(a1, a1); + __m128i a2 = _mm_sub_epi16(a1, r0); + __m128i a3 = _mm_srai_epi16(a2, 3); + r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07)); + } + + __m128i r2 = _mm_or_si128(r0, r1); + // do missing swap for average values + __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2)); + __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128()); + d |= _mm_cvtsi128_si32(r4); + + return d; +} + +static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept +{ + __m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0); + __m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1); + + __m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0)); + + __m256i c0 = _mm256_cmpeq_epi8(d0, c); + __m256i c1 = _mm256_cmpeq_epi8(d1, c); + + __m256i m = _mm256_and_si256(c0, c1); + + if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1))) + { + return 0; + } + + return 0x02000000 | + ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | + ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | + ( (unsigned int)( src[2] & 0xF8 ) ); +} + +static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept +{ + __m256i sum4 = Sum4_AVX2( src ); + + ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); + + return CalcErrorBlock_AVX2( sum4, a); +} + +static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept +{ + ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); + + return CalcErrorBlock_AVX2( sum4, a); +} + +static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept +{ + __m256i sel0 = _mm256_setzero_si256(); + __m256i sel1 = _mm256_setzero_si256(); + + for (unsigned int j = 0; j < 2; ++j) + { + unsigned int bid = offset + 1 - j; + + __m256i squareErrorSum = _mm256_setzero_si256(); + + __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data()); + __m256i a1 = _mm256_broadcastq_epi64(a0); + + // Processing one full row each iteration + for (size_t i = 0; i < 8; i += 4) + { + __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); + + __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); + __m256i d = _mm256_sub_epi16(a1, rgb16); + + // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 + // This produces slightly different results, but is significant faster + __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); + __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); + __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); + __m128i pixel3 = _mm256_castsi256_si128(pixel2); + + __m128i pix0 = _mm_broadcastw_epi16(pixel3); + __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + // Processing first two 
pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. + __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + // This produces slightly different results, but is significant faster + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); + + sel0 = _mm256_or_si256(sel0, minIndexLo2); + sel1 = _mm256_or_si256(sel1, minIndexHi2); + } + + pixel3 = _mm256_extracti128_si256(pixel2, 1); + pix0 = _mm_broadcastw_epi16(pixel3); + pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + // Processing second two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. 
+ __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); + __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); + __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); + + sel0 = _mm256_or_si256(sel0, minIndexLo3); + sel1 = _mm256_or_si256(sel1, minIndexHi3); + } + } + + data += 8 * 4; + + _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum); + } + + // Interleave selector bits + __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); + __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); + + __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); + __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); + + __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); + + __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); + + _mm256_store_si256((__m256i*)tsel, sel); +} + +static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept +{ + __m256i sel0 = _mm256_setzero_si256(); + __m256i sel1 = _mm256_setzero_si256(); + + __m256i squareErrorSum0 = _mm256_setzero_si256(); + __m256i squareErrorSum1 = _mm256_setzero_si256(); + + __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data()); + __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data()); + + __m128i a2 = _mm_broadcastq_epi64(a0); + __m128i a3 = _mm_broadcastq_epi64(a1); + __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1); + + // Processing one full row each iteration + for (size_t i = 0; i < 16; i += 4) + { + __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); + + __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); + __m256i d = _mm256_sub_epi16(a4, rgb16); + + // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 + // This produces slightly different results, but is significant faster + __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); + __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); + __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); + __m128i pixel3 = _mm256_castsi256_si128(pixel2); + + __m128i pix0 = _mm_broadcastw_epi16(pixel3); + __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), 
pix1, 1); + + // Processing first two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. + __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); + + sel0 = _mm256_or_si256(sel0, minIndexLo2); + sel1 = _mm256_or_si256(sel1, minIndexHi2); + } + + pixel3 = _mm256_extracti128_si256(pixel2, 1); + pix0 = _mm_broadcastw_epi16(pixel3); + pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + // Processing second two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. 
+ __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); + __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); + __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); + + sel0 = _mm256_or_si256(sel0, minIndexLo3); + sel1 = _mm256_or_si256(sel1, minIndexHi3); + } + } + + _mm256_store_si256((__m256i*)terr[1], squareErrorSum0); + _mm256_store_si256((__m256i*)terr[0], squareErrorSum1); + + // Interleave selector bits + __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); + __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); + + __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); + __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); + + __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); + + __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); + + _mm256_store_si256((__m256i*)tsel, sel); +} + +static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept +{ + size_t tidx[2]; + + // Get index of minimum error (terr[0] and terr[1]) + __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); + __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); + + __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); + __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); + + __m256i errMin0 = _mm256_min_epu32(errLo, errHi); + + __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); + + __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); + + __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); + __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); + + __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); + __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); + + uint32_t mask0 = _mm256_movemask_epi8(errMask0); + uint32_t mask1 = _mm256_movemask_epi8(errMask1); + + tidx[0] = _bit_scan_forward(mask0) >> 2; + tidx[1] = _bit_scan_forward(mask1) >> 2; + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + + unsigned int t0 = tsel[tidx[0]]; + unsigned int t1 = tsel[tidx[1]]; + + if (!rotate) + { + t0 &= 0xFF00FF00; + t1 &= 0x00FF00FF; + } + else + { + t0 &= 0xCCCCCCCC; + t1 &= 0x33333333; + } + + 
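+ // Note (inferred from the masks above): t0/t1 hold the packed selector bits of the two
+ // half-blocks; the complementary mask pairs keep only the bits belonging to each half before
+ // merging, with the pair selected by `rotate` (presumably the 4x2 vs 2x4 sub-block layout).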
// Flip selectors from sign bit + unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; + + return d | static_cast<uint64_t>(_bswap(t2)) << 32; +} + +static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept +{ + __m128i co = _mm_cvttps_epi32(cof); + __m128i ch = _mm_cvttps_epi32(chf); + __m128i cv = _mm_cvttps_epi32(cvf); + + __m128i coh = _mm_packus_epi32(co, ch); + __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128()); + + __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1); + __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023)); + + __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15)); + __m256i cohv3 = _mm256_srai_epi16(cohv2, 1); + + __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11)); + __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4)); + __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9)); + __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6)); + + __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7); + __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7); + __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8); + __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8); + + __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2); + __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3); + __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2); + __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3); + + __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3); + __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2); + + __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55); + + __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1)); + return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1)); +} + +struct Plane +{ + uint64_t plane; + uint64_t error; + __m256i sum4; +}; + +static etcpak_force_inline Plane Planar_AVX2(const uint8_t* src) +{ + __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3); + + __m128i rgb0 = _mm_shuffle_epi8(d0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1)); + __m128i rgb1 = _mm_shuffle_epi8(d1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1)); + __m128i rgb2 = _mm_shuffle_epi8(d2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1)); + __m128i rgb3 = _mm_shuffle_epi8(d3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1)); + + __m128i rg0 = _mm_unpacklo_epi32(rgb0, rgb1); + __m128i rg1 = _mm_unpacklo_epi32(rgb2, rgb3); + __m128i b0 = _mm_unpackhi_epi32(rgb0, rgb1); + __m128i b1 = _mm_unpackhi_epi32(rgb2, rgb3); + + // swap channels + __m128i b8 = _mm_unpacklo_epi64(rg0, rg1); + __m128i g8 = _mm_unpackhi_epi64(rg0, rg1); + __m128i r8 = _mm_unpacklo_epi64(b0, b1); + + __m128i t0 = _mm_sad_epu8(r8, _mm_setzero_si128()); + __m128i t1 = _mm_sad_epu8(g8, _mm_setzero_si128()); + __m128i t2 = _mm_sad_epu8(b8, _mm_setzero_si128()); + + __m128i r8s = _mm_shuffle_epi8(r8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0)); + __m128i g8s = _mm_shuffle_epi8(g8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0)); + __m128i b8s = _mm_shuffle_epi8(b8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0)); + + __m128i s0 = _mm_sad_epu8(r8s, _mm_setzero_si128()); + 
__m128i s1 = _mm_sad_epu8(g8s, _mm_setzero_si128()); + __m128i s2 = _mm_sad_epu8(b8s, _mm_setzero_si128()); + + __m256i sr0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t0), s0, 1); + __m256i sg0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t1), s1, 1); + __m256i sb0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t2), s2, 1); + + __m256i sr1 = _mm256_slli_epi64(sr0, 32); + __m256i sg1 = _mm256_slli_epi64(sg0, 16); + + __m256i srb = _mm256_or_si256(sr1, sb0); + __m256i srgb = _mm256_or_si256(srb, sg1); + + __m128i t3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t0), _mm_castsi128_ps(t1), _MM_SHUFFLE(2, 0, 2, 0))); + __m128i t4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i t5 = _mm_hadd_epi32(t3, t4); + __m128i t6 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i t7 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(2, 2, 2, 2)); + + __m256i sr = _mm256_broadcastw_epi16(t5); + __m256i sg = _mm256_broadcastw_epi16(t6); + __m256i sb = _mm256_broadcastw_epi16(t7); + + __m256i r08 = _mm256_cvtepu8_epi16(r8); + __m256i g08 = _mm256_cvtepu8_epi16(g8); + __m256i b08 = _mm256_cvtepu8_epi16(b8); + + __m256i r16 = _mm256_slli_epi16(r08, 4); + __m256i g16 = _mm256_slli_epi16(g08, 4); + __m256i b16 = _mm256_slli_epi16(b08, 4); + + __m256i difR0 = _mm256_sub_epi16(r16, sr); + __m256i difG0 = _mm256_sub_epi16(g16, sg); + __m256i difB0 = _mm256_sub_epi16(b16, sb); + + __m256i difRyz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255)); + __m256i difGyz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255)); + __m256i difByz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255)); + + __m256i difRxz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255)); + __m256i difGxz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255)); + __m256i difBxz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255)); + + __m256i difRGyz = _mm256_hadd_epi32(difRyz, difGyz); + __m256i difByzxz = _mm256_hadd_epi32(difByz, difBxz); + + __m256i difRGxz = _mm256_hadd_epi32(difRxz, difGxz); + + __m128i sumRGyz = _mm_add_epi32(_mm256_castsi256_si128(difRGyz), _mm256_extracti128_si256(difRGyz, 1)); + __m128i sumByzxz = _mm_add_epi32(_mm256_castsi256_si128(difByzxz), _mm256_extracti128_si256(difByzxz, 1)); + __m128i sumRGxz = _mm_add_epi32(_mm256_castsi256_si128(difRGxz), _mm256_extracti128_si256(difRGxz, 1)); + + __m128i sumRGByz = _mm_hadd_epi32(sumRGyz, sumByzxz); + __m128i sumRGByzxz = _mm_hadd_epi32(sumRGxz, sumByzxz); + + __m128i sumRGBxz = _mm_shuffle_epi32(sumRGByzxz, _MM_SHUFFLE(2, 3, 1, 0)); + + __m128 sumRGByzf = _mm_cvtepi32_ps(sumRGByz); + __m128 sumRGBxzf = _mm_cvtepi32_ps(sumRGBxz); + + const float value = (255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f; + + __m128 scale = _mm_set1_ps(-4.0f / value); + + __m128 af = _mm_mul_ps(sumRGBxzf, scale); + __m128 bf = _mm_mul_ps(sumRGByzf, scale); + + __m128 df = _mm_mul_ps(_mm_cvtepi32_ps(t5), _mm_set1_ps(4.0f / 16.0f)); + + // calculating the three colors RGBO, RGBH, and RGBV. 
RGB = df - af * x - bf * y; + __m128 cof0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df)); + __m128 chf0 = _mm_fnmadd_ps(af, _mm_set1_ps( 425.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df)); + __m128 cvf0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps( 425.0f), df)); + + // convert to r6g7b6 + __m128i cohv = r6g7b6_AVX2(cof0, chf0, cvf0); + + uint64_t rgbho = _mm_extract_epi64(cohv, 0); + uint32_t rgbv0 = _mm_extract_epi32(cohv, 2); + + // Error calculation + auto ro0 = (rgbho >> 48) & 0x3F; + auto go0 = (rgbho >> 40) & 0x7F; + auto bo0 = (rgbho >> 32) & 0x3F; + auto ro1 = (ro0 >> 4) | (ro0 << 2); + auto go1 = (go0 >> 6) | (go0 << 1); + auto bo1 = (bo0 >> 4) | (bo0 << 2); + auto ro2 = (ro1 << 2) + 2; + auto go2 = (go1 << 2) + 2; + auto bo2 = (bo1 << 2) + 2; + + __m256i ro3 = _mm256_set1_epi16(ro2); + __m256i go3 = _mm256_set1_epi16(go2); + __m256i bo3 = _mm256_set1_epi16(bo2); + + auto rh0 = (rgbho >> 16) & 0x3F; + auto gh0 = (rgbho >> 8) & 0x7F; + auto bh0 = (rgbho >> 0) & 0x3F; + auto rh1 = (rh0 >> 4) | (rh0 << 2); + auto gh1 = (gh0 >> 6) | (gh0 << 1); + auto bh1 = (bh0 >> 4) | (bh0 << 2); + + auto rh2 = rh1 - ro1; + auto gh2 = gh1 - go1; + auto bh2 = bh1 - bo1; + + __m256i rh3 = _mm256_set1_epi16(rh2); + __m256i gh3 = _mm256_set1_epi16(gh2); + __m256i bh3 = _mm256_set1_epi16(bh2); + + auto rv0 = (rgbv0 >> 16) & 0x3F; + auto gv0 = (rgbv0 >> 8) & 0x7F; + auto bv0 = (rgbv0 >> 0) & 0x3F; + auto rv1 = (rv0 >> 4) | (rv0 << 2); + auto gv1 = (gv0 >> 6) | (gv0 << 1); + auto bv1 = (bv0 >> 4) | (bv0 << 2); + + auto rv2 = rv1 - ro1; + auto gv2 = gv1 - go1; + auto bv2 = bv1 - bo1; + + __m256i rv3 = _mm256_set1_epi16(rv2); + __m256i gv3 = _mm256_set1_epi16(gv2); + __m256i bv3 = _mm256_set1_epi16(bv2); + + __m256i x = _mm256_set_epi16(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); + + __m256i rh4 = _mm256_mullo_epi16(rh3, x); + __m256i gh4 = _mm256_mullo_epi16(gh3, x); + __m256i bh4 = _mm256_mullo_epi16(bh3, x); + + __m256i y = _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + + __m256i rv4 = _mm256_mullo_epi16(rv3, y); + __m256i gv4 = _mm256_mullo_epi16(gv3, y); + __m256i bv4 = _mm256_mullo_epi16(bv3, y); + + __m256i rxy = _mm256_add_epi16(rh4, rv4); + __m256i gxy = _mm256_add_epi16(gh4, gv4); + __m256i bxy = _mm256_add_epi16(bh4, bv4); + + __m256i rp0 = _mm256_add_epi16(rxy, ro3); + __m256i gp0 = _mm256_add_epi16(gxy, go3); + __m256i bp0 = _mm256_add_epi16(bxy, bo3); + + __m256i rp1 = _mm256_srai_epi16(rp0, 2); + __m256i gp1 = _mm256_srai_epi16(gp0, 2); + __m256i bp1 = _mm256_srai_epi16(bp0, 2); + + __m256i rp2 = _mm256_max_epi16(_mm256_min_epi16(rp1, _mm256_set1_epi16(255)), _mm256_setzero_si256()); + __m256i gp2 = _mm256_max_epi16(_mm256_min_epi16(gp1, _mm256_set1_epi16(255)), _mm256_setzero_si256()); + __m256i bp2 = _mm256_max_epi16(_mm256_min_epi16(bp1, _mm256_set1_epi16(255)), _mm256_setzero_si256()); + + __m256i rdif = _mm256_sub_epi16(r08, rp2); + __m256i gdif = _mm256_sub_epi16(g08, gp2); + __m256i bdif = _mm256_sub_epi16(b08, bp2); + + __m256i rerr = _mm256_mullo_epi16(rdif, _mm256_set1_epi16(38)); + __m256i gerr = _mm256_mullo_epi16(gdif, _mm256_set1_epi16(76)); + __m256i berr = _mm256_mullo_epi16(bdif, _mm256_set1_epi16(14)); + + __m256i sum0 = _mm256_add_epi16(rerr, gerr); + __m256i sum1 = _mm256_add_epi16(sum0, berr); + + __m256i sum2 = _mm256_madd_epi16(sum1, sum1); + + __m128i sum3 = _mm_add_epi32(_mm256_castsi256_si128(sum2), _mm256_extracti128_si256(sum2, 1)); + + uint32_t err0 = 
_mm_extract_epi32(sum3, 0); + uint32_t err1 = _mm_extract_epi32(sum3, 1); + uint32_t err2 = _mm_extract_epi32(sum3, 2); + uint32_t err3 = _mm_extract_epi32(sum3, 3); + + uint64_t error = err0 + err1 + err2 + err3; + /**/ + + uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 ); + uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 ); + uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 ); + + uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19); + rgbho0 >>= 13; + uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 ); + + uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 ); + lo |= g_flags[idx]; + uint64_t result = static_cast<uint32_t>(_bswap(lo)); + result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32; + + Plane plane; + + plane.plane = result; + plane.error = error; + plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1)); + + return plane; +} + +static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept +{ + size_t tidx[2]; + + // Get index of minimum error (terr[0] and terr[1]) + __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); + __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); + + __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); + __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); + + __m256i errMin0 = _mm256_min_epu32(errLo, errHi); + + __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); + + __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); + + __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); + __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); + + __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); + __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); + + uint32_t mask0 = _mm256_movemask_epi8(errMask0); + uint32_t mask1 = _mm256_movemask_epi8(errMask1); + + tidx[0] = _bit_scan_forward(mask0) >> 2; + tidx[1] = _bit_scan_forward(mask1) >> 2; + + if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) + { + return value; + } + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + + unsigned int t0 = tsel[tidx[0]]; + unsigned int t1 = tsel[tidx[1]]; + + if (!rotate) + { + t0 &= 0xFF00FF00; + t1 &= 0x00FF00FF; + } + else + { + t0 &= 0xCCCCCCCC; + t1 &= 0x33333333; + } + + // Flip selectors from sign bit + unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; + + return d | static_cast<uint64_t>(_bswap(t2)) << 32; +} + +#endif + +static etcpak_force_inline void Average( const uint8_t* data, v4i* a ) +{ +#ifdef __SSE4_1__ + __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); + + __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); + __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); + __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); + __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128()); + __m128i d2l = 
_mm_unpacklo_epi8(d2, _mm_setzero_si128()); + __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); + __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128()); + __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128()); + + __m128i sum0 = _mm_add_epi16(d0l, d1l); + __m128i sum1 = _mm_add_epi16(d0h, d1h); + __m128i sum2 = _mm_add_epi16(d2l, d3l); + __m128i sum3 = _mm_add_epi16(d2h, d3h); + + __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); + __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); + __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); + __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); + __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); + __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); + __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); + __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); + + __m128i b0 = _mm_add_epi32(sum0l, sum0h); + __m128i b1 = _mm_add_epi32(sum1l, sum1h); + __m128i b2 = _mm_add_epi32(sum2l, sum2h); + __m128i b3 = _mm_add_epi32(sum3l, sum3h); + + __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3); + __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3); + __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3); + __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3); + + _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2)))); + _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2)))); +#elif defined __ARM_NEON + uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); + uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); + uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); + uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); + + uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; + uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; + uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; + uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; + + uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t()); + uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t()); + uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t()); + uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t()); + + uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; + uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; + uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; + uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; + + uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); + uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); + uint32x4_t b2 = 
vaddq_u32(sum2.val[0], sum2.val[1]); + uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); + + uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3); + uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3); + uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3); + uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3); + + uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 ))); + uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 ))); + + a[0] = v4i{o0[2], o0[1], o0[0], 0}; + a[1] = v4i{o0[6], o0[5], o0[4], 0}; + a[2] = v4i{o1[2], o1[1], o1[0], 0}; + a[3] = v4i{o1[6], o1[5], o1[4], 0}; +#else + uint32_t r[4]; + uint32_t g[4]; + uint32_t b[4]; + + memset(r, 0, sizeof(r)); + memset(g, 0, sizeof(g)); + memset(b, 0, sizeof(b)); + + for( int j=0; j<4; j++ ) + { + for( int i=0; i<4; i++ ) + { + int index = (j & 2) + (i >> 1); + b[index] += *data++; + g[index] += *data++; + r[index] += *data++; + data++; + } + } + + a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0}; + a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0}; + a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0}; + a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0}; +#endif +} + +static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] ) +{ +#ifdef __SSE4_1__ + __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); + + __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); + + __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128()); + __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128()); + __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128()); + __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128()); + __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128()); + __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128()); + __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128()); + __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128()); + + __m128i sum0 = _mm_add_epi16(d0l, d1l); + __m128i sum1 = _mm_add_epi16(d0h, d1h); + __m128i sum2 = _mm_add_epi16(d2l, d3l); + __m128i sum3 = _mm_add_epi16(d2h, d3h); + + __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); + __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); + __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); + __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); + __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); + __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); + __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); + __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); + + __m128i b0 = _mm_add_epi32(sum0l, sum0h); + __m128i b1 = _mm_add_epi32(sum1l, sum1h); + __m128i b2 = _mm_add_epi32(sum2l, sum2h); + 
__m128i b3 = _mm_add_epi32(sum3l, sum3h); + + __m128i a0 = _mm_add_epi32(b2, b3); + __m128i a1 = _mm_add_epi32(b0, b1); + __m128i a2 = _mm_add_epi32(b1, b3); + __m128i a3 = _mm_add_epi32(b0, b2); + + _mm_storeu_si128((__m128i*)&err[0], a0); + _mm_storeu_si128((__m128i*)&err[1], a1); + _mm_storeu_si128((__m128i*)&err[2], a2); + _mm_storeu_si128((__m128i*)&err[3], a3); +#elif defined __ARM_NEON + uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); + uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); + uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); + uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); + + uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; + uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; + uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; + uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; + + uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t()); + uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t()); + uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t()); + uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t()); + + uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; + uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; + uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; + uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; + + uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); + uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); + uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); + uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); + + uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); + uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); + uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); + uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); + + vst1q_u32(err[0], a0); + vst1q_u32(err[1], a1); + vst1q_u32(err[2], a2); + vst1q_u32(err[3], a3); +#else + unsigned int terr[4][4]; + + memset(terr, 0, 16 * sizeof(unsigned int)); + + for( int j=0; j<4; j++ ) + { + for( int i=0; i<4; i++ ) + { + int index = (j & 2) + (i >> 1); + unsigned int d = *data++; + terr[index][0] += d; + d = *data++; + terr[index][1] += d; + d = *data++; + terr[index][2] += d; + data++; + } + } + + for( int i=0; i<3; i++ ) + { + err[0][i] = terr[2][i] + terr[3][i]; + err[1][i] = terr[0][i] + terr[1][i]; + err[2][i] = terr[1][i] + terr[3][i]; + err[3][i] = terr[0][i] + terr[2][i]; + } + for( int i=0; i<4; i++ ) + { + err[i][3] = 0; + } +#endif +} + +static etcpak_force_inline unsigned int CalcError( const 
unsigned int block[4], const v4i& average ) +{ + unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow + err -= block[0] * 2 * average[2]; + err -= block[1] * 2 * average[1]; + err -= block[2] * 2 * average[0]; + err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); + return err; +} + +static etcpak_force_inline void ProcessAverages( v4i* a ) +{ +#ifdef __SSE4_1__ + for( int i=0; i<2; i++ ) + { + __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); + + __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128)); + + __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8); + + __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i diff = _mm_sub_epi16(c, c1); + diff = _mm_max_epi16(diff, _mm_set1_epi16(-4)); + diff = _mm_min_epi16(diff, _mm_set1_epi16(3)); + + __m128i co = _mm_add_epi16(c1, diff); + + c = _mm_blend_epi16(co, c, 0xF0); + + __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2)); + + _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0); + } + + for( int i=0; i<2; i++ ) + { + __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); + + __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128)); + __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8); + + __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4)); + + _mm_storeu_si128((__m128i*)a[i*2].data(), t2); + } +#elif defined __ARM_NEON + for( int i=0; i<2; i++ ) + { + int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); + int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128)); + int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8); + + int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c)); + int16x8_t diff = vsubq_s16(c, c1); + diff = vmaxq_s16(diff, vdupq_n_s16(-4)); + diff = vminq_s16(diff, vdupq_n_s16(3)); + + int16x8_t co = vaddq_s16(c1, diff); + + c = vcombine_s16(vget_low_s16(co), vget_high_s16(c)); + + int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2)); + + vst1q_s16((int16_t*)&a[4+i*2], a0); + } + + for( int i=0; i<2; i++ ) + { + int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); + + int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128)); + int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8); + + int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4)); + + vst1q_s16((int16_t*)&a[i*2], t2); + } +#else + for( int i=0; i<2; i++ ) + { + for( int j=0; j<3; j++ ) + { + int32_t c1 = mul8bit( a[i*2+1][j], 31 ); + int32_t c2 = mul8bit( a[i*2][j], 31 ); + + int32_t diff = c2 - c1; + if( diff > 3 ) diff = 3; + else if( diff < -4 ) diff = -4; + + int32_t co = c1 + diff; + + a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 ); + a[4+i*2][j] = ( co << 3 ) | ( co >> 2 ); + } + } + + for( int i=0; i<4; i++ ) + { + a[i][0] = g_avg2[mul8bit( a[i][0], 15 )]; + a[i][1] = g_avg2[mul8bit( a[i][1], 15 )]; + a[i][2] = g_avg2[mul8bit( a[i][2], 15 )]; + } +#endif +} + +static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx ) +{ + auto d = _d; + d |= ( idx << 24 ); + size_t base = idx << 1; + + if( ( idx & 0x2 ) == 0 ) + { + for( int i=0; i<3; i++ ) + { + d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 ); + d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 ); + } + } + else + { + for( int i=0; i<3; i++ ) + { + d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 ); + int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3; + c &= ~0xFFFFFFF8; + d |= ((uint64_t)c) << ( i*8 ); + } + } + 
_d = d; +} + +static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src ) +{ +#ifdef __SSE4_1__ + __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3); + + __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128i c0 = _mm_cmpeq_epi8(d0, c); + __m128i c1 = _mm_cmpeq_epi8(d1, c); + __m128i c2 = _mm_cmpeq_epi8(d2, c); + __m128i c3 = _mm_cmpeq_epi8(d3, c); + + __m128i m0 = _mm_and_si128(c0, c1); + __m128i m1 = _mm_and_si128(c2, c3); + __m128i m = _mm_and_si128(m0, m1); + + if (!_mm_testc_si128(m, _mm_set1_epi32(-1))) + { + return 0; + } +#elif defined __ARM_NEON + int32x4_t d0 = vld1q_s32((int32_t*)src + 0); + int32x4_t d1 = vld1q_s32((int32_t*)src + 4); + int32x4_t d2 = vld1q_s32((int32_t*)src + 8); + int32x4_t d3 = vld1q_s32((int32_t*)src + 12); + + int32x4_t c = vdupq_n_s32(d0[0]); + + int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c)); + int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c)); + int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c)); + int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c)); + + int32x4_t m0 = vandq_s32(c0, c1); + int32x4_t m1 = vandq_s32(c2, c3); + int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1)); + + if (m[0] != -1 || m[1] != -1) + { + return 0; + } +#else + const uint8_t* ptr = src + 4; + for( int i=1; i<16; i++ ) + { + if( memcmp( src, ptr, 4 ) != 0 ) + { + return 0; + } + ptr += 4; + } +#endif + return 0x02000000 | + ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | + ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | + ( (unsigned int)( src[2] & 0xF8 ) ); +} + +static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] ) +{ + Average( src, a ); + ProcessAverages( a ); + + unsigned int errblock[4][4]; + CalcErrorBlock( src, errblock ); + + for( int i=0; i<4; i++ ) + { + err[i/2] += CalcError( errblock[i], a[i] ); + err[2+i/2] += CalcError( errblock[i], a[i+4] ); + } +} + +static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) +{ + for( size_t i=0; i<16; i++ ) + { + uint16_t* sel = tsel[i]; + unsigned int bid = id[i]; + uint64_t* ter = terr[bid%2]; + + uint8_t b = *data++; + uint8_t g = *data++; + uint8_t r = *data++; + data++; + + int dr = a[bid][0] - r; + int dg = a[bid][1] - g; + int db = a[bid][2] - b; + +#ifdef __SSE4_1__ + // Reference implementation + + __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28); + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. 
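+ // Sketch of what this vector code computes (it matches the scalar fallback further below):
+ //   pix    = dr*77 + dg*151 + db*28     (roughly Rec.601 luma weights scaled to sum to 256)
+ //   err[t] += ( min_j | g_table256[t][j] + pix | )^2   for each of the 8 candidate tables t
+ //   sel    = index j of the winning table entry, stored per pixel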
+ __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0])); + __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1])); + __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0])); + __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1])); + + __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); + __m128i minError0 = _mm_min_epi32(error0, error1); + + __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); + __m128i minError1 = _mm_min_epi32(error2, error3); + + __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); + __m128i minError = _mm_min_epi32(minError0, minError1); + + // Squaring the minimum error to produce correct values when adding + __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); + squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); + _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); + __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); + __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); + squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); + _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2])); + error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3])); + error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2])); + error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3])); + + index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); + minError0 = _mm_min_epi32(error0, error1); + + index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); + minError1 = _mm_min_epi32(error2, error3); + + __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); + minError = _mm_min_epi32(minError0, minError1); + + // Squaring the minimum error to produce correct values when adding + minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); + squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); + squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2)); + _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow); + minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); + squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); + squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3)); + _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh); + __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1); + _mm_storeu_si128((__m128i*)sel, minIndex); +#elif defined __ARM_NEON + int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. 
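+ // NEON mirror of the SSE reference path above: the same min-abs selection against the +/- table
+ // halves; the 64-bit squared-error accumulators are assembled from the low half (vmulq_u32) and
+ // the high half (vqdmulhq_s32, a doubling high-half multiply shifted back by 1) of each product.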
+ uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0]))); + uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1]))); + uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0]))); + uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1]))); + + uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); + uint32x4_t minError0 = vminq_u32(error0, error1); + + uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2)))); + uint32x4_t minError1 = vminq_u32(error2, error3); + + uint32x4_t blendMask = vcltq_u32(minError1, minError0); + uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); + uint32x4_t minError = vminq_u32(minError0, minError1); + + // Squaring the minimum error to produce correct values when adding + uint32x4_t squareErrorLow = vmulq_u32(minError, minError); + uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1); + uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); + uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) }; + squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0)); + squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2)); + vst1q_u64(ter + 0, squareError.val[0]); + vst1q_u64(ter + 2, squareError.val[1]); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2]))); + error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3]))); + error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2]))); + error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3]))); + + index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); + minError0 = vminq_u32(error0, error1); + + index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) ); + minError1 = vminq_u32(error2, error3); + + blendMask = vcltq_u32(minError1, minError0); + uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); + minError = vminq_u32(minError0, minError1); + + // Squaring the minimum error to produce correct values when adding + squareErrorLow = vmulq_u32(minError, minError); + squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 ); + squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); + squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4)); + squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6)); + vst1q_u64(ter + 4, squareError.val[0]); + vst1q_u64(ter + 6, squareError.val[1]); + + uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1)); + vst1q_u16(sel, minIndex); +#else + int pix = dr * 77 + dg * 151 + db * 28; + + for( int t=0; t<8; t++ ) + { + const int64_t* tab = g_table256[t]; + unsigned int idx = 0; + uint64_t err = sq( tab[0] + pix ); + for( int j=1; j<4; j++ ) + { + uint64_t local = sq( tab[j] + pix ); + if( local < err ) + { + err = local; + idx = j; + } + } + *sel++ = idx; 
+ *ter++ += err; + } +#endif + } +} + +#if defined __SSE4_1__ || defined __ARM_NEON +// Non-reference implementation, but faster. Produces same results as the AVX2 version +static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) +{ + for( size_t i=0; i<16; i++ ) + { + uint16_t* sel = tsel[i]; + unsigned int bid = id[i]; + uint32_t* ter = terr[bid%2]; + + uint8_t b = *data++; + uint8_t g = *data++; + uint8_t r = *data++; + data++; + + int dr = a[bid][0] - r; + int dg = a[bid][1] - g; + int db = a[bid][2] - b; + +#ifdef __SSE4_1__ + // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 + // This produces slightly different results, but is significant faster + __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14); + __m128i pix = _mm_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. + __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0])); + __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1])); + + __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1)); + __m128i minError = _mm_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + // This produces slightly different results, but is needed to produce same results as AVX2 implementation + __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1)); + __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit)); + + // Squaring the minimum error to produce correct values when adding + __m128i squareErrorLo = _mm_mullo_epi16(minError, minError); + __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError); + + __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi); + __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi); + + squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); + _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); + squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); + _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); + + _mm_storeu_si128((__m128i*)sel, minIndex); +#elif defined __ARM_NEON + int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 ); + int16x8_t pix = vabsq_s16( pixel ); + + int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) ); + int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) ); + + int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) ); + int16x8_t minError = vminq_s16( error0, error1 ); + + int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) ); + int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) ); + + int16x4_t minErrorLow = vget_low_s16( minError ); + int16x4_t minErrorHigh = vget_high_s16( minError ); + + int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow ); + int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh ); + + int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) ); + int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) ); + + vst1q_s32( (int32_t*)ter, squareErrorSumLow ); + 
vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh ); + + vst1q_s16( (int16_t*)sel, minIndex ); +#endif + } +} +#endif + +static etcpak_force_inline uint8_t convert6(float f) +{ + int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; + return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3; +} + +static etcpak_force_inline uint8_t convert7(float f) +{ + int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; + return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2; +} + +static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar(const uint8_t* src) +{ + int32_t r = 0; + int32_t g = 0; + int32_t b = 0; + + for (int i = 0; i < 16; ++i) + { + b += src[i * 4 + 0]; + g += src[i * 4 + 1]; + r += src[i * 4 + 2]; + } + + int32_t difRyz = 0; + int32_t difGyz = 0; + int32_t difByz = 0; + int32_t difRxz = 0; + int32_t difGxz = 0; + int32_t difBxz = 0; + + const int32_t scaling[] = { -255, -85, 85, 255 }; + + for (int i = 0; i < 16; ++i) + { + int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b; + int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g; + int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r; + + difRyz += difR * scaling[i % 4]; + difGyz += difG * scaling[i % 4]; + difByz += difB * scaling[i % 4]; + + difRxz += difR * scaling[i / 4]; + difGxz += difG * scaling[i / 4]; + difBxz += difB * scaling[i / 4]; + } + + const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f); + + float aR = difRxz * scale; + float aG = difGxz * scale; + float aB = difBxz * scale; + + float bR = difRyz * scale; + float bG = difGyz * scale; + float bB = difByz * scale; + + float dR = r * (4.0f / 16.0f); + float dG = g * (4.0f / 16.0f); + float dB = b * (4.0f / 16.0f); + + // calculating the three colors RGBO, RGBH, and RGBV. 
RGB = df - af * x - bf * y; + float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR)); + float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG)); + float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB)); + float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR)); + float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG)); + float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB)); + float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR)); + float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG)); + float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB)); + + // convert to r6g7b6 + int32_t coR = convert6(cofR); + int32_t coG = convert7(cofG); + int32_t coB = convert6(cofB); + int32_t chR = convert6(chfR); + int32_t chG = convert7(chfG); + int32_t chB = convert6(chfB); + int32_t cvR = convert6(cvfR); + int32_t cvG = convert7(cvfG); + int32_t cvB = convert6(cvfB); + + // Error calculation + auto ro0 = coR; + auto go0 = coG; + auto bo0 = coB; + auto ro1 = (ro0 >> 4) | (ro0 << 2); + auto go1 = (go0 >> 6) | (go0 << 1); + auto bo1 = (bo0 >> 4) | (bo0 << 2); + auto ro2 = (ro1 << 2) + 2; + auto go2 = (go1 << 2) + 2; + auto bo2 = (bo1 << 2) + 2; + + auto rh0 = chR; + auto gh0 = chG; + auto bh0 = chB; + auto rh1 = (rh0 >> 4) | (rh0 << 2); + auto gh1 = (gh0 >> 6) | (gh0 << 1); + auto bh1 = (bh0 >> 4) | (bh0 << 2); + + auto rh2 = rh1 - ro1; + auto gh2 = gh1 - go1; + auto bh2 = bh1 - bo1; + + auto rv0 = cvR; + auto gv0 = cvG; + auto bv0 = cvB; + auto rv1 = (rv0 >> 4) | (rv0 << 2); + auto gv1 = (gv0 >> 6) | (gv0 << 1); + auto bv1 = (bv0 >> 4) | (bv0 << 2); + + auto rv2 = rv1 - ro1; + auto gv2 = gv1 - go1; + auto bv2 = bv1 - bo1; + + uint64_t error = 0; + + for (int i = 0; i < 16; ++i) + { + int32_t cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2); + int32_t cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2); + int32_t cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2); + + int32_t difB = static_cast<int>(src[i * 4 + 0]) - cB; + int32_t difG = static_cast<int>(src[i * 4 + 1]) - cG; + int32_t difR = static_cast<int>(src[i * 4 + 2]) - cR; + + int32_t dif = difR * 38 + difG * 76 + difB * 14; + + error += dif * dif; + } + + /**/ + uint32_t rgbv = cvB | (cvG << 6) | (cvR << 13); + uint32_t rgbh = chB | (chG << 6) | (chR << 13); + uint32_t hi = rgbv | ((rgbh & 0x1FFF) << 19); + uint32_t lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C); + lo |= ((coB & 0x07) << 7) | ((coB & 0x18) << 8) | ((coB & 0x20) << 11); + lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18); + lo |= coR << 25; + + const auto idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1); + + lo |= g_flags[idx]; + + uint64_t result = static_cast<uint32_t>(_bswap(lo)); + result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32; + + return std::make_pair(result, error); +} + +#ifdef __ARM_NEON + +static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi ) +{ + int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 ); + int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 ); + int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 ); + int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 ); + int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); + +#ifndef __aarch64__ + int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); + return vpadd_s32( dif5, dif5 ); +#else + return vdup_n_s32( vaddvq_s32( dif4 ) ); +#endif +} + +static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( 
int16x8_t dif_lo, int16x8_t dif_hi ) +{ + int16x4_t scaling = { -255, -85, 85, 255 }; + int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling ); + int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling ); + int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling ); + int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling ); + int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); + +#ifndef __aarch64__ + int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); + return vpadd_s32( dif5, dif5 ); +#else + return vdup_n_s32( vaddvq_s32( dif4 ) ); +#endif +} + +static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src ) +{ + uint16x8_t accu8 = vpaddlq_u8( src ); +#ifndef __aarch64__ + uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) ); + uint16x4_t accu2 = vpadd_u16( accu4, accu4 ); + uint16x4_t accu1 = vpadd_u16( accu2, accu2 ); + return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) ); +#else + return vdupq_n_s16( vaddvq_u16( accu8 ) ); +#endif +} + +static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi ) +{ + uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) ); + int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023 + i = vhsubq_s16( i, vdupq_n_s16( 15 ) ); + + int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) ); + int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) ); + + return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 ); +} + +static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x ) +{ + int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023 + i = vhsub_s16( i, vdup_n_s16( 15 ) ); + + int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) ); + int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) ); + return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 ); +} + +static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src ) +{ + uint8x16x4_t srcBlock = vld4q_u8( src ); + + int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] ); + int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] ); + int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] ); + + int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide ); + int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide ); + + int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); + int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); + + int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide ); + int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide ); + + int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) ); + int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] ); + int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) ); + int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], 
dif_yz_z.val[1] ); + + const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f ); + float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale ); + float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale ); + int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0]; + float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f); + + float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f ); + float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f ); + float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f ); + + int32x4_t coi = vcvtq_s32_f32( cof ); + int32x4_t chi = vcvtq_s32_f32( chf ); + int32x4_t cvi = vcvtq_s32_f32( cvf ); + + int32x4x2_t tr_hv = vtrnq_s32( chi, cvi ); + int32x4x2_t tr_o = vtrnq_s32( coi, coi ); + + int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] ); + int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) ); + int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) ); + int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) ); + + int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 ); + + rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) ); + int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 ); + int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 ); + int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 ); + + int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) ); + int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) ); + + int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 }; + int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 }; + int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 }; + + int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 ); + int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) ); + int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) ); + + int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 ); + int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) ); + int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) ); + + int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 ); + int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) ); + int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) ); + + int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo ); + int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi ); + + int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo ); + int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi ); + + int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo ); + int16x8_t 
dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi ); + + int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 ); + int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 ); + + int16x4_t tmpDif = vget_low_s16( dif_lo ); + int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif ); + tmpDif = vget_high_s16( dif_lo ); + int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif ); + tmpDif = vget_low_s16( dif_hi ); + int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif ); + tmpDif = vget_high_s16( dif_hi ); + int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif ); + + uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) ); + uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3) ); + + uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) ); + uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) ); + + uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 ); + +#ifdef __aarch64__ + uint64_t error = vaddvq_u64( difsq_9 ); +#else + uint64_t error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 ); +#endif + + int32_t coR = c_hvoo_br_6[6]; + int32_t coG = c_hvox_g_7[2]; + int32_t coB = c_hvoo_br_6[4]; + + int32_t chR = c_hvoo_br_6[2]; + int32_t chG = c_hvox_g_7[0]; + int32_t chB = c_hvoo_br_6[0]; + + int32_t cvR = c_hvoo_br_6[3]; + int32_t cvG = c_hvox_g_7[1]; + int32_t cvB = c_hvoo_br_6[1]; + + uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); + uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); + uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); + uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); + lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); + lo |= ( ( coG & 0x3F) << 17) | ( (coG & 0x40 ) << 18 ); + lo |= coR << 25; + + const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); + + lo |= g_flags[idx]; + + uint64_t result = static_cast<uint32_t>( _bswap(lo) ); + result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; + + return std::make_pair( result, error ); +} + +#endif + +template<class T, class S> +static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error) +{ + size_t tidx[2]; + tidx[0] = GetLeastError( terr[0], 8 ); + tidx[1] = GetLeastError( terr[1], 8 ); + + if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) + { + return value; + } + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + for( int i=0; i<16; i++ ) + { + uint64_t t = tsel[i][tidx[id[i]%2]]; + d |= ( t & 0x1 ) << ( i + 32 ); + d |= ( t & 0x2 ) << ( i + 47 ); + } + + return FixByteOrder(d); +} + +} + +static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) +{ +#ifdef __AVX2__ + uint64_t d = CheckSolid_AVX2( src ); + if( d != 0 ) return d; + + alignas(32) v4i a[8]; + + __m128i err0 = PrepareAverages_AVX2( a, src ); + + // Get index of minimum error (err0) + __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i errMin0 = _mm_min_epu32(err0, err1); + + __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i errMin2 = _mm_min_epu32(errMin1, errMin0); + + __m128i errMask = _mm_cmpeq_epi32(errMin2, err0); + + uint32_t mask = _mm_movemask_epi8(errMask); + + uint32_t idx = _bit_scan_forward(mask) >> 2; + + d |= 
EncodeAverages_AVX2( a, idx ); + + alignas(32) uint32_t terr[2][8] = {}; + alignas(32) uint32_t tsel[8]; + + if ((idx == 0) || (idx == 2)) + { + FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); + } + else + { + FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); + } + + return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 ); +#else + uint64_t d = CheckSolid( src ); + if( d != 0 ) return d; + + v4i a[8]; + unsigned int err[4] = {}; + PrepareAverages( a, src, err ); + size_t idx = GetLeastError( err, 4 ); + EncodeAverages( d, a, idx ); + +#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION + uint32_t terr[2][8] = {}; +#else + uint64_t terr[2][8] = {}; +#endif + uint16_t tsel[16][8]; + auto id = g_id[idx]; + FindBestFit( terr, tsel, a, id, src ); + + return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) ); +#endif +} + +static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src ) +{ +#ifdef __AVX2__ + uint64_t d = CheckSolid_AVX2( src ); + if( d != 0 ) return d; + + auto plane = Planar_AVX2( src ); + + alignas(32) v4i a[8]; + + __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 ); + + // Get index of minimum error (err0) + __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i errMin0 = _mm_min_epu32(err0, err1); + + __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i errMin2 = _mm_min_epu32(errMin1, errMin0); + + __m128i errMask = _mm_cmpeq_epi32(errMin2, err0); + + uint32_t mask = _mm_movemask_epi8(errMask); + + size_t idx = _bit_scan_forward(mask) >> 2; + + d = EncodeAverages_AVX2( a, idx ); + + alignas(32) uint32_t terr[2][8] = {}; + alignas(32) uint32_t tsel[8]; + + if ((idx == 0) || (idx == 2)) + { + FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); + } + else + { + FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); + } + + return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1, plane.plane, plane.error ); +#else + uint64_t d = CheckSolid( src ); + if (d != 0) return d; + +#ifdef __ARM_NEON + auto result = Planar_NEON( src ); +#else + auto result = Planar( src ); +#endif + + v4i a[8]; + unsigned int err[4] = {}; + PrepareAverages( a, src, err ); + size_t idx = GetLeastError( err, 4 ); + EncodeAverages( d, a, idx ); + +#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION + uint32_t terr[2][8] = {}; +#else + uint64_t terr[2][8] = {}; +#endif + uint16_t tsel[16][8]; + auto id = g_id[idx]; + FindBestFit( terr, tsel, a, id, src ); + + return EncodeSelectors( d, terr, tsel, id, result.first, result.second ); +#endif +} + +#ifdef __SSE4_1__ +template<int K> +static etcpak_force_inline __m128i Widen( const __m128i src ) +{ + static_assert( K >= 0 && K <= 7, "Index out of range" ); + + __m128i tmp; + switch( K ) + { + case 0: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 1: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 2: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 3: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 4: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + case 5: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 
) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + case 6: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + case 7: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + } +} + +static etcpak_force_inline int GetMulSel( int sel ) +{ + switch( sel ) + { + case 0: + return 0; + case 1: + case 2: + case 3: + return 1; + case 4: + return 2; + case 5: + case 6: + case 7: + return 3; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + return 4; + case 14: + case 15: + return 5; + } +} + +#endif + +#ifdef __ARM_NEON + +static constexpr etcpak_force_inline int GetMulSel(int sel) +{ + return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5; +} + +static constexpr int ClampConstant( int x, int min, int max ) +{ + return x < min ? min : x > max ? max : x; +} + +template <int Index> +etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) +{ + uint8x8_t srcValWide; +#ifndef __aarch64__ + if( Index < 8 ) + srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 8 ) ); + else + srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 8 ) ); +#else + srcValWide = vdup_laneq_u8( alphaBlock, Index ); +#endif + + uint8x8_t deltaVal = vabd_u8( srcValWide, recVal ); + return vmull_u8( deltaVal, deltaVal ); +} + +etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe ) +{ +#ifndef __aarch64__ + uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) ); + tmpErr = vpmin_u16( tmpErr, tmpErr ); + return vpmin_u16( tmpErr, tmpErr )[0]; +#else + return vminvq_u16( errProbe ); +#endif +} + +template <int Index> +etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) +{ + uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock ); + uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) ); + uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) ); + idx >>= 3; + idx <<= 45 - Index * 3; + + return idx; +} + +template <int Index> +etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers ) +{ + constexpr int Lane = GetMulSel( Index ); +#ifndef __aarch64__ + if( Lane < 4 ) + return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 4 ) ); + else + return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 4 ) ); +#else + return vdupq_laneq_s16( multipliers, Lane ); +#endif +} + +#endif + +static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src ) +{ +#if defined __SSE4_1__ + // Check solid + __m128i s = _mm_loadu_si128( (__m128i*)src ); + __m128i solidCmp = _mm_set1_epi8( src[0] ); + __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp ); + if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) ) + { + return src[0]; + } + + // Calculate min, max + __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max1 = _mm_max_epu8( s, s1 ); + __m128i min1 = _mm_min_epu8( s, s1 ); + __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max2 = _mm_max_epu8( max1, smax2 ); + __m128i min2 = _mm_min_epu8( min1, smin2 ); + __m128i smax3 = 
_mm_alignr_epi8( max2, max2, 2 ); + __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 ); + __m128i max3 = _mm_max_epu8( max2, smax3 ); + __m128i min3 = _mm_min_epu8( min2, smin3 ); + __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 ); + __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 ); + __m128i max = _mm_max_epu8( max3, smax4 ); + __m128i min = _mm_min_epu8( min3, smin4 ); + __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() ); + __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() ); + + // src range, mid + __m128i srcRange = _mm_sub_epi16( max16, min16 ); + __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 ); + __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf ); + + // multiplier + __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD ); + __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) ); + + // wide source + __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) }; + + __m128i sr[16] = { + Widen<0>( s16[0] ), + Widen<1>( s16[0] ), + Widen<2>( s16[0] ), + Widen<3>( s16[0] ), + Widen<4>( s16[0] ), + Widen<5>( s16[0] ), + Widen<6>( s16[0] ), + Widen<7>( s16[0] ), + Widen<0>( s16[1] ), + Widen<1>( s16[1] ), + Widen<2>( s16[1] ), + Widen<3>( s16[1] ), + Widen<4>( s16[1] ), + Widen<5>( s16[1] ), + Widen<6>( s16[1] ), + Widen<7>( s16[1] ) + }; + +#ifdef __AVX2__ + __m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange ); + __m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid ); + + __m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX ); + __m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) ); + + __m256i modMul[8] = { + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ), + _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] 
) ) ), _mm256_setzero_si256() ), + }; + + // find selector + __m256i mulErr = _mm256_setzero_si256(); + for( int j=0; j<16; j++ ) + { + __m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] ); + __m256i err1, err2; + + err1 = _mm256_sub_epi16( s16Wide, modMul[0] ); + __m256i localErr = _mm256_mullo_epi16( err1, err1 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[1] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[2] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[3] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[4] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[5] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[6] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + err1 = _mm256_sub_epi16( s16Wide, modMul[7] ); + err2 = _mm256_mullo_epi16( err1, err1 ); + localErr = _mm256_min_epu16( localErr, err2 ); + + // note that this can overflow, but since we're looking for the smallest error, it shouldn't matter + mulErr = _mm256_adds_epu16( mulErr, localErr ); + } + uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) ); + uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) ); + int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) ); + + __m128i recVal16; + switch( sel ) + { + case 0: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ); + break; + case 1: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ); + break; + case 2: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ); + break; + case 3: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ); + break; + case 4: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ); + break; + case 5: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ); + break; + case 6: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), 
_mm_setzero_si128() ); + break; + case 7: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ); + break; + case 8: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ); + break; + case 9: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ); + break; + case 10: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ); + break; + case 11: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ); + break; + case 12: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ); + break; + case 13: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ); + break; + case 14: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ); + break; + case 15: + recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ); + break; + default: + assert( false ); + break; + } +#else + // wide multiplier + __m128i rangeMul[16] = { + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( 
srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ) + }; + + // find selector + int err = std::numeric_limits<int>::max(); + int sel; + for( int r=0; r<16; r++ ) + { + __m128i err1, err2, minerr; + __m128i recVal16 = rangeMul[r]; + int rangeErr; + + err1 = _mm_sub_epi16( sr[0], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[1], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[2], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[3], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[4], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[5], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( 
err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[6], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[7], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[8], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[9], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[10], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[11], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[12], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[13], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[14], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + err1 = _mm_sub_epi16( sr[15], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; + + if( rangeErr < err ) + { + err = rangeErr; + sel = r; + if( err == 0 ) break; + } + } + + __m128i recVal16 = rangeMul[sel]; +#endif + + // find indices + __m128i err1, err2, minerr; + uint64_t idx = 0, tmp; + + err1 = _mm_sub_epi16( sr[0], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 15*3; + + err1 = _mm_sub_epi16( sr[1], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 14*3; + + err1 = _mm_sub_epi16( sr[2], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 13*3; + + err1 = _mm_sub_epi16( sr[3], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 12*3; + + err1 = _mm_sub_epi16( sr[4], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 11*3; + + err1 = _mm_sub_epi16( sr[5], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 10*3; + + err1 = _mm_sub_epi16( sr[6], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 9*3; + + err1 = _mm_sub_epi16( sr[7], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 8*3; + + err1 = 
_mm_sub_epi16( sr[8], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 7*3; + + err1 = _mm_sub_epi16( sr[9], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 6*3; + + err1 = _mm_sub_epi16( sr[10], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 5*3; + + err1 = _mm_sub_epi16( sr[11], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 4*3; + + err1 = _mm_sub_epi16( sr[12], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 3*3; + + err1 = _mm_sub_epi16( sr[13], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 2*3; + + err1 = _mm_sub_epi16( sr[14], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 1*3; + + err1 = _mm_sub_epi16( sr[15], recVal16 ); + err2 = _mm_mullo_epi16( err1, err1 ); + minerr = _mm_minpos_epu16( err2 ); + tmp = _mm_cvtsi128_si64( minerr ); + idx |= ( tmp >> 16 ) << 0*3; + + uint16_t rm[8]; + _mm_storeu_si128( (__m128i*)rm, mul ); + uint16_t sm = _mm_cvtsi128_si64( srcMid ); + + uint64_t d = ( uint64_t( sm ) << 56 ) | + ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) | + ( uint64_t( sel ) << 48 ) | + idx; + + return _bswap64( d ); +#elif defined __ARM_NEON + + int16x8_t srcMidWide, multipliers; + int srcMid; + uint8x16_t srcAlphaBlock = vld1q_u8( src ); + { + uint8_t ref = src[0]; + uint8x16_t a0 = vdupq_n_u8( ref ); + uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 ); + int64x2_t m = vreinterpretq_s64_u8( r ); + if( m[0] == -1 && m[1] == -1 ) + return ref; + + // srcRange +#ifdef __aarch64__ + uint8_t min = vminvq_u8( srcAlphaBlock ); + uint8_t max = vmaxvq_u8( srcAlphaBlock ); + uint8_t srcRange = max - min; + multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) ); + srcMid = min + srcRange / 2; + srcMidWide = vdupq_n_s16( srcMid ); +#else + uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); + vmin = vpmin_u8( vmin, vmin ); + vmin = vpmin_u8( vmin, vmin ); + vmin = vpmin_u8( vmin, vmin ); + uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); + vmax = vpmax_u8( vmax, vmax ); + vmax = vpmax_u8( vmax, vmax ); + vmax = vpmax_u8( vmax, vmax ); + + int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) ); + multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) ); + srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16(vmovl_u8(vmin)), srcRangeWide, 1); + srcMid = vgetq_lane_s16( srcMidWide, 0 ); +#endif + } + + // calculate reconstructed values +#define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 ) + +#define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ), + uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) 
}; + + // find selector + int err = std::numeric_limits<int>::max(); + int sel = 0; + for( int r = 0; r < 16; r++ ) + { + uint8x8_t recVal = recVals[r]; + + int rangeErr = 0; +#define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) ); + EAC_APPLY_16X( EAC_ACCUMULATE_ERROR ) + + if( rangeErr < err ) + { + err = rangeErr; + sel = r; + if ( err == 0 ) break; + } + } + + // combine results + uint64_t d = ( uint64_t( srcMid ) << 56 ) | + ( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) | + ( uint64_t( sel ) << 48); + + // generate indices + uint8x8_t recVal = recVals[sel]; +#define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock ); + EAC_APPLY_16X( EAC_INSERT_INDEX ) + + return _bswap64( d ); + +#undef EAC_APPLY_16X +#undef EAC_INSERT_INDEX +#undef EAC_ACCUMULATE_ERROR +#undef EAC_RECONSTRUCT_VALUE + +#else + { + bool solid = true; + const uint8_t* ptr = src + 1; + const uint8_t ref = *src; + for( int i=1; i<16; i++ ) + { + if( ref != *ptr++ ) + { + solid = false; + break; + } + } + if( solid ) + { + return ref; + } + } + + uint8_t min = src[0]; + uint8_t max = src[0]; + for( int i=1; i<16; i++ ) + { + if( min > src[i] ) min = src[i]; + else if( max < src[i] ) max = src[i]; + } + int srcRange = max - min; + int srcMid = min + srcRange / 2; + + uint8_t buf[16][16]; + int err = std::numeric_limits<int>::max(); + int sel; + int selmul; + for( int r=0; r<16; r++ ) + { + int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1; + + int rangeErr = 0; + for( int i=0; i<16; i++ ) + { + const auto srcVal = src[i]; + + int idx = 0; + const auto modVal = g_alpha[r][0] * mul; + const auto recVal = clampu8( srcMid + modVal ); + int localErr = sq( srcVal - recVal ); + + if( localErr != 0 ) + { + for( int j=1; j<8; j++ ) + { + const auto modVal = g_alpha[r][j] * mul; + const auto recVal = clampu8( srcMid + modVal ); + const auto errProbe = sq( srcVal - recVal ); + if( errProbe < localErr ) + { + localErr = errProbe; + idx = j; + } + } + } + + buf[r][i] = idx; + rangeErr += localErr; + } + + if( rangeErr < err ) + { + err = rangeErr; + sel = r; + selmul = mul; + if( err == 0 ) break; + } + } + + uint64_t d = ( uint64_t( srcMid ) << 56 ) | + ( uint64_t( selmul ) << 52 ) | + ( uint64_t( sel ) << 48 ); + + int offset = 45; + auto ptr = buf[sel]; + for( int i=0; i<16; i++ ) + { + d |= uint64_t( *ptr++ ) << offset; + offset -= 3; + } + + return _bswap64( d ); +#endif +} + + +void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int w = 0; + uint32_t buf[4*4]; + do + { +#ifdef __SSE4_1__ + __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); + __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); + __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); + __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); + + _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); + + __m128i c0 = _mm_castps_si128( px0 ); + __m128i c1 = _mm_castps_si128( px1 ); + __m128i c2 = _mm_castps_si128( px2 ); + __m128i c3 = _mm_castps_si128( px3 ); + + __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); + __m128i p0 = _mm_shuffle_epi8( c0, mask ); + __m128i p1 = _mm_shuffle_epi8( c1, mask ); + __m128i p2 = _mm_shuffle_epi8( c2, mask ); + __m128i p3 = _mm_shuffle_epi8( c3, mask ); + + _mm_store_si128( (__m128i*)(buf + 0), p0 ); + _mm_store_si128( (__m128i*)(buf + 4), p1 ); + 
_mm_store_si128( (__m128i*)(buf + 8), p2 ); + _mm_store_si128( (__m128i*)(buf + 12), p3 ); + + src += 4; +#else + auto ptr = buf; + for( int x=0; x<4; x++ ) + { + unsigned int a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src -= width * 3 - 1; + } +#endif + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + *dst++ = ProcessRGB( (uint8_t*)buf ); + } + while( --blocks ); +} + +void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int w = 0; + uint32_t buf[4*4]; + do + { +#ifdef __SSE4_1__ + __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); + __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); + __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); + __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); + + _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); + + __m128i c0 = _mm_castps_si128( px0 ); + __m128i c1 = _mm_castps_si128( px1 ); + __m128i c2 = _mm_castps_si128( px2 ); + __m128i c3 = _mm_castps_si128( px3 ); + + __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); + __m128i p0 = _mm_shuffle_epi8( c0, mask ); + __m128i p1 = _mm_shuffle_epi8( c1, mask ); + __m128i p2 = _mm_shuffle_epi8( c2, mask ); + __m128i p3 = _mm_shuffle_epi8( c3, mask ); + + _mm_store_si128( (__m128i*)(buf + 0), p0 ); + _mm_store_si128( (__m128i*)(buf + 4), p1 ); + _mm_store_si128( (__m128i*)(buf + 8), p2 ); + _mm_store_si128( (__m128i*)(buf + 12), p3 ); + + src += 4; +#else + auto ptr = buf; + for( int x=0; x<4; x++ ) + { + unsigned int a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src -= width * 3 - 1; + } +#endif + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + *dst++ = ProcessRGB_ETC2( (uint8_t*)buf ); + } + while( --blocks ); +} + +#include <chrono> +#include <thread> + +void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int w = 0; + uint32_t buf[4*4]; + do + { +#ifdef __SSE4_1__ + __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); + __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); + __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); + __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); + + _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); + + _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); + _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); + _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); + _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); + + src += 4; +#else + auto ptr = buf; + for( int x=0; x<4; x++ ) + { + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src -= width * 3 - 1; + } +#endif + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + *dst++ = ProcessRGB( (uint8_t*)buf ); + } + while( --blocks ); +} + +void CompressEtc1RgbDither( 
const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int w = 0; + uint32_t buf[4*4]; + do + { +#ifdef __SSE4_1__ + __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); + __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); + __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); + __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); + + _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); + +# ifdef __AVX2__ + DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) ); +# else + _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); + _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); + _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); + _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); + + Dither( (uint8_t*)buf ); +# endif + + src += 4; +#else + auto ptr = buf; + for( int x=0; x<4; x++ ) + { + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src -= width * 3 - 1; + } +#endif + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + *dst++ = ProcessRGB( (uint8_t*)buf ); + } + while( --blocks ); +} + +void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int w = 0; + uint32_t buf[4*4]; + do + { +#ifdef __SSE4_1__ + __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); + __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); + __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); + __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); + + _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); + + _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); + _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); + _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); + _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); + + src += 4; +#else + auto ptr = buf; + for( int x=0; x<4; x++ ) + { + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src -= width * 3 - 1; + } +#endif + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + *dst++ = ProcessRGB_ETC2( (uint8_t*)buf ); + } + while( --blocks ); +} + +void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) +{ + int w = 0; + uint32_t rgba[4*4]; + uint8_t alpha[4*4]; + do + { +#ifdef __SSE4_1__ + __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); + __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); + __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); + __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); + + _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); + + __m128i c0 = _mm_castps_si128( px0 ); + __m128i c1 = _mm_castps_si128( px1 ); + __m128i c2 = _mm_castps_si128( px2 ); + __m128i c3 = _mm_castps_si128( px3 ); + + _mm_store_si128( (__m128i*)(rgba + 0), c0 ); + _mm_store_si128( (__m128i*)(rgba + 4), c1 ); + _mm_store_si128( (__m128i*)(rgba + 8), c2 ); + _mm_store_si128( (__m128i*)(rgba + 12), c3 ); + + __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 ); + + __m128i a0 = _mm_shuffle_epi8( c0, mask ); + __m128i a1 = 
_mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); + __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); + __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); + + __m128i s0 = _mm_or_si128( a0, a1 ); + __m128i s1 = _mm_or_si128( a2, a3 ); + __m128i s2 = _mm_or_si128( s0, s1 ); + + _mm_store_si128( (__m128i*)alpha, s2 ); + + src += 4; +#else + auto ptr = rgba; + auto ptr8 = alpha; + for( int x=0; x<4; x++ ) + { + auto v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src += width; + v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src += width; + v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src += width; + v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src -= width * 3 - 1; + } +#endif + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + *dst++ = ProcessAlpha_ETC2( alpha ); + *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba ); + } + while( --blocks ); +} diff --git a/thirdparty/etcpak/ProcessRGB.hpp b/thirdparty/etcpak/ProcessRGB.hpp new file mode 100644 index 0000000000..c5555a5bb1 --- /dev/null +++ b/thirdparty/etcpak/ProcessRGB.hpp @@ -0,0 +1,13 @@ +#ifndef __PROCESSRGB_HPP__ +#define __PROCESSRGB_HPP__ + +#include <stdint.h> + +void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); +void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ); + +#endif diff --git a/thirdparty/etcpak/Tables.cpp b/thirdparty/etcpak/Tables.cpp new file mode 100644 index 0000000000..5c7fd9cf61 --- /dev/null +++ b/thirdparty/etcpak/Tables.cpp @@ -0,0 +1,221 @@ +#include "Tables.hpp" + +const int32_t g_table[8][4] = { + { 2, 8, -2, -8 }, + { 5, 17, -5, -17 }, + { 9, 29, -9, -29 }, + { 13, 42, -13, -42 }, + { 18, 60, -18, -60 }, + { 24, 80, -24, -80 }, + { 33, 106, -33, -106 }, + { 47, 183, -47, -183 } +}; + +const int64_t g_table256[8][4] = { + { 2*256, 8*256, -2*256, -8*256 }, + { 5*256, 17*256, -5*256, -17*256 }, + { 9*256, 29*256, -9*256, -29*256 }, + { 13*256, 42*256, -13*256, -42*256 }, + { 18*256, 60*256, -18*256, -60*256 }, + { 24*256, 80*256, -24*256, -80*256 }, + { 33*256, 106*256, -33*256, -106*256 }, + { 47*256, 183*256, -47*256, -183*256 } +}; + +const uint32_t g_id[4][16] = { + { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 }, + { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 } +}; + +const uint32_t g_avg2[16] = { + 0x00, + 0x11, + 0x22, + 0x33, + 0x44, + 0x55, + 0x66, + 0x77, + 0x88, + 0x99, + 0xAA, + 0xBB, + 0xCC, + 0xDD, + 0xEE, + 0xFF +}; + +const uint32_t g_flags[64] = { + 0x80800402, 0x80800402, 0x80800402, 0x80800402, + 0x80800402, 0x80800402, 0x80800402, 0x8080E002, + 0x80800402, 0x80800402, 0x8080E002, 0x8080E002, + 0x80800402, 0x8080E002, 0x8080E002, 0x8080E002, + 0x80000402, 0x80000402, 0x80000402, 0x80000402, + 0x80000402, 0x80000402, 0x80000402, 0x8000E002, + 0x80000402, 0x80000402, 0x8000E002, 0x8000E002, + 0x80000402, 0x8000E002, 0x8000E002, 0x8000E002, + 0x00800402, 0x00800402, 0x00800402, 0x00800402, + 0x00800402, 0x00800402, 
0x00800402, 0x0080E002, + 0x00800402, 0x00800402, 0x0080E002, 0x0080E002, + 0x00800402, 0x0080E002, 0x0080E002, 0x0080E002, + 0x00000402, 0x00000402, 0x00000402, 0x00000402, + 0x00000402, 0x00000402, 0x00000402, 0x0000E002, + 0x00000402, 0x00000402, 0x0000E002, 0x0000E002, + 0x00000402, 0x0000E002, 0x0000E002, 0x0000E002 +}; + +const int32_t g_alpha[16][8] = { + { -3, -6, -9, -15, 2, 5, 8, 14 }, + { -3, -7, -10, -13, 2, 6, 9, 12 }, + { -2, -5, -8, -13, 1, 4, 7, 12 }, + { -2, -4, -6, -13, 1, 3, 5, 12 }, + { -3, -6, -8, -12, 2, 5, 7, 11 }, + { -3, -7, -9, -11, 2, 6, 8, 10 }, + { -4, -7, -8, -11, 3, 6, 7, 10 }, + { -3, -5, -8, -11, 2, 4, 7, 10 }, + { -2, -6, -8, -10, 1, 5, 7, 9 }, + { -2, -5, -8, -10, 1, 4, 7, 9 }, + { -2, -4, -8, -10, 1, 3, 7, 9 }, + { -2, -5, -7, -10, 1, 4, 6, 9 }, + { -3, -4, -7, -10, 2, 3, 6, 9 }, + { -1, -2, -3, -10, 0, 1, 2, 9 }, + { -4, -6, -8, -9, 3, 5, 7, 8 }, + { -3, -5, -7, -9, 2, 4, 6, 8 } +}; + +const int32_t g_alphaRange[16] = { + 0x100FF / ( 1 + g_alpha[0][7] - g_alpha[0][3] ), + 0x100FF / ( 1 + g_alpha[1][7] - g_alpha[1][3] ), + 0x100FF / ( 1 + g_alpha[2][7] - g_alpha[2][3] ), + 0x100FF / ( 1 + g_alpha[3][7] - g_alpha[3][3] ), + 0x100FF / ( 1 + g_alpha[4][7] - g_alpha[4][3] ), + 0x100FF / ( 1 + g_alpha[5][7] - g_alpha[5][3] ), + 0x100FF / ( 1 + g_alpha[6][7] - g_alpha[6][3] ), + 0x100FF / ( 1 + g_alpha[7][7] - g_alpha[7][3] ), + 0x100FF / ( 1 + g_alpha[8][7] - g_alpha[8][3] ), + 0x100FF / ( 1 + g_alpha[9][7] - g_alpha[9][3] ), + 0x100FF / ( 1 + g_alpha[10][7] - g_alpha[10][3] ), + 0x100FF / ( 1 + g_alpha[11][7] - g_alpha[11][3] ), + 0x100FF / ( 1 + g_alpha[12][7] - g_alpha[12][3] ), + 0x100FF / ( 1 + g_alpha[13][7] - g_alpha[13][3] ), + 0x100FF / ( 1 + g_alpha[14][7] - g_alpha[14][3] ), + 0x100FF / ( 1 + g_alpha[15][7] - g_alpha[15][3] ), +}; + +#ifdef __SSE4_1__ +const __m128i g_table_SIMD[2] = +{ + _mm_setr_epi16( 2, 5, 9, 13, 18, 24, 33, 47), + _mm_setr_epi16( 8, 17, 29, 42, 60, 80, 106, 183) +}; +const __m128i g_table128_SIMD[2] = +{ + _mm_setr_epi16( 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128), + _mm_setr_epi16( 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128) +}; +const __m128i g_table256_SIMD[4] = +{ + _mm_setr_epi32( 2*256, 5*256, 9*256, 13*256), + _mm_setr_epi32( 8*256, 17*256, 29*256, 42*256), + _mm_setr_epi32( 18*256, 24*256, 33*256, 47*256), + _mm_setr_epi32( 60*256, 80*256, 106*256, 183*256) +}; + +const __m128i g_alpha_SIMD[16] = { + _mm_setr_epi16( g_alpha[ 0][0], g_alpha[ 0][1], g_alpha[ 0][2], g_alpha[ 0][3], g_alpha[ 0][4], g_alpha[ 0][5], g_alpha[ 0][6], g_alpha[ 0][7] ), + _mm_setr_epi16( g_alpha[ 1][0], g_alpha[ 1][1], g_alpha[ 1][2], g_alpha[ 1][3], g_alpha[ 1][4], g_alpha[ 1][5], g_alpha[ 1][6], g_alpha[ 1][7] ), + _mm_setr_epi16( g_alpha[ 2][0], g_alpha[ 2][1], g_alpha[ 2][2], g_alpha[ 2][3], g_alpha[ 2][4], g_alpha[ 2][5], g_alpha[ 2][6], g_alpha[ 2][7] ), + _mm_setr_epi16( g_alpha[ 3][0], g_alpha[ 3][1], g_alpha[ 3][2], g_alpha[ 3][3], g_alpha[ 3][4], g_alpha[ 3][5], g_alpha[ 3][6], g_alpha[ 3][7] ), + _mm_setr_epi16( g_alpha[ 4][0], g_alpha[ 4][1], g_alpha[ 4][2], g_alpha[ 4][3], g_alpha[ 4][4], g_alpha[ 4][5], g_alpha[ 4][6], g_alpha[ 4][7] ), + _mm_setr_epi16( g_alpha[ 5][0], g_alpha[ 5][1], g_alpha[ 5][2], g_alpha[ 5][3], g_alpha[ 5][4], g_alpha[ 5][5], g_alpha[ 5][6], g_alpha[ 5][7] ), + _mm_setr_epi16( g_alpha[ 6][0], g_alpha[ 6][1], g_alpha[ 6][2], g_alpha[ 6][3], g_alpha[ 6][4], g_alpha[ 6][5], g_alpha[ 6][6], g_alpha[ 6][7] ), + _mm_setr_epi16( g_alpha[ 7][0], g_alpha[ 7][1], g_alpha[ 7][2], 
g_alpha[ 7][3], g_alpha[ 7][4], g_alpha[ 7][5], g_alpha[ 7][6], g_alpha[ 7][7] ), + _mm_setr_epi16( g_alpha[ 8][0], g_alpha[ 8][1], g_alpha[ 8][2], g_alpha[ 8][3], g_alpha[ 8][4], g_alpha[ 8][5], g_alpha[ 8][6], g_alpha[ 8][7] ), + _mm_setr_epi16( g_alpha[ 9][0], g_alpha[ 9][1], g_alpha[ 9][2], g_alpha[ 9][3], g_alpha[ 9][4], g_alpha[ 9][5], g_alpha[ 9][6], g_alpha[ 9][7] ), + _mm_setr_epi16( g_alpha[10][0], g_alpha[10][1], g_alpha[10][2], g_alpha[10][3], g_alpha[10][4], g_alpha[10][5], g_alpha[10][6], g_alpha[10][7] ), + _mm_setr_epi16( g_alpha[11][0], g_alpha[11][1], g_alpha[11][2], g_alpha[11][3], g_alpha[11][4], g_alpha[11][5], g_alpha[11][6], g_alpha[11][7] ), + _mm_setr_epi16( g_alpha[12][0], g_alpha[12][1], g_alpha[12][2], g_alpha[12][3], g_alpha[12][4], g_alpha[12][5], g_alpha[12][6], g_alpha[12][7] ), + _mm_setr_epi16( g_alpha[13][0], g_alpha[13][1], g_alpha[13][2], g_alpha[13][3], g_alpha[13][4], g_alpha[13][5], g_alpha[13][6], g_alpha[13][7] ), + _mm_setr_epi16( g_alpha[14][0], g_alpha[14][1], g_alpha[14][2], g_alpha[14][3], g_alpha[14][4], g_alpha[14][5], g_alpha[14][6], g_alpha[14][7] ), + _mm_setr_epi16( g_alpha[15][0], g_alpha[15][1], g_alpha[15][2], g_alpha[15][3], g_alpha[15][4], g_alpha[15][5], g_alpha[15][6], g_alpha[15][7] ), +}; + +const __m128i g_alphaRange_SIMD = _mm_setr_epi16( + g_alphaRange[0], + g_alphaRange[1], + g_alphaRange[4], + g_alphaRange[5], + g_alphaRange[8], + g_alphaRange[14], + 0, + 0 ); +#endif + +#ifdef __AVX2__ +const __m256i g_alpha_AVX[8] = { + _mm256_setr_epi16( g_alpha[ 0][0], g_alpha[ 1][0], g_alpha[ 2][0], g_alpha[ 3][0], g_alpha[ 4][0], g_alpha[ 5][0], g_alpha[ 6][0], g_alpha[ 7][0], g_alpha[ 8][0], g_alpha[ 9][0], g_alpha[10][0], g_alpha[11][0], g_alpha[12][0], g_alpha[13][0], g_alpha[14][0], g_alpha[15][0] ), + _mm256_setr_epi16( g_alpha[ 0][1], g_alpha[ 1][1], g_alpha[ 2][1], g_alpha[ 3][1], g_alpha[ 4][1], g_alpha[ 5][1], g_alpha[ 6][1], g_alpha[ 7][1], g_alpha[ 8][1], g_alpha[ 9][1], g_alpha[10][1], g_alpha[11][1], g_alpha[12][1], g_alpha[13][1], g_alpha[14][1], g_alpha[15][1] ), + _mm256_setr_epi16( g_alpha[ 0][2], g_alpha[ 1][2], g_alpha[ 2][2], g_alpha[ 3][2], g_alpha[ 4][2], g_alpha[ 5][2], g_alpha[ 6][2], g_alpha[ 7][2], g_alpha[ 8][2], g_alpha[ 9][2], g_alpha[10][2], g_alpha[11][2], g_alpha[12][2], g_alpha[13][2], g_alpha[14][2], g_alpha[15][2] ), + _mm256_setr_epi16( g_alpha[ 0][3], g_alpha[ 1][3], g_alpha[ 2][3], g_alpha[ 3][3], g_alpha[ 4][3], g_alpha[ 5][3], g_alpha[ 6][3], g_alpha[ 7][3], g_alpha[ 8][3], g_alpha[ 9][3], g_alpha[10][3], g_alpha[11][3], g_alpha[12][3], g_alpha[13][3], g_alpha[14][3], g_alpha[15][3] ), + _mm256_setr_epi16( g_alpha[ 0][4], g_alpha[ 1][4], g_alpha[ 2][4], g_alpha[ 3][4], g_alpha[ 4][4], g_alpha[ 5][4], g_alpha[ 6][4], g_alpha[ 7][4], g_alpha[ 8][4], g_alpha[ 9][4], g_alpha[10][4], g_alpha[11][4], g_alpha[12][4], g_alpha[13][4], g_alpha[14][4], g_alpha[15][4] ), + _mm256_setr_epi16( g_alpha[ 0][5], g_alpha[ 1][5], g_alpha[ 2][5], g_alpha[ 3][5], g_alpha[ 4][5], g_alpha[ 5][5], g_alpha[ 6][5], g_alpha[ 7][5], g_alpha[ 8][5], g_alpha[ 9][5], g_alpha[10][5], g_alpha[11][5], g_alpha[12][5], g_alpha[13][5], g_alpha[14][5], g_alpha[15][5] ), + _mm256_setr_epi16( g_alpha[ 0][6], g_alpha[ 1][6], g_alpha[ 2][6], g_alpha[ 3][6], g_alpha[ 4][6], g_alpha[ 5][6], g_alpha[ 6][6], g_alpha[ 7][6], g_alpha[ 8][6], g_alpha[ 9][6], g_alpha[10][6], g_alpha[11][6], g_alpha[12][6], g_alpha[13][6], g_alpha[14][6], g_alpha[15][6] ), + _mm256_setr_epi16( g_alpha[ 0][7], g_alpha[ 1][7], g_alpha[ 2][7], g_alpha[ 3][7], 
g_alpha[ 4][7], g_alpha[ 5][7], g_alpha[ 6][7], g_alpha[ 7][7], g_alpha[ 8][7], g_alpha[ 9][7], g_alpha[10][7], g_alpha[11][7], g_alpha[12][7], g_alpha[13][7], g_alpha[14][7], g_alpha[15][7] ), +}; + +const __m256i g_alphaRange_AVX = _mm256_setr_epi16( + g_alphaRange[ 0], g_alphaRange[ 1], g_alphaRange[ 2], g_alphaRange[ 3], g_alphaRange[ 4], g_alphaRange[ 5], g_alphaRange[ 6], g_alphaRange[ 7], + g_alphaRange[ 8], g_alphaRange[ 9], g_alphaRange[10], g_alphaRange[11], g_alphaRange[12], g_alphaRange[13], g_alphaRange[14], g_alphaRange[15] +); +#endif + +#ifdef __ARM_NEON +const int16x8_t g_table128_NEON[2] = +{ + { 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128 }, + { 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128 } +}; + +const int32x4_t g_table256_NEON[4] = +{ + { 2*256, 5*256, 9*256, 13*256 }, + { 8*256, 17*256, 29*256, 42*256 }, + { 18*256, 24*256, 33*256, 47*256 }, + { 60*256, 80*256, 106*256, 183*256 } +}; + +const int16x8_t g_alpha_NEON[16] = +{ + { -3, -6, -9, -15, 2, 5, 8, 14 }, + { -3, -7, -10, -13, 2, 6, 9, 12 }, + { -2, -5, -8, -13, 1, 4, 7, 12 }, + { -2, -4, -6, -13, 1, 3, 5, 12 }, + { -3, -6, -8, -12, 2, 5, 7, 11 }, + { -3, -7, -9, -11, 2, 6, 8, 10 }, + { -4, -7, -8, -11, 3, 6, 7, 10 }, + { -3, -5, -8, -11, 2, 4, 7, 10 }, + { -2, -6, -8, -10, 1, 5, 7, 9 }, + { -2, -5, -8, -10, 1, 4, 7, 9 }, + { -2, -4, -8, -10, 1, 3, 7, 9 }, + { -2, -5, -7, -10, 1, 4, 6, 9 }, + { -3, -4, -7, -10, 2, 3, 6, 9 }, + { -1, -2, -3, -10, 0, 1, 2, 9 }, + { -4, -6, -8, -9, 3, 5, 7, 8 }, + { -3, -5, -7, -9, 2, 4, 6, 8 } +}; + +const int16x8_t g_alphaRange_NEON = +{ + (int16_t)g_alphaRange[0], + (int16_t)g_alphaRange[1], + (int16_t)g_alphaRange[4], + (int16_t)g_alphaRange[5], + (int16_t)g_alphaRange[8], + (int16_t)g_alphaRange[14], + 0, + 0 +}; +#endif diff --git a/thirdparty/etcpak/Tables.hpp b/thirdparty/etcpak/Tables.hpp new file mode 100644 index 0000000000..69d7e8aa07 --- /dev/null +++ b/thirdparty/etcpak/Tables.hpp @@ -0,0 +1,49 @@ +#ifndef __TABLES_HPP__ +#define __TABLES_HPP__ + +#include <stdint.h> + +#ifdef __AVX2__ +# include <immintrin.h> +#endif +#ifdef __SSE4_1__ +# include <smmintrin.h> +#endif +#ifdef __ARM_NEON +# include <arm_neon.h> +#endif + +extern const int32_t g_table[8][4]; +extern const int64_t g_table256[8][4]; + +extern const uint32_t g_id[4][16]; + +extern const uint32_t g_avg2[16]; + +extern const uint32_t g_flags[64]; + +extern const int32_t g_alpha[16][8]; +extern const int32_t g_alphaRange[16]; + +#ifdef __SSE4_1__ +extern const __m128i g_table_SIMD[2]; +extern const __m128i g_table128_SIMD[2]; +extern const __m128i g_table256_SIMD[4]; + +extern const __m128i g_alpha_SIMD[16]; +extern const __m128i g_alphaRange_SIMD; +#endif + +#ifdef __AVX2__ +extern const __m256i g_alpha_AVX[8]; +extern const __m256i g_alphaRange_AVX; +#endif + +#ifdef __ARM_NEON +extern const int16x8_t g_table128_NEON[2]; +extern const int32x4_t g_table256_NEON[4]; +extern const int16x8_t g_alpha_NEON[16]; +extern const int16x8_t g_alphaRange_NEON; +#endif + +#endif diff --git a/thirdparty/etcpak/Vector.hpp b/thirdparty/etcpak/Vector.hpp new file mode 100644 index 0000000000..3370a88aea --- /dev/null +++ b/thirdparty/etcpak/Vector.hpp @@ -0,0 +1,222 @@ +#ifndef __DARKRL__VECTOR_HPP__ +#define __DARKRL__VECTOR_HPP__ + +#include <assert.h> +#include <algorithm> +#include <math.h> +#include <stdint.h> + +#include "Math.hpp" + +template<class T> +struct Vector2 +{ + Vector2() : x( 0 ), y( 0 ) {} + Vector2( T v ) : x( v ), y( v ) {} + Vector2( T _x, T _y ) : x( _x ), y( _y ) {} 
+ + bool operator==( const Vector2<T>& rhs ) const { return x == rhs.x && y == rhs.y; } + bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); } + + Vector2<T>& operator+=( const Vector2<T>& rhs ) + { + x += rhs.x; + y += rhs.y; + return *this; + } + Vector2<T>& operator-=( const Vector2<T>& rhs ) + { + x -= rhs.x; + y -= rhs.y; + return *this; + } + Vector2<T>& operator*=( const Vector2<T>& rhs ) + { + x *= rhs.x; + y *= rhs.y; + return *this; + } + + T x, y; +}; + +template<class T> +Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs ) +{ + return Vector2<T>( lhs.x + rhs.x, lhs.y + rhs.y ); +} + +template<class T> +Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs ) +{ + return Vector2<T>( lhs.x - rhs.x, lhs.y - rhs.y ); +} + +template<class T> +Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs ) +{ + return Vector2<T>( lhs.x * rhs, lhs.y * rhs ); +} + +template<class T> +Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs ) +{ + return Vector2<T>( lhs.x / rhs, lhs.y / rhs ); +} + + +typedef Vector2<int32_t> v2i; +typedef Vector2<float> v2f; + + +template<class T> +struct Vector3 +{ + Vector3() : x( 0 ), y( 0 ), z( 0 ) {} + Vector3( T v ) : x( v ), y( v ), z( v ) {} + Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {} + template<class Y> + Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {} + + T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); } + void Clamp() + { + x = std::min( T(1), std::max( T(0), x ) ); + y = std::min( T(1), std::max( T(0), y ) ); + z = std::min( T(1), std::max( T(0), z ) ); + } + + bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; } + bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); } + + T& operator[]( unsigned int idx ) { assert( idx < 3 ); return ((T*)this)[idx]; } + const T& operator[]( unsigned int idx ) const { assert( idx < 3 ); return ((T*)this)[idx]; } + + Vector3<T> operator+=( const Vector3<T>& rhs ) + { + x += rhs.x; + y += rhs.y; + z += rhs.z; + return *this; + } + + Vector3<T> operator*=( const Vector3<T>& rhs ) + { + x *= rhs.x; + y *= rhs.y; + z *= rhs.z; + return *this; + } + + Vector3<T> operator*=( const float& rhs ) + { + x *= rhs; + y *= rhs; + z *= rhs; + return *this; + } + + T x, y, z; + T padding; +}; + +template<class T> +Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs ) +{ + return Vector3<T>( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z ); +} + +template<class T> +Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs ) +{ + return Vector3<T>( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z ); +} + +template<class T> +Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs ) +{ + return Vector3<T>( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z ); +} + +template<class T> +Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs ) +{ + return Vector3<T>( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) ); +} + +template<class T> +Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs ) +{ + return Vector3<T>( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs ); +} + +template<class T> +bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs ) +{ + return lhs.Luminance() < rhs.Luminance(); +} + +typedef Vector3<int32_t> v3i; +typedef Vector3<float> v3f; +typedef Vector3<uint8_t> v3b; + + +static inline v3b v3f_to_v3b( const v3f& v ) +{ + return v3b( uint8_t( std::min( 1.f, v.x ) * 255 ), uint8_t( std::min( 
1.f, v.y ) * 255 ), uint8_t( std::min( 1.f, v.z ) * 255 ) ); +} + +template<class T> +Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount ) +{ + return v1 + ( v2 - v1 ) * amount; +} + +template<> +inline v3b Mix( const v3b& v1, const v3b& v2, float amount ) +{ + return v3b( v3f( v1 ) + ( v3f( v2 ) - v3f( v1 ) ) * amount ); +} + +template<class T> +Vector3<T> Desaturate( const Vector3<T>& v ) +{ + T l = v.Luminance(); + return Vector3<T>( l, l, l ); +} + +template<class T> +Vector3<T> Desaturate( const Vector3<T>& v, float mul ) +{ + T l = T( v.Luminance() * mul ); + return Vector3<T>( l, l, l ); +} + +template<class T> +Vector3<T> pow( const Vector3<T>& base, float exponent ) +{ + return Vector3<T>( + pow( base.x, exponent ), + pow( base.y, exponent ), + pow( base.z, exponent ) ); +} + +template<class T> +Vector3<T> sRGB2linear( const Vector3<T>& v ) +{ + return Vector3<T>( + sRGB2linear( v.x ), + sRGB2linear( v.y ), + sRGB2linear( v.z ) ); +} + +template<class T> +Vector3<T> linear2sRGB( const Vector3<T>& v ) +{ + return Vector3<T>( + linear2sRGB( v.x ), + linear2sRGB( v.y ), + linear2sRGB( v.z ) ); +} + +#endif diff --git a/thirdparty/fonts/OpenSans_SemiBold.ttf b/thirdparty/fonts/OpenSans_SemiBold.ttf Binary files differnew file mode 100644 index 0000000000..54e7059cf3 --- /dev/null +++ b/thirdparty/fonts/OpenSans_SemiBold.ttf diff --git a/thirdparty/harfbuzz/NEWS b/thirdparty/harfbuzz/NEWS index f09c2fafd1..321c550188 100644 --- a/thirdparty/harfbuzz/NEWS +++ b/thirdparty/harfbuzz/NEWS @@ -1,3 +1,18 @@ +Overview of changes leading to 2.8.0 +Tuesday, March 16, 2021 +==================================== +- Shape joining scripts other than Arabic/Syriac using the Universal Shaping Engine. + Previously these were shaped using the generalized Arabic shaper. (David Corbett) +- Fix regression in shaping of U+0B55 ORIYA SIGN OVERLINE. (David Corbett) +- Update language tags. (David Corbett) +- Variations: reduce error: do not round each interpolated delta. (Just van Rossum) +- Documentation improvements. (Khaled Hosny, Nathan Willis) +- Subsetter improvements: subsets most, if not all, lookup types now. (Garret Rieger, Qunxin Liu) +- Fuzzer-found fixes and other improvements when memory failures happen. (Behdad) +- Removed most atomic implementations now that we have C++11 atomic impl. (Behdad) +- General codebase upkeep; using more C++11 features: constexpr constructors, etc. 
(Behdad) + + Overview of changes leading to 2.7.4 Sunday, December 27, 2020 ==================================== diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-common.hh b/thirdparty/harfbuzz/src/hb-aat-layout-common.hh index 75d523f5fc..98ed20d8eb 100644 --- a/thirdparty/harfbuzz/src/hb-aat-layout-common.hh +++ b/thirdparty/harfbuzz/src/hb-aat-layout-common.hh @@ -510,7 +510,7 @@ struct StateTable const Entry<Extra> &get_entry (int state, unsigned int klass) const { if (unlikely (klass >= nClasses)) - klass = StateTable<Types, Entry<Extra>>::CLASS_OUT_OF_BOUNDS; + klass = StateTable::CLASS_OUT_OF_BOUNDS; const HBUSHORT *states = (this+stateArrayTable).arrayZ; const Entry<Extra> *entries = (this+entryTable).arrayZ; @@ -576,7 +576,7 @@ struct StateTable if (unlikely (stop > states)) return_trace (false); for (const HBUSHORT *p = states; stop < p; p--) - num_entries = hb_max (num_entries, *(p - 1) + 1); + num_entries = hb_max (num_entries, *(p - 1) + 1u); state_neg = min_state; } } @@ -597,7 +597,7 @@ struct StateTable if (unlikely (stop < states)) return_trace (false); for (const HBUSHORT *p = &states[state_pos * num_classes]; p < stop; p++) - num_entries = hb_max (num_entries, *p + 1); + num_entries = hb_max (num_entries, *p + 1u); state_pos = max_state + 1; } } @@ -729,7 +729,10 @@ struct ExtendedTypes template <typename Types, typename EntryData> struct StateTableDriver { - StateTableDriver (const StateTable<Types, EntryData> &machine_, + using StateTableT = StateTable<Types, EntryData>; + using EntryT = Entry<EntryData>; + + StateTableDriver (const StateTableT &machine_, hb_buffer_t *buffer_, hb_face_t *face_) : machine (machine_), @@ -742,59 +745,101 @@ struct StateTableDriver if (!c->in_place) buffer->clear_output (); - int state = StateTable<Types, EntryData>::STATE_START_OF_TEXT; + int state = StateTableT::STATE_START_OF_TEXT; for (buffer->idx = 0; buffer->successful;) { unsigned int klass = buffer->idx < buffer->len ? machine.get_class (buffer->info[buffer->idx].codepoint, num_glyphs) : - (unsigned) StateTable<Types, EntryData>::CLASS_END_OF_TEXT; + (unsigned) StateTableT::CLASS_END_OF_TEXT; DEBUG_MSG (APPLY, nullptr, "c%u at %u", klass, buffer->idx); - const Entry<EntryData> &entry = machine.get_entry (state, klass); + const EntryT &entry = machine.get_entry (state, klass); + const int next_state = machine.new_state (entry.newState); - /* Unsafe-to-break before this if not in state 0, as things might - * go differently if we start from state 0 here. + /* Conditions under which it's guaranteed safe-to-break before current glyph: * - * Ugh. The indexing here is ugly... */ - if (state && buffer->backtrack_len () && buffer->idx < buffer->len) - { - /* If there's no action and we're just epsilon-transitioning to state 0, - * safe to break. */ - if (c->is_actionable (this, entry) || - !(entry.newState == StateTable<Types, EntryData>::STATE_START_OF_TEXT && - entry.flags == context_t::DontAdvance)) - buffer->unsafe_to_break_from_outbuffer (buffer->backtrack_len () - 1, buffer->idx + 1); - } - - /* Unsafe-to-break if end-of-text would kick in here. */ - if (buffer->idx + 2 <= buffer->len) - { - const Entry<EntryData> &end_entry = machine.get_entry (state, StateTable<Types, EntryData>::CLASS_END_OF_TEXT); - if (c->is_actionable (this, end_entry)) - buffer->unsafe_to_break (buffer->idx, buffer->idx + 2); - } + * 1. There was no action in this transition; and + * + * 2. If we break before current glyph, the results will be the same. That + * is guaranteed if: + * + * 2a. 
We were already in start-of-text state; or + * + * 2b. We are epsilon-transitioning to start-of-text state; or + * + * 2c. Starting from start-of-text state seeing current glyph: + * + * 2c'. There won't be any actions; and + * + * 2c". We would end up in the same state that we were going to end up + * in now, including whether epsilon-transitioning. + * + * and + * + * 3. If we break before current glyph, there won't be any end-of-text action + * after previous glyph. + * + * This triples the transitions we need to look up, but is worth returning + * granular unsafe-to-break results. See eg.: + * + * https://github.com/harfbuzz/harfbuzz/issues/2860 + */ + const EntryT *wouldbe_entry; + bool safe_to_break = + /* 1. */ + !c->is_actionable (this, entry) + && + /* 2. */ + ( + /* 2a. */ + state == StateTableT::STATE_START_OF_TEXT + || + /* 2b. */ + ( + (entry.flags & context_t::DontAdvance) && + next_state == StateTableT::STATE_START_OF_TEXT + ) + || + /* 2c. */ + ( + wouldbe_entry = &machine.get_entry (StateTableT::STATE_START_OF_TEXT, klass) + , + /* 2c'. */ + !c->is_actionable (this, *wouldbe_entry) + && + /* 2c". */ + ( + next_state == machine.new_state (wouldbe_entry->newState) + && + (entry.flags & context_t::DontAdvance) == (wouldbe_entry->flags & context_t::DontAdvance) + ) + ) + ) + && + /* 3. */ + !c->is_actionable (this, machine.get_entry (state, StateTableT::CLASS_END_OF_TEXT)) + ; + + if (!safe_to_break && buffer->backtrack_len () && buffer->idx < buffer->len) + buffer->unsafe_to_break_from_outbuffer (buffer->backtrack_len () - 1, buffer->idx + 1); c->transition (this, entry); - state = machine.new_state (entry.newState); + state = next_state; DEBUG_MSG (APPLY, nullptr, "s%d", state); - if (buffer->idx == buffer->len) + if (buffer->idx == buffer->len || unlikely (!buffer->successful)) break; if (!(entry.flags & context_t::DontAdvance) || buffer->max_ops-- <= 0) - buffer->next_glyph (); + (void) buffer->next_glyph (); } if (!c->in_place) - { - for (; buffer->successful && buffer->idx < buffer->len;) - buffer->next_glyph (); buffer->swap_buffers (); - } } public: - const StateTable<Types, EntryData> &machine; + const StateTableT &machine; hb_buffer_t *buffer; unsigned int num_glyphs; }; diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh b/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh index 04027a61be..e3bc268d26 100644 --- a/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh +++ b/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh @@ -337,9 +337,9 @@ struct ContextualSubtable const EntryData &data = entries[i].data; if (data.markIndex != 0xFFFF) - num_lookups = hb_max (num_lookups, 1 + data.markIndex); + num_lookups = hb_max (num_lookups, 1u + data.markIndex); if (data.currentIndex != 0xFFFF) - num_lookups = hb_max (num_lookups, 1 + data.currentIndex); + num_lookups = hb_max (num_lookups, 1u + data.currentIndex); } return_trace (substitutionTables.sanitize (c, this, num_lookups)); @@ -499,7 +499,7 @@ struct LigatureSubtable } DEBUG_MSG (APPLY, nullptr, "Moving to stack position %u", cursor - 1); - buffer->move_to (match_positions[--cursor % ARRAY_LENGTH (match_positions)]); + if (unlikely (!buffer->move_to (match_positions[--cursor % ARRAY_LENGTH (match_positions)]))) return; if (unlikely (!actionData->sanitize (&c->sanitizer))) break; action = *actionData; @@ -525,25 +525,25 @@ struct LigatureSubtable hb_codepoint_t lig = ligatureData; DEBUG_MSG (APPLY, nullptr, "Produced ligature %u", lig); - buffer->replace_glyph (lig); + if (unlikely (!buffer->replace_glyph 
(lig))) return; unsigned int lig_end = match_positions[(match_length - 1u) % ARRAY_LENGTH (match_positions)] + 1u; /* Now go and delete all subsequent components. */ while (match_length - 1u > cursor) { DEBUG_MSG (APPLY, nullptr, "Skipping ligature component"); - buffer->move_to (match_positions[--match_length % ARRAY_LENGTH (match_positions)]); - buffer->replace_glyph (DELETED_GLYPH); + if (unlikely (!buffer->move_to (match_positions[--match_length % ARRAY_LENGTH (match_positions)]))) return; + if (unlikely (!buffer->replace_glyph (DELETED_GLYPH))) return; } - buffer->move_to (lig_end); + if (unlikely (!buffer->move_to (lig_end))) return; buffer->merge_out_clusters (match_positions[cursor % ARRAY_LENGTH (match_positions)], buffer->out_len); } actionData++; } while (!(action & LigActionLast)); - buffer->move_to (end); + if (unlikely (!buffer->move_to (end))) return; } } @@ -733,17 +733,16 @@ struct InsertionSubtable bool before = flags & MarkedInsertBefore; unsigned int end = buffer->out_len; - buffer->move_to (mark); + if (unlikely (!buffer->move_to (mark))) return; if (buffer->idx < buffer->len && !before) - buffer->copy_glyph (); + if (unlikely (!buffer->copy_glyph ())) return; /* TODO We ignore KashidaLike setting. */ - for (unsigned int i = 0; i < count; i++) - buffer->output_glyph (glyphs[i]); + if (unlikely (!buffer->replace_glyphs (0, count, glyphs))) return; if (buffer->idx < buffer->len && !before) buffer->skip_glyph (); - buffer->move_to (end + count); + if (unlikely (!buffer->move_to (end + count))) return; buffer->unsafe_to_break_from_outbuffer (mark, hb_min (buffer->idx + 1, buffer->len)); } @@ -764,10 +763,9 @@ struct InsertionSubtable unsigned int end = buffer->out_len; if (buffer->idx < buffer->len && !before) - buffer->copy_glyph (); + if (unlikely (!buffer->copy_glyph ())) return; /* TODO We ignore KashidaLike setting. */ - for (unsigned int i = 0; i < count; i++) - buffer->output_glyph (glyphs[i]); + if (unlikely (!buffer->replace_glyphs (0, count, glyphs))) return; if (buffer->idx < buffer->len && !before) buffer->skip_glyph (); @@ -786,7 +784,7 @@ struct InsertionSubtable * * https://github.com/harfbuzz/harfbuzz/issues/1224#issuecomment-427691417 */ - buffer->move_to ((flags & DontAdvance) ? end : end + count); + if (unlikely (!buffer->move_to ((flags & DontAdvance) ? end : end + count))) return; } } diff --git a/thirdparty/harfbuzz/src/hb-aat-layout.cc b/thirdparty/harfbuzz/src/hb-aat-layout.cc index 74ebaa64ec..0e9f2b4954 100644 --- a/thirdparty/harfbuzz/src/hb-aat-layout.cc +++ b/thirdparty/harfbuzz/src/hb-aat-layout.cc @@ -227,7 +227,7 @@ hb_aat_layout_compile_map (const hb_aat_map_builder_t *mapper, * * <note>Note: does not examine the `GSUB` table.</note> * - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.3.0 */ @@ -294,7 +294,7 @@ hb_aat_layout_remove_deleted_glyphs (hb_buffer_t *buffer) * * <note>Note: does not examine the `GPOS` table.</note> * - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.3.0 */ @@ -325,7 +325,7 @@ hb_aat_layout_position (const hb_ot_shape_plan_t *plan, * Tests whether the specified face includes any tracking information * in the `trak` table. 
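The morx-table hunks above switch from fire-and-forget buffer edits to calls whose boolean result is checked, so a subtable bails out cleanly when the buffer fails to grow. Below is a minimal sketch of that calling pattern using a deliberately simplified, hypothetical `MiniBuffer` type; it is an illustration of the check-and-bail discipline, not HarfBuzz's internal `hb_buffer_t` API.

```
#include <cstdint>
#include <vector>

// Hypothetical stand-in for hb_buffer_t: every mutator reports failure
// instead of silently leaving the buffer in a half-edited state.
struct MiniBuffer {
  std::vector<uint32_t> glyphs;

  bool move_to (size_t i) const { return i <= glyphs.size (); }
  bool replace_glyph (size_t i, uint32_t g)
  {
    if (i >= glyphs.size ()) return false;  // simulated allocation/range failure
    glyphs[i] = g;
    return true;
  }
};

// Caller mirrors the `if (unlikely (!buffer->move_to (...))) return;` shape
// adopted in the LigatureSubtable / InsertionSubtable changes above.
static void apply_fake_ligature (MiniBuffer &buffer, size_t pos, uint32_t lig)
{
  if (!buffer.move_to (pos)) return;
  if (!buffer.replace_glyph (pos, lig)) return;
  // further steps would follow the same check-and-bail pattern
}
```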
* - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.3.0 */ @@ -350,7 +350,7 @@ hb_aat_layout_track (const hb_ot_shape_plan_t *plan, * hb_aat_layout_get_feature_types: * @face: #hb_face_t to work upon * @start_offset: offset of the first feature type to retrieve - * @feature_count: (inout) (allow-none): Input = the maximum number of feature types to return; + * @feature_count: (inout) (optional): Input = the maximum number of feature types to return; * Output = the actual number of feature types returned (may be zero) * @features: (out caller-allocates) (array length=feature_count): Array of feature types found * @@ -374,9 +374,9 @@ hb_aat_layout_get_feature_types (hb_face_t *face, * @face: #hb_face_t to work upon * @feature_type: The #hb_aat_layout_feature_type_t of the requested feature type * - * Fetches the name ID of the specified feature type in the face's `name` table. + * Fetches the name identifier of the specified feature type in the face's `name` table. * - * Return value: Name ID of the requested feature type + * Return value: Name identifier of the requested feature type * * Since: 2.2.0 */ @@ -388,15 +388,15 @@ hb_aat_layout_feature_type_get_name_id (hb_face_t *face, } /** - * hb_aat_layout_feature_type_get_selectors: + * hb_aat_layout_feature_type_get_selector_infos: * @face: #hb_face_t to work upon * @feature_type: The #hb_aat_layout_feature_type_t of the requested feature type * @start_offset: offset of the first feature type to retrieve - * @selector_count: (inout) (allow-none): Input = the maximum number of selectors to return; + * @selector_count: (inout) (optional): Input = the maximum number of selectors to return; * Output = the actual number of selectors returned (may be zero) - * @selectors: (out caller-allocates) (array length=selector_count): A buffer pointer. - * The selectors available for the feature type queries. - * @default_index: (out) (allow-none): The index of the feature's default selector, if any + * @selectors: (out caller-allocates) (array length=selector_count) (optional): + * A buffer pointer. The selectors available for the feature type queries. + * @default_index: (out) (optional): The index of the feature's default selector, if any * * Fetches a list of the selectors available for the specified feature in the given face. * diff --git a/thirdparty/harfbuzz/src/hb-aat-layout.h b/thirdparty/harfbuzz/src/hb-aat-layout.h index dc1bf96573..9af2740088 100644 --- a/thirdparty/harfbuzz/src/hb-aat-layout.h +++ b/thirdparty/harfbuzz/src/hb-aat-layout.h @@ -22,7 +22,7 @@ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ -#ifndef HB_AAT_H_IN +#if !defined(HB_AAT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-aat.h> instead." 
#endif @@ -38,47 +38,47 @@ HB_BEGIN_DECLS /** * hb_aat_layout_feature_type_t: * @HB_AAT_LAYOUT_FEATURE_TYPE_INVALID: Initial, unset feature type - * @HB_AAT_LAYOUT_FEATURE_TYPE_ALL_TYPOGRAPHIC: - * @HB_AAT_LAYOUT_FEATURE_TYPE_LIGATURES: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CURISVE_CONNECTION: - * @HB_AAT_LAYOUT_FEATURE_TYPE_LETTER_CASE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_VERTICAL_SUBSTITUTION: - * @HB_AAT_LAYOUT_FEATURE_TYPE_LINGUISTIC_REARRANGEMENT: - * @HB_AAT_LAYOUT_FEATURE_TYPE_NUMBER_SPACING: - * @HB_AAT_LAYOUT_FEATURE_TYPE_SMART_SWASH_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_DIACRITICS_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_VERTICAL_POSITION: - * @HB_AAT_LAYOUT_FEATURE_TYPE_FRACTIONS: - * @HB_AAT_LAYOUT_FEATURE_TYPE_OVERLAPPING_CHARACTERS_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_TYPOGRAPHIC_EXTRAS: - * @HB_AAT_LAYOUT_FEATURE_TYPE_MATHEMATICAL_EXTRAS: - * @HB_AAT_LAYOUT_FEATURE_TYPE_ORNAMENT_SETS_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CHARACTER_ALTERNATIVES: - * @HB_AAT_LAYOUT_FEATURE_TYPE_DESIGN_COMPLEXITY_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_STYLE_OPTIONS: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CHARACTER_SHAPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_NUMBER_CASE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_TEXT_SPACING: - * @HB_AAT_LAYOUT_FEATURE_TYPE_TRANSLITERATION: - * @HB_AAT_LAYOUT_FEATURE_TYPE_ANNOTATION_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_KANA_SPACING_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_IDEOGRAPHIC_SPACING_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_UNICODE_DECOMPOSITION_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_RUBY_KANA: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CJK_SYMBOL_ALTERNATIVES_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_IDEOGRAPHIC_ALTERNATIVES_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CJK_VERTICAL_ROMAN_PLACEMENT_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_ITALIC_CJK_ROMAN: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CASE_SENSITIVE_LAYOUT: - * @HB_AAT_LAYOUT_FEATURE_TYPE_ALTERNATE_KANA: - * @HB_AAT_LAYOUT_FEATURE_TYPE_STYLISTIC_ALTERNATIVES: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CONTEXTUAL_ALTERNATIVES: - * @HB_AAT_LAYOUT_FEATURE_TYPE_LOWER_CASE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_UPPER_CASE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_LANGUAGE_TAG_TYPE: - * @HB_AAT_LAYOUT_FEATURE_TYPE_CJK_ROMAN_SPACING_TYPE: + * @HB_AAT_LAYOUT_FEATURE_TYPE_ALL_TYPOGRAPHIC: [All Typographic Features](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type0) + * @HB_AAT_LAYOUT_FEATURE_TYPE_LIGATURES: [Ligatures](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type1) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CURISVE_CONNECTION: [Cursive Connection](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type2) + * @HB_AAT_LAYOUT_FEATURE_TYPE_LETTER_CASE: [Letter Case](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type3) + * @HB_AAT_LAYOUT_FEATURE_TYPE_VERTICAL_SUBSTITUTION: [Vertical Substitution](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type4) + * @HB_AAT_LAYOUT_FEATURE_TYPE_LINGUISTIC_REARRANGEMENT: [Linguistic Rearrangement](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type5) + * @HB_AAT_LAYOUT_FEATURE_TYPE_NUMBER_SPACING: [Number Spacing](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type6) + * @HB_AAT_LAYOUT_FEATURE_TYPE_SMART_SWASH_TYPE: [Smart Swash](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type8) + * @HB_AAT_LAYOUT_FEATURE_TYPE_DIACRITICS_TYPE: 
[Diacritics](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type9) + * @HB_AAT_LAYOUT_FEATURE_TYPE_VERTICAL_POSITION: [Vertical Position](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type10) + * @HB_AAT_LAYOUT_FEATURE_TYPE_FRACTIONS: [Fractions](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type11) + * @HB_AAT_LAYOUT_FEATURE_TYPE_OVERLAPPING_CHARACTERS_TYPE: [Overlapping Characters](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type13) + * @HB_AAT_LAYOUT_FEATURE_TYPE_TYPOGRAPHIC_EXTRAS: [Typographic Extras](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type14) + * @HB_AAT_LAYOUT_FEATURE_TYPE_MATHEMATICAL_EXTRAS: [Mathematical Extras](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type15) + * @HB_AAT_LAYOUT_FEATURE_TYPE_ORNAMENT_SETS_TYPE: [Ornament Sets](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type16) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CHARACTER_ALTERNATIVES: [Character Alternatives](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type17) + * @HB_AAT_LAYOUT_FEATURE_TYPE_DESIGN_COMPLEXITY_TYPE: [Design Complexity](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type18) + * @HB_AAT_LAYOUT_FEATURE_TYPE_STYLE_OPTIONS: [Style Options](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type19) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CHARACTER_SHAPE: [Character Shape](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type20) + * @HB_AAT_LAYOUT_FEATURE_TYPE_NUMBER_CASE: [Number Case](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type21) + * @HB_AAT_LAYOUT_FEATURE_TYPE_TEXT_SPACING: [Text Spacing](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type22) + * @HB_AAT_LAYOUT_FEATURE_TYPE_TRANSLITERATION: [Transliteration](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type23) + * @HB_AAT_LAYOUT_FEATURE_TYPE_ANNOTATION_TYPE: [Annotation](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type24) + * @HB_AAT_LAYOUT_FEATURE_TYPE_KANA_SPACING_TYPE: [Kana Spacing](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type25) + * @HB_AAT_LAYOUT_FEATURE_TYPE_IDEOGRAPHIC_SPACING_TYPE: [Ideographic Spacing](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type26) + * @HB_AAT_LAYOUT_FEATURE_TYPE_UNICODE_DECOMPOSITION_TYPE: [Unicode Decomposition](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type27) + * @HB_AAT_LAYOUT_FEATURE_TYPE_RUBY_KANA: [Ruby Kana](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type28) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CJK_SYMBOL_ALTERNATIVES_TYPE: [CJK Symbol Alternatives](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type29) + * @HB_AAT_LAYOUT_FEATURE_TYPE_IDEOGRAPHIC_ALTERNATIVES_TYPE: [Ideographic Alternatives](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type30) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CJK_VERTICAL_ROMAN_PLACEMENT_TYPE: [CJK Vertical Roman Placement](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type31) + * @HB_AAT_LAYOUT_FEATURE_TYPE_ITALIC_CJK_ROMAN: [Italic CJK 
Roman](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type32) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CASE_SENSITIVE_LAYOUT: [Case Sensitive Layout](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type33) + * @HB_AAT_LAYOUT_FEATURE_TYPE_ALTERNATE_KANA: [Alternate Kana](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type34) + * @HB_AAT_LAYOUT_FEATURE_TYPE_STYLISTIC_ALTERNATIVES: [Stylistic Alternatives](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type35) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CONTEXTUAL_ALTERNATIVES: [Contextual Alternatives](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type36) + * @HB_AAT_LAYOUT_FEATURE_TYPE_LOWER_CASE: [Lower Case](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type37) + * @HB_AAT_LAYOUT_FEATURE_TYPE_UPPER_CASE: [Upper Case](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type38) + * @HB_AAT_LAYOUT_FEATURE_TYPE_LANGUAGE_TAG_TYPE: [Language Tag](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type39) + * @HB_AAT_LAYOUT_FEATURE_TYPE_CJK_ROMAN_SPACING_TYPE: [CJK Roman Spacing](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html#Type103) * - * The possible feature types defined for AAT shaping. + * The possible feature types defined for AAT shaping, from Apple [Font Feature Registry](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM09/AppendixF.html). * * Since: 2.2.0 */ @@ -732,6 +732,14 @@ HB_EXTERN hb_ot_name_id_t hb_aat_layout_feature_type_get_name_id (hb_face_t *face, hb_aat_layout_feature_type_t feature_type); +/** + * hb_aat_layout_feature_selector_info_t: + * @name_id: The selector's name identifier + * @enable: The value to turn the selector on + * @disable: The value to turn the selector off + * + * Structure representing a setting for an #hb_aat_layout_feature_type_t. + */ typedef struct hb_aat_layout_feature_selector_info_t { hb_ot_name_id_t name_id; hb_aat_layout_feature_selector_t enable; diff --git a/thirdparty/harfbuzz/src/hb-algs.hh b/thirdparty/harfbuzz/src/hb-algs.hh index 98de61f3e8..bc170b0546 100644 --- a/thirdparty/harfbuzz/src/hb-algs.hh +++ b/thirdparty/harfbuzz/src/hb-algs.hh @@ -35,6 +35,132 @@ #include "hb-number.hh" +/* + * Flags + */ + +/* Enable bitwise ops on enums marked as flags_t */ +/* To my surprise, looks like the function resolver is happy to silently cast + * one enum to another... So this doesn't provide the type-checking that I + * originally had in mind... :(. + * + * For MSVC warnings, see: https://github.com/harfbuzz/harfbuzz/pull/163 + */ +#ifdef _MSC_VER +# pragma warning(disable:4200) +# pragma warning(disable:4800) +#endif +#define HB_MARK_AS_FLAG_T(T) \ + extern "C++" { \ + static inline constexpr T operator | (T l, T r) { return T ((unsigned) l | (unsigned) r); } \ + static inline constexpr T operator & (T l, T r) { return T ((unsigned) l & (unsigned) r); } \ + static inline constexpr T operator ^ (T l, T r) { return T ((unsigned) l ^ (unsigned) r); } \ + static inline constexpr T operator ~ (T r) { return T (~(unsigned int) r); } \ + static inline T& operator |= (T &l, T r) { l = l | r; return l; } \ + static inline T& operator &= (T& l, T r) { l = l & r; return l; } \ + static inline T& operator ^= (T& l, T r) { l = l ^ r; return l; } \ + } \ + static_assert (true, "") + +/* Useful for set-operations on small enums. 
+ * For example, for testing "x ∈ {x1, x2, x3}" use: + * (FLAG_UNSAFE(x) & (FLAG(x1) | FLAG(x2) | FLAG(x3))) + */ +#define FLAG(x) (static_assert_expr ((unsigned)(x) < 32) + (((uint32_t) 1U) << (unsigned)(x))) +#define FLAG_UNSAFE(x) ((unsigned)(x) < 32 ? (((uint32_t) 1U) << (unsigned)(x)) : 0) +#define FLAG_RANGE(x,y) (static_assert_expr ((x) < (y)) + FLAG(y+1) - FLAG(x)) +#define FLAG64(x) (static_assert_expr ((unsigned)(x) < 64) + (((uint64_t) 1ULL) << (unsigned)(x))) +#define FLAG64_UNSAFE(x) ((unsigned)(x) < 64 ? (((uint64_t) 1ULL) << (unsigned)(x)) : 0) + + +/* + * Big-endian integers. + */ + +/* Endian swap, used in Windows related backends */ +static inline constexpr uint16_t hb_uint16_swap (uint16_t v) +{ return (v >> 8) | (v << 8); } +static inline constexpr uint32_t hb_uint32_swap (uint32_t v) +{ return (hb_uint16_swap (v) << 16) | hb_uint16_swap (v >> 16); } + +template <typename Type, int Bytes = sizeof (Type)> +struct BEInt; +template <typename Type> +struct BEInt<Type, 1> +{ + public: + BEInt () = default; + constexpr BEInt (Type V) : v {uint8_t (V)} {} + constexpr operator Type () const { return v; } + private: uint8_t v; +}; +template <typename Type> +struct BEInt<Type, 2> +{ + public: + BEInt () = default; + constexpr BEInt (Type V) : v {uint8_t ((V >> 8) & 0xFF), + uint8_t ((V ) & 0xFF)} {} + + struct __attribute__((packed)) packed_uint16_t { uint16_t v; }; + constexpr operator Type () const + { +#if ((defined(__GNUC__) && __GNUC__ >= 5) || defined(__clang__)) && \ + defined(__BYTE_ORDER) && \ + (__BYTE_ORDER == __LITTLE_ENDIAN || __BYTE_ORDER == __BIG_ENDIAN) + /* Spoon-feed the compiler a big-endian integer with alignment 1. + * https://github.com/harfbuzz/harfbuzz/pull/1398 */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + return __builtin_bswap16 (((packed_uint16_t *) this)->v); +#else /* __BYTE_ORDER == __BIG_ENDIAN */ + return ((packed_uint16_t *) this)->v; +#endif +#else + return (v[0] << 8) + + (v[1] ); +#endif + } + private: uint8_t v[2]; +}; +template <typename Type> +struct BEInt<Type, 3> +{ + static_assert (!hb_is_signed (Type), ""); + public: + BEInt () = default; + constexpr BEInt (Type V) : v {uint8_t ((V >> 16) & 0xFF), + uint8_t ((V >> 8) & 0xFF), + uint8_t ((V ) & 0xFF)} {} + + constexpr operator Type () const { return (v[0] << 16) + + (v[1] << 8) + + (v[2] ); } + private: uint8_t v[3]; +}; +template <typename Type> +struct BEInt<Type, 4> +{ + public: + BEInt () = default; + constexpr BEInt (Type V) : v {uint8_t ((V >> 24) & 0xFF), + uint8_t ((V >> 16) & 0xFF), + uint8_t ((V >> 8) & 0xFF), + uint8_t ((V ) & 0xFF)} {} + constexpr operator Type () const { return (v[0] << 24) + + (v[1] << 16) + + (v[2] << 8) + + (v[3] ); } + private: uint8_t v[4]; +}; + +/* Floats. */ + +/* We want our rounding towards +infinity. */ +static inline float +_hb_roundf (float x) { return floorf (x + .5f); } +#define roundf(x) _hb_roundf(x) + + /* Encodes three unsigned integers in one 64-bit number. If the inputs have more than 21 bits, * values will be truncated / overlap, and might not decode exactly. */ #define HB_CODEPOINT_ENCODE3(x,y,z) (((uint64_t) (x) << 42) | ((uint64_t) (y) << 21) | (uint64_t) (z)) @@ -48,6 +174,7 @@ #define HB_CODEPOINT_DECODE3_11_7_14_2(v) ((hb_codepoint_t) (((v) >> 14) & 0x007Fu) | 0x0300) #define HB_CODEPOINT_DECODE3_11_7_14_3(v) ((hb_codepoint_t) (v) & 0x3FFFu) + struct { /* Note. This is dangerous in that if it's passed an rvalue, it returns rvalue-reference. 
*/ @@ -215,7 +342,9 @@ struct template <typename Pred, typename Val> auto impl (Pred&& p, Val &&v, hb_priority<1>) const HB_AUTO_RETURN - (hb_deref (hb_forward<Pred> (p)).has (hb_forward<Val> (v))) + ( + hb_deref (hb_forward<Pred> (p)).has (hb_forward<Val> (v)) + ) template <typename Pred, typename Val> auto impl (Pred&& p, Val &&v, hb_priority<0>) const HB_AUTO_RETURN @@ -269,7 +398,9 @@ struct template <typename Proj, typename Val> auto impl (Proj&& f, Val &&v, hb_priority<2>) const HB_AUTO_RETURN - (hb_deref (hb_forward<Proj> (f)).get (hb_forward<Val> (v))) + ( + hb_deref (hb_forward<Proj> (f)).get (hb_forward<Val> (v)) + ) template <typename Proj, typename Val> auto impl (Proj&& f, Val &&v, hb_priority<1>) const HB_AUTO_RETURN @@ -296,6 +427,40 @@ struct } HB_FUNCOBJ (hb_get); +struct +{ + private: + + template <typename T1, typename T2> auto + impl (T1&& v1, T2 &&v2, hb_priority<2>) const HB_AUTO_RETURN + ( + hb_forward<T2> (v2).cmp (hb_forward<T1> (v1)) == 0 + ) + + template <typename T1, typename T2> auto + impl (T1&& v1, T2 &&v2, hb_priority<1>) const HB_AUTO_RETURN + ( + hb_forward<T1> (v1).cmp (hb_forward<T2> (v2)) == 0 + ) + + template <typename T1, typename T2> auto + impl (T1&& v1, T2 &&v2, hb_priority<0>) const HB_AUTO_RETURN + ( + hb_forward<T1> (v1) == hb_forward<T2> (v2) + ) + + public: + + template <typename T1, typename T2> auto + operator () (T1&& v1, T2 &&v2) const HB_AUTO_RETURN + ( + impl (hb_forward<T1> (v1), + hb_forward<T2> (v2), + hb_prioritize) + ) +} +HB_FUNCOBJ (hb_equal); + template <typename T1, typename T2> struct hb_pair_t @@ -375,7 +540,7 @@ HB_FUNCOBJ (hb_clamp); /* Return the number of 1 bits in v. */ template <typename T> -static inline HB_CONST_FUNC unsigned int +static inline unsigned int hb_popcount (T v) { #if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) @@ -416,7 +581,7 @@ hb_popcount (T v) /* Returns the number of bits needed to store number */ template <typename T> -static inline HB_CONST_FUNC unsigned int +static inline unsigned int hb_bit_storage (T v) { if (unlikely (!v)) return 0; @@ -490,7 +655,7 @@ hb_bit_storage (T v) /* Returns the number of zero bits in the least significant side of v */ template <typename T> -static inline HB_CONST_FUNC unsigned int +static inline unsigned int hb_ctz (T v) { if (unlikely (!v)) return 8 * sizeof (T); @@ -988,32 +1153,24 @@ hb_codepoint_parse (const char *s, unsigned int len, int base, hb_codepoint_t *o struct hb_bitwise_and { HB_PARTIALIZE(2); - static constexpr bool passthru_left = false; - static constexpr bool passthru_right = false; template <typename T> constexpr auto operator () (const T &a, const T &b) const HB_AUTO_RETURN (a & b) } HB_FUNCOBJ (hb_bitwise_and); struct hb_bitwise_or { HB_PARTIALIZE(2); - static constexpr bool passthru_left = true; - static constexpr bool passthru_right = true; template <typename T> constexpr auto operator () (const T &a, const T &b) const HB_AUTO_RETURN (a | b) } HB_FUNCOBJ (hb_bitwise_or); struct hb_bitwise_xor { HB_PARTIALIZE(2); - static constexpr bool passthru_left = true; - static constexpr bool passthru_right = true; template <typename T> constexpr auto operator () (const T &a, const T &b) const HB_AUTO_RETURN (a ^ b) } HB_FUNCOBJ (hb_bitwise_xor); struct hb_bitwise_sub { HB_PARTIALIZE(2); - static constexpr bool passthru_left = true; - static constexpr bool passthru_right = false; template <typename T> constexpr auto operator () (const T &a, const T &b) const HB_AUTO_RETURN (a & ~b) } diff --git a/thirdparty/harfbuzz/src/hb-array.hh 
b/thirdparty/harfbuzz/src/hb-array.hh index 568cd02c79..02bd8d81c2 100644 --- a/thirdparty/harfbuzz/src/hb-array.hh +++ b/thirdparty/harfbuzz/src/hb-array.hh @@ -142,7 +142,7 @@ struct hb_array_t : hb_iter_with_fallback_t<hb_array_t<Type>, Type&> bool lfind (const T &x, unsigned *pos = nullptr) const { for (unsigned i = 0; i < length; ++i) - if (!this->arrayZ[i].cmp (x)) + if (hb_equal (x, this->arrayZ[i])) { if (pos) *pos = i; diff --git a/thirdparty/harfbuzz/src/hb-atomic.hh b/thirdparty/harfbuzz/src/hb-atomic.hh index b3fb296b4e..93265f655f 100644 --- a/thirdparty/harfbuzz/src/hb-atomic.hh +++ b/thirdparty/harfbuzz/src/hb-atomic.hh @@ -52,7 +52,7 @@ #elif !defined(HB_NO_MT) && defined(__ATOMIC_ACQUIRE) -/* C++11-style GCC primitives. */ +/* C++11-style GCC primitives. We prefer these as they don't require linking to libstdc++ / libc++. */ #define _hb_memory_barrier() __sync_synchronize () @@ -73,7 +73,8 @@ _hb_atomic_ptr_impl_cmplexch (const void **P, const void *O_, const void *N) } #define hb_atomic_ptr_impl_cmpexch(P,O,N) _hb_atomic_ptr_impl_cmplexch ((const void **) (P), (O), (N)) -#elif !defined(HB_NO_MT) && __cplusplus >= 201103L + +#elif !defined(HB_NO_MT) /* C++11 atomics. */ @@ -101,117 +102,6 @@ _hb_atomic_ptr_impl_cmplexch (const void **P, const void *O_, const void *N) #define hb_atomic_ptr_impl_cmpexch(P,O,N) _hb_atomic_ptr_impl_cmplexch ((const void **) (P), (O), (N)) -#elif !defined(HB_NO_MT) && defined(_WIN32) - -#include <windows.h> - -static inline void _hb_memory_barrier () -{ -#if !defined(MemoryBarrier) && !defined(__MINGW32_VERSION) - /* MinGW has a convoluted history of supporting MemoryBarrier. */ - LONG dummy = 0; - InterlockedExchange (&dummy, 1); -#else - MemoryBarrier (); -#endif -} -#define _hb_memory_barrier() _hb_memory_barrier () - -#define hb_atomic_int_impl_add(AI, V) InterlockedExchangeAdd ((LONG *) (AI), (V)) -static_assert ((sizeof (LONG) == sizeof (int)), ""); - -#define hb_atomic_ptr_impl_cmpexch(P,O,N) (InterlockedCompareExchangePointer ((P), (N), (O)) == (O)) - - -#elif !defined(HB_NO_MT) && defined(HAVE_INTEL_ATOMIC_PRIMITIVES) - -#define _hb_memory_barrier() __sync_synchronize () - -#define hb_atomic_int_impl_add(AI, V) __sync_fetch_and_add ((AI), (V)) - -#define hb_atomic_ptr_impl_cmpexch(P,O,N) __sync_bool_compare_and_swap ((P), (O), (N)) - - -#elif !defined(HB_NO_MT) && defined(HAVE_SOLARIS_ATOMIC_OPS) - -#include <atomic.h> -#include <mbarrier.h> - -#define _hb_memory_r_barrier() __machine_r_barrier () -#define _hb_memory_w_barrier() __machine_w_barrier () -#define _hb_memory_barrier() __machine_rw_barrier () - -static inline int _hb_fetch_and_add (int *AI, int V) -{ - _hb_memory_w_barrier (); - int result = atomic_add_int_nv ((uint_t *) AI, V) - V; - _hb_memory_r_barrier (); - return result; -} -static inline bool _hb_compare_and_swap_ptr (void **P, void *O, void *N) -{ - _hb_memory_w_barrier (); - bool result = atomic_cas_ptr (P, O, N) == O; - _hb_memory_r_barrier (); - return result; -} - -#define hb_atomic_int_impl_add(AI, V) _hb_fetch_and_add ((AI), (V)) - -#define hb_atomic_ptr_impl_cmpexch(P,O,N) _hb_compare_and_swap_ptr ((P), (O), (N)) - - -#elif !defined(HB_NO_MT) && defined(__APPLE__) - -#include <libkern/OSAtomic.h> -#ifdef __MAC_OS_X_MIN_REQUIRED -#include <AvailabilityMacros.h> -#elif defined(__IPHONE_OS_MIN_REQUIRED) -#include <Availability.h> -#endif - -#define _hb_memory_barrier() OSMemoryBarrier () - -#define hb_atomic_int_impl_add(AI, V) (OSAtomicAdd32Barrier ((V), (AI)) - (V)) - -#if (MAC_OS_X_VERSION_MIN_REQUIRED > 
MAC_OS_X_VERSION_10_4 || __IPHONE_VERSION_MIN_REQUIRED >= 20100) -#define hb_atomic_ptr_impl_cmpexch(P,O,N) OSAtomicCompareAndSwapPtrBarrier ((O), (N), (P)) -#else -#if __ppc64__ || __x86_64__ || __aarch64__ -#define hb_atomic_ptr_impl_cmpexch(P,O,N) OSAtomicCompareAndSwap64Barrier ((int64_t) (O), (int64_t) (N), (int64_t*) (P)) -#else -#define hb_atomic_ptr_impl_cmpexch(P,O,N) OSAtomicCompareAndSwap32Barrier ((int32_t) (O), (int32_t) (N), (int32_t*) (P)) -#endif -#endif - - -#elif !defined(HB_NO_MT) && defined(_AIX) && (defined(__IBMCPP__) || defined(__ibmxl__)) - -#include <builtins.h> - -#define _hb_memory_barrier() __lwsync () - -static inline int _hb_fetch_and_add (int *AI, int V) -{ - _hb_memory_barrier (); - int result = __fetch_and_add (AI, V); - _hb_memory_barrier (); - return result; -} -static inline bool _hb_compare_and_swaplp (long *P, long O, long N) -{ - _hb_memory_barrier (); - bool result = __compare_and_swaplp (P, &O, N); - _hb_memory_barrier (); - return result; -} - -#define hb_atomic_int_impl_add(AI, V) _hb_fetch_and_add ((AI), (V)) - -#define hb_atomic_ptr_impl_cmpexch(P,O,N) _hb_compare_and_swaplp ((long *) (P), (long) (O), (long) (N)) -static_assert ((sizeof (long) == sizeof (void *)), ""); - - #elif defined(HB_NO_MT) #define hb_atomic_int_impl_add(AI, V) ((*(AI) += (V)) - (V)) @@ -259,9 +149,11 @@ inline void *hb_atomic_ptr_impl_get (void ** const P) { void *v = *P; _hb_memory #endif -#define HB_ATOMIC_INT_INIT(V) {V} struct hb_atomic_int_t { + hb_atomic_int_t () = default; + constexpr hb_atomic_int_t (int v) : v (v) {} + void set_relaxed (int v_) { hb_atomic_int_impl_set_relaxed (&v, v_); } void set (int v_) { hb_atomic_int_impl_set (&v, v_); } int get_relaxed () const { return hb_atomic_int_impl_get_relaxed (&v); } @@ -269,16 +161,17 @@ struct hb_atomic_int_t int inc () { return hb_atomic_int_impl_add (&v, 1); } int dec () { return hb_atomic_int_impl_add (&v, -1); } - int v; + int v = 0; }; - -#define HB_ATOMIC_PTR_INIT(V) {V} template <typename P> struct hb_atomic_ptr_t { typedef hb_remove_pointer<P> T; + hb_atomic_ptr_t () = default; + constexpr hb_atomic_ptr_t (T* v) : v (v) {} + void init (T* v_ = nullptr) { set_relaxed (v_); } void set_relaxed (T* v_) { hb_atomic_ptr_impl_set_relaxed (&v, v_); } T *get_relaxed () const { return (T *) hb_atomic_ptr_impl_get_relaxed (&v); } @@ -288,7 +181,7 @@ struct hb_atomic_ptr_t T * operator -> () const { return get (); } template <typename C> operator C * () const { return get (); } - T *v; + T *v = nullptr; }; diff --git a/thirdparty/harfbuzz/src/hb-blob.cc b/thirdparty/harfbuzz/src/hb-blob.cc index e340bc346d..71b1b1fc4f 100644 --- a/thirdparty/harfbuzz/src/hb-blob.cc +++ b/thirdparty/harfbuzz/src/hb-blob.cc @@ -35,9 +35,6 @@ #include <sys/mman.h> #endif /* HAVE_SYS_MMAN_H */ -#include <stdio.h> -#include <stdlib.h> - /** * SECTION: hb-blob @@ -58,7 +55,7 @@ * @length: Length of @data in bytes. * @mode: Memory mode for @data. * @user_data: Data parameter to pass to @destroy. - * @destroy: (optional): Callback to call when @data is not needed anymore. + * @destroy: (nullable): Callback to call when @data is not needed anymore. * * Creates a new "blob" object wrapping @data. The @mode parameter is used * to negotiate ownership and lifecycle of @data. @@ -116,7 +113,7 @@ _hb_blob_destroy (void *data) * @length: Length of sub-blob. * * Returns a blob that represents a range of bytes in @parent. 
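The hb-atomic.hh changes above drop the Windows, Solaris, Apple and AIX fallbacks in favour of the C++11-style path, and give the atomic wrappers constexpr constructors plus in-class initializers so the old `HB_ATOMIC_INT_INIT` / `HB_ATOMIC_PTR_INIT` macros are no longer needed. A rough sketch of the same idea expressed directly with `std::atomic` follows; the type and member names are illustrative only, not HarfBuzz's actual implementation.

```
#include <atomic>

// Illustrative wrapper only; not HarfBuzz's real hb_atomic_int_t.
struct toy_atomic_int_t {
  toy_atomic_int_t () = default;
  constexpr toy_atomic_int_t (int v_) : v (v_) {}  // usable for static initialization

  void set_relaxed (int x)  { v.store (x, std::memory_order_relaxed); }
  int  get_relaxed () const { return v.load (std::memory_order_relaxed); }
  int  inc ()               { return v.fetch_add ( 1, std::memory_order_acq_rel); }
  int  dec ()               { return v.fetch_add (-1, std::memory_order_acq_rel); }

  std::atomic<int> v {0};
};

// With the constexpr constructor there is no need for an init macro:
static toy_atomic_int_t ref_count (1);
```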
The new - * blob is always created with %HB_MEMORY_MODE_READONLY, meaning that it + * blob is always created with #HB_MEMORY_MODE_READONLY, meaning that it * will never modify data in the parent blob. The parent data is not * expected to be modified, and will result in undefined behavior if it * is. @@ -237,7 +234,7 @@ hb_blob_destroy (hb_blob_t *blob) * @blob: An #hb_blob_t * @key: The user-data key to set * @data: A pointer to the user data to set - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified blob. @@ -299,7 +296,7 @@ hb_blob_make_immutable (hb_blob_t *blob) * * Tests whether a blob is immutable. * - * Return value: %true if @blob is immutable, false otherwise + * Return value: %true if @blob is immutable, %false otherwise * * Since: 0.9.2 **/ @@ -365,16 +362,14 @@ hb_blob_get_data (hb_blob_t *blob, unsigned int *length) char * hb_blob_get_data_writable (hb_blob_t *blob, unsigned int *length) { - if (!blob->try_make_writable ()) { - if (length) - *length = 0; - + if (hb_object_is_immutable (blob) || + !blob->try_make_writable ()) + { + if (length) *length = 0; return nullptr; } - if (length) - *length = blob->length; - + if (length) *length = blob->length; return const_cast<char *> (blob->data); } @@ -440,8 +435,8 @@ hb_blob_t::try_make_writable_inplace () bool hb_blob_t::try_make_writable () { - if (hb_object_is_immutable (this)) - return false; + if (unlikely (!length)) + mode = HB_MEMORY_MODE_WRITABLE; if (this->mode == HB_MEMORY_MODE_WRITABLE) return true; diff --git a/thirdparty/harfbuzz/src/hb-blob.h b/thirdparty/harfbuzz/src/hb-blob.h index 00e41f3ce3..86f12788d2 100644 --- a/thirdparty/harfbuzz/src/hb-blob.h +++ b/thirdparty/harfbuzz/src/hb-blob.h @@ -24,7 +24,7 @@ * Red Hat Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -38,10 +38,12 @@ HB_BEGIN_DECLS /** * hb_memory_mode_t: - * @HB_MEMORY_MODE_DUPLICATE - * @HB_MEMORY_MODE_READONLY - * @HB_MEMORY_MODE_WRITABLE - * @HB_MEMORY_MODE_READONLY_MAY_MAKE_WRITABLE + * @HB_MEMORY_MODE_DUPLICATE: HarfBuzz immediately makes a copy of the data. + * @HB_MEMORY_MODE_READONLY: HarfBuzz client will never modify the data, + * and HarfBuzz will never modify the data. + * @HB_MEMORY_MODE_WRITABLE: HarfBuzz client made a copy of the data solely + * for HarfBuzz, so HarfBuzz may modify the data. + * @HB_MEMORY_MODE_READONLY_MAY_MAKE_WRITABLE: See above * * Data type holding the memory modes available to * client programs. diff --git a/thirdparty/harfbuzz/src/hb-buffer-serialize.cc b/thirdparty/harfbuzz/src/hb-buffer-serialize.cc index f65bad45bb..6539b89640 100644 --- a/thirdparty/harfbuzz/src/hb-buffer-serialize.cc +++ b/thirdparty/harfbuzz/src/hb-buffer-serialize.cc @@ -400,8 +400,8 @@ _hb_buffer_serialize_unicode_text (hb_buffer_t *buffer, * @buf: (out) (array length=buf_size) (element-type uint8_t): output string to * write serialized buffer into. * @buf_size: the size of @buf. - * @buf_consumed: (out) (allow-none): if not %NULL, will be set to the number of byes written into @buf. - * @font: (allow-none): the #hb_font_t used to shape this buffer, needed to + * @buf_consumed: (out) (optional): if not %NULL, will be set to the number of byes written into @buf. 
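The expanded `hb_memory_mode_t` documentation above comes down to who owns the bytes and for how long. A short usage sketch of the public `hb_blob_create()` API under the two most common modes; the wrapper function names are placeholders.

```
#include <hb.h>

static hb_blob_t *wrap_font_bytes (const char *data, unsigned int length)
{
  // READONLY: the caller keeps `data` alive and promises not to modify it,
  // so HarfBuzz can reference it without copying.
  return hb_blob_create (data, length, HB_MEMORY_MODE_READONLY,
                         /*user_data*/ nullptr, /*destroy*/ nullptr);
}

static hb_blob_t *copy_font_bytes (const char *data, unsigned int length)
{
  // DUPLICATE: HarfBuzz makes its own copy up front, so the caller's buffer
  // may be freed as soon as this call returns.
  return hb_blob_create (data, length, HB_MEMORY_MODE_DUPLICATE,
                         nullptr, nullptr);
}
```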
+ * @font: (nullable): the #hb_font_t used to shape this buffer, needed to * read glyph names and extents. If %NULL, and empty font will be used. * @format: the #hb_buffer_serialize_format_t to use for formatting the output. * @flags: the #hb_buffer_serialize_flags_t that control what glyph properties @@ -514,8 +514,10 @@ hb_buffer_serialize_glyphs (hb_buffer_t *buffer, * @buf: (out) (array length=buf_size) (element-type uint8_t): output string to * write serialized buffer into. * @buf_size: the size of @buf. - * @buf_consumed: (out) (allow-none): if not %NULL, will be set to the number of byes written into @buf. + * @buf_consumed: (out) (optional): if not %NULL, will be set to the number of byes written into @buf. * @format: the #hb_buffer_serialize_format_t to use for formatting the output. + * @flags: the #hb_buffer_serialize_flags_t that control what glyph properties + * to serialize. * * Serializes @buffer into a textual representation of its content, * when the buffer contains Unicode codepoints (i.e., before shaping). This is @@ -635,8 +637,8 @@ _hb_buffer_serialize_invalid (hb_buffer_t *buffer, * @buf: (out) (array length=buf_size) (element-type uint8_t): output string to * write serialized buffer into. * @buf_size: the size of @buf. - * @buf_consumed: (out) (allow-none): if not %NULL, will be set to the number of byes written into @buf. - * @font: (allow-none): the #hb_font_t used to shape this buffer, needed to + * @buf_consumed: (out) (optional): if not %NULL, will be set to the number of byes written into @buf. + * @font: (nullable): the #hb_font_t used to shape this buffer, needed to * read glyph names and extents. If %NULL, and empty font will be used. * @format: the #hb_buffer_serialize_format_t to use for formatting the output. * @flags: the #hb_buffer_serialize_flags_t that control what glyph properties @@ -724,15 +726,17 @@ parse_hex (const char *pp, const char *end, uint32_t *pv) /** * hb_buffer_deserialize_glyphs: * @buffer: an #hb_buffer_t buffer. - * @buf: (array length=buf_len): - * @buf_len: - * @end_ptr: (out): - * @font: - * @format: - * + * @buf: (array length=buf_len): string to deserialize + * @buf_len: the size of @buf, or -1 if it is %NULL-terminated + * @end_ptr: (out) (optional): output pointer to the character after last + * consumed one. + * @font: (nullable): font for getting glyph IDs + * @format: the #hb_buffer_serialize_format_t of the input @buf * + * Deserializes glyphs @buffer from textual representation in the format + * produced by hb_buffer_serialize_glyphs(). * - * Return value: + * Return value: %true if @buf is not fully consumed, %false otherwise. * * Since: 0.9.7 **/ @@ -795,14 +799,16 @@ hb_buffer_deserialize_glyphs (hb_buffer_t *buffer, /** * hb_buffer_deserialize_unicode: * @buffer: an #hb_buffer_t buffer. - * @buf: (array length=buf_len): - * @buf_len: - * @end_ptr: (out): - * @format: - * + * @buf: (array length=buf_len): string to deserialize + * @buf_len: the size of @buf, or -1 if it is %NULL-terminated + * @end_ptr: (out) (optional): output pointer to the character after last + * consumed one. + * @format: the #hb_buffer_serialize_format_t of the input @buf * + * Deserializes Unicode @buffer from textual representation in the format + * produced by hb_buffer_serialize_unicode(). * - * Return value: + * Return value: %true if @buf is not fully consumed, %false otherwise. 
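The rewritten documentation above pairs `hb_buffer_serialize_glyphs()` with `hb_buffer_deserialize_glyphs()` as a round-trippable text format. A minimal usage sketch with the public API; the buffer is assumed to have been shaped already and error handling is kept to the essentials.

```
#include <hb.h>
#include <cstdio>

// Serialize a shaped buffer to text, then parse it back into another buffer.
static void roundtrip (hb_buffer_t *shaped, hb_font_t *font)
{
  char buf[4096];
  unsigned int consumed = 0;
  hb_buffer_serialize_glyphs (shaped, 0, hb_buffer_get_length (shaped),
                              buf, sizeof (buf), &consumed, font,
                              HB_BUFFER_SERIALIZE_FORMAT_TEXT,
                              HB_BUFFER_SERIALIZE_FLAG_DEFAULT);
  printf ("%.*s\n", (int) consumed, buf);

  hb_buffer_t *copy = hb_buffer_create ();
  const char *end = nullptr;
  hb_bool_t more = hb_buffer_deserialize_glyphs (copy, buf, (int) consumed, &end, font,
                                                 HB_BUFFER_SERIALIZE_FORMAT_TEXT);
  // Per the docs above, `more` is true only if part of `buf` was left unconsumed.
  (void) more;
  hb_buffer_destroy (copy);
}
```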
* * Since: 2.7.3 **/ diff --git a/thirdparty/harfbuzz/src/hb-buffer.cc b/thirdparty/harfbuzz/src/hb-buffer.cc index 10063db050..8cad6ab8e6 100644 --- a/thirdparty/harfbuzz/src/hb-buffer.cc +++ b/thirdparty/harfbuzz/src/hb-buffer.cc @@ -218,9 +218,6 @@ hb_buffer_t::get_scratch_buffer (unsigned int *size) void hb_buffer_t::reset () { - if (unlikely (hb_object_is_immutable (this))) - return; - hb_unicode_funcs_destroy (unicode); unicode = hb_unicode_funcs_reference (hb_unicode_funcs_get_default ()); flags = HB_BUFFER_FLAG_DEFAULT; @@ -233,9 +230,6 @@ hb_buffer_t::reset () void hb_buffer_t::clear () { - if (unlikely (hb_object_is_immutable (this))) - return; - hb_segment_properties_t default_props = HB_SEGMENT_PROPERTIES_DEFAULT; props = default_props; scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT; @@ -290,9 +284,6 @@ hb_buffer_t::add_info (const hb_glyph_info_t &glyph_info) void hb_buffer_t::remove_output () { - if (unlikely (hb_object_is_immutable (this))) - return; - have_output = false; have_positions = false; @@ -303,9 +294,6 @@ hb_buffer_t::remove_output () void hb_buffer_t::clear_output () { - if (unlikely (hb_object_is_immutable (this))) - return; - have_output = true; have_positions = false; @@ -316,9 +304,6 @@ hb_buffer_t::clear_output () void hb_buffer_t::clear_positions () { - if (unlikely (hb_object_is_immutable (this))) - return; - have_output = false; have_positions = true; @@ -333,15 +318,19 @@ hb_buffer_t::swap_buffers () { if (unlikely (!successful)) return; + assert (idx <= len); + if (unlikely (!next_glyphs (len - idx))) return; + assert (have_output); have_output = false; if (out_info != info) { - hb_glyph_info_t *tmp_string; - tmp_string = info; + hb_glyph_info_t *tmp; + tmp = info; info = out_info; - out_info = tmp_string; + out_info = tmp; + pos = (hb_glyph_position_t *) out_info; } @@ -353,31 +342,6 @@ hb_buffer_t::swap_buffers () idx = 0; } - -void -hb_buffer_t::replace_glyphs (unsigned int num_in, - unsigned int num_out, - const uint32_t *glyph_data) -{ - if (unlikely (!make_room_for (num_in, num_out))) return; - - assert (idx + num_in <= len); - - merge_clusters (idx, idx + num_in); - - hb_glyph_info_t orig_info = info[idx]; - hb_glyph_info_t *pinfo = &out_info[out_len]; - for (unsigned int i = 0; i < num_out; i++) - { - *pinfo = orig_info; - pinfo->codepoint = glyph_data[i]; - pinfo++; - } - - idx += num_in; - out_len += num_out; -} - bool hb_buffer_t::move_to (unsigned int i) { @@ -768,7 +732,7 @@ hb_buffer_destroy (hb_buffer_t *buffer) * @buffer: An #hb_buffer_t * @key: The user-data key * @data: A pointer to the user data - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified buffer. @@ -795,7 +759,7 @@ hb_buffer_set_user_data (hb_buffer_t *buffer, * Fetches the user data associated with the specified key, * attached to the specified buffer. * - * Return value: (transfer-none): A pointer to the user data + * Return value: (transfer none): A pointer to the user data * * Since: 0.9.2 **/ @@ -1137,7 +1101,7 @@ hb_buffer_get_cluster_level (hb_buffer_t *buffer) * Sets the #hb_codepoint_t that replaces invalid entries for a given encoding * when adding text to @buffer. * - * Default is %HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT. + * Default is #HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT. 
* * Since: 0.9.31 **/ @@ -1222,6 +1186,9 @@ hb_buffer_get_invisible_glyph (hb_buffer_t *buffer) void hb_buffer_reset (hb_buffer_t *buffer) { + if (unlikely (hb_object_is_immutable (buffer))) + return; + buffer->reset (); } @@ -1237,6 +1204,9 @@ hb_buffer_reset (hb_buffer_t *buffer) void hb_buffer_clear_contents (hb_buffer_t *buffer) { + if (unlikely (hb_object_is_immutable (buffer))) + return; + buffer->clear (); } @@ -1321,7 +1291,7 @@ hb_buffer_set_length (hb_buffer_t *buffer, if (unlikely (hb_object_is_immutable (buffer))) return length == 0; - if (!buffer->ensure (length)) + if (unlikely (!buffer->ensure (length))) return false; /* Wipe the new space */ @@ -1501,20 +1471,20 @@ hb_buffer_reverse_clusters (hb_buffer_t *buffer) * * Sets unset buffer segment properties based on buffer Unicode * contents. If buffer is not empty, it must have content type - * %HB_BUFFER_CONTENT_TYPE_UNICODE. + * #HB_BUFFER_CONTENT_TYPE_UNICODE. * - * If buffer script is not set (ie. is %HB_SCRIPT_INVALID), it + * If buffer script is not set (ie. is #HB_SCRIPT_INVALID), it * will be set to the Unicode script of the first character in - * the buffer that has a script other than %HB_SCRIPT_COMMON, - * %HB_SCRIPT_INHERITED, and %HB_SCRIPT_UNKNOWN. + * the buffer that has a script other than #HB_SCRIPT_COMMON, + * #HB_SCRIPT_INHERITED, and #HB_SCRIPT_UNKNOWN. * - * Next, if buffer direction is not set (ie. is %HB_DIRECTION_INVALID), + * Next, if buffer direction is not set (ie. is #HB_DIRECTION_INVALID), * it will be set to the natural horizontal direction of the * buffer script as returned by hb_script_get_horizontal_direction(). - * If hb_script_get_horizontal_direction() returns %HB_DIRECTION_INVALID, - * then %HB_DIRECTION_LTR is used. + * If hb_script_get_horizontal_direction() returns #HB_DIRECTION_INVALID, + * then #HB_DIRECTION_LTR is used. * - * Finally, if buffer language is not set (ie. is %HB_LANGUAGE_INVALID), + * Finally, if buffer language is not set (ie. is #HB_LANGUAGE_INVALID), * it will be set to the process's default language as returned by * hb_language_get_default(). This may change in the future by * taking buffer script into consideration when choosing a language. @@ -1551,7 +1521,10 @@ hb_buffer_add_utf (hb_buffer_t *buffer, if (item_length == -1) item_length = text_length - item_offset; - buffer->ensure (buffer->len + item_length * sizeof (T) / 4); + if (unlikely (item_length < 0 || + item_length > INT_MAX / 8 || + !buffer->ensure (buffer->len + item_length * sizeof (T) / 4))) + return; /* If buffer is empty and pre-context provided, install it. * This check is written this way, to make sure people can @@ -1768,11 +1741,6 @@ hb_buffer_append (hb_buffer_t *buffer, if (start == end) return; - if (!buffer->len) - buffer->content_type = source->content_type; - if (!buffer->have_positions && source->have_positions) - buffer->clear_positions (); - if (buffer->len + (end - start) < buffer->len) /* Overflows. 
*/ { buffer->successful = false; @@ -1784,6 +1752,11 @@ hb_buffer_append (hb_buffer_t *buffer, if (unlikely (!buffer->successful)) return; + if (!orig_len) + buffer->content_type = source->content_type; + if (!buffer->have_positions && source->have_positions) + buffer->clear_positions (); + memcpy (buffer->info + orig_len, source->info + start, (end - start) * sizeof (buffer->info[0])); if (buffer->have_positions) memcpy (buffer->pos + orig_len, source->pos + start, (end - start) * sizeof (buffer->pos[0])); @@ -1902,8 +1875,8 @@ hb_buffer_t::sort (unsigned int start, unsigned int end, int(*compar)(const hb_g * @dottedcircle_glyph: glyph id of U+25CC DOTTED CIRCLE, or (hb_codepont_t) -1. * @position_fuzz: allowed absolute difference in position values. * - * If dottedcircle_glyph is (hb_codepoint_t) -1 then %HB_BUFFER_DIFF_FLAG_DOTTED_CIRCLE_PRESENT - * and %HB_BUFFER_DIFF_FLAG_NOTDEF_PRESENT are never returned. This should be used by most + * If dottedcircle_glyph is (hb_codepoint_t) -1 then #HB_BUFFER_DIFF_FLAG_DOTTED_CIRCLE_PRESENT + * and #HB_BUFFER_DIFF_FLAG_NOTDEF_PRESENT are never returned. This should be used by most * callers if just comparing two buffers is needed. * * Since: 1.5.0 @@ -1994,11 +1967,11 @@ hb_buffer_diff (hb_buffer_t *buffer, /** * hb_buffer_set_message_func: * @buffer: An #hb_buffer_t - * @func: (closure user_data) (destroy destroy) (scope notified): - * @user_data: - * @destroy: - * + * @func: (closure user_data) (destroy destroy) (scope notified): Callback function + * @user_data: (nullable): Data to pass to @func + * @destroy: (nullable): The function to call when @user_data is not needed anymore * + * Sets the implementation function for #hb_buffer_message_func_t. * * Since: 1.1.3 **/ diff --git a/thirdparty/harfbuzz/src/hb-buffer.h b/thirdparty/harfbuzz/src/hb-buffer.h index b13757e68f..865ccb2273 100644 --- a/thirdparty/harfbuzz/src/hb-buffer.h +++ b/thirdparty/harfbuzz/src/hb-buffer.h @@ -27,7 +27,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -90,6 +90,8 @@ typedef struct hb_glyph_info_t { * breaking point only. * @HB_GLYPH_FLAG_DEFINED: All the currently defined flags. * + * Flags for #hb_glyph_info_t. + * * Since: 1.5.0 */ typedef enum { /*< flags >*/ @@ -150,6 +152,11 @@ typedef struct hb_segment_properties_t { void *reserved2; } hb_segment_properties_t; +/** + * HB_SEGMENT_PROPERTIES_DEFAULT: + * + * The default #hb_segment_properties_t of of freshly created #hb_buffer_t. + */ #define HB_SEGMENT_PROPERTIES_DEFAULT {HB_DIRECTION_INVALID, \ HB_SCRIPT_INVALID, \ HB_LANGUAGE_INVALID, \ @@ -203,6 +210,8 @@ hb_buffer_get_user_data (hb_buffer_t *buffer, * @HB_BUFFER_CONTENT_TYPE_INVALID: Initial value for new buffer. * @HB_BUFFER_CONTENT_TYPE_UNICODE: The buffer contains input characters (before shaping). * @HB_BUFFER_CONTENT_TYPE_GLYPHS: The buffer contains output glyphs (after shaping). + * + * The type of #hb_buffer_t contents. */ typedef enum { HB_BUFFER_CONTENT_TYPE_INVALID = 0, @@ -288,6 +297,8 @@ hb_buffer_guess_segment_properties (hb_buffer_t *buffer); * not be inserted in the rendering of incorrect * character sequences (such at <0905 093E>). Since: 2.4 * + * Flags for #hb_buffer_t. + * * Since: 0.9.20 */ typedef enum { /*< flags >*/ @@ -579,6 +590,35 @@ hb_buffer_deserialize_unicode (hb_buffer_t *buffer, * Compare buffers */ +/** + * hb_buffer_diff_flags_t: + * @HB_BUFFER_DIFF_FLAG_EQUAL: equal buffers. 
+ * @HB_BUFFER_DIFF_FLAG_CONTENT_TYPE_MISMATCH: buffers with different + * #hb_buffer_content_type_t. + * @HB_BUFFER_DIFF_FLAG_LENGTH_MISMATCH: buffers with differing length. + * @HB_BUFFER_DIFF_FLAG_NOTDEF_PRESENT: `.notdef` glyph is present in the + * reference buffer. + * @HB_BUFFER_DIFF_FLAG_DOTTED_CIRCLE_PRESENT: dotted circle glyph is present + * in the reference buffer. + * @HB_BUFFER_DIFF_FLAG_CODEPOINT_MISMATCH: difference in #hb_glyph_info_t.codepoint + * @HB_BUFFER_DIFF_FLAG_CLUSTER_MISMATCH: difference in #hb_glyph_info_t.cluster + * @HB_BUFFER_DIFF_FLAG_GLYPH_FLAGS_MISMATCH: difference in #hb_glyph_flags_t. + * @HB_BUFFER_DIFF_FLAG_POSITION_MISMATCH: difference in #hb_glyph_position_t. + * + * Flags from comparing two #hb_buffer_t's. + * + * Buffer with different #hb_buffer_content_type_t cannot be meaningfully + * compared in any further detail. + * + * For buffers with differing length, the per-glyph comparison is not + * attempted, though we do still scan reference buffer for dotted circle and + * `.notdef` glyphs. + * + * If the buffers have the same length, we compare them glyph-by-glyph and + * report which aspect(s) of the glyph info/position are different. + * + * Since: 1.5.0 + */ typedef enum { /*< flags >*/ HB_BUFFER_DIFF_FLAG_EQUAL = 0x0000, @@ -618,6 +658,23 @@ hb_buffer_diff (hb_buffer_t *buffer, * Debugging. */ +/** + * hb_buffer_message_func_t: + * @buffer: An #hb_buffer_t to work upon + * @font: The #hb_font_t the @buffer is shaped with + * @message: %NULL-terminated message passed to the function + * @user_data: User data pointer passed by the caller + * + * A callback method for #hb_buffer_t. The method gets called with the + * #hb_buffer_t it was set on, the #hb_font_t the buffer is shaped with and a + * message describing what step of the shaping process will be performed. + * Returning %false from this method will skip this shaping step and move to + * the next one. + * + * Return value: %true to perform the shaping step, %false to skip it. + * + * Since: 1.1.3 + */ typedef hb_bool_t (*hb_buffer_message_func_t) (hb_buffer_t *buffer, hb_font_t *font, const char *message, diff --git a/thirdparty/harfbuzz/src/hb-buffer.hh b/thirdparty/harfbuzz/src/hb-buffer.hh index 9cad5206e2..8b432b5f96 100644 --- a/thirdparty/harfbuzz/src/hb-buffer.hh +++ b/thirdparty/harfbuzz/src/hb-buffer.hh @@ -139,7 +139,7 @@ struct hb_buffer_t /* Methods */ - bool in_error () const { return !successful; } + HB_NODISCARD bool in_error () const { return !successful; } void allocate_var (unsigned int start, unsigned int count) { @@ -186,7 +186,7 @@ struct hb_buffer_t hb_glyph_info_t &prev () { return out_info[out_len ? out_len - 1 : 0]; } hb_glyph_info_t prev () const { return out_info[out_len ? 
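The new `hb_buffer_message_func_t` documentation above spells out the callback contract: return true to run a shaping step, false to skip it. A minimal tracing callback under that contract might look like this sketch; the function name and the unconditional true return are assumptions, not upstream code.

```
#include <hb.h>
#include <stdio.h>

/* Logs each shaping step; returning 1 (true) lets the step run. */
static hb_bool_t
trace_shaping_step (hb_buffer_t *buffer, hb_font_t *font,
                    const char *message, void *user_data)
{
  (void) buffer; (void) font; (void) user_data;
  fprintf (stderr, "harfbuzz: %s\n", message);
  return 1;
}

/* Installed before shaping with:
 *   hb_buffer_set_message_func (buf, trace_shaping_step, NULL, NULL);
 */
```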
out_len - 1 : 0]; } - bool has_separate_output () const { return info != out_info; } + HB_NODISCARD bool has_separate_output () const { return info != out_info; } HB_INTERNAL void reset (); @@ -210,86 +210,89 @@ struct hb_buffer_t HB_INTERNAL void clear_output (); HB_INTERNAL void clear_positions (); - HB_INTERNAL void replace_glyphs (unsigned int num_in, - unsigned int num_out, - const hb_codepoint_t *glyph_data); - - void replace_glyph (hb_codepoint_t glyph_index) + template <typename T> + HB_NODISCARD bool replace_glyphs (unsigned int num_in, + unsigned int num_out, + const T *glyph_data) { - if (unlikely (out_info != info || out_len != idx)) { - if (unlikely (!make_room_for (1, 1))) return; - out_info[out_len] = info[idx]; - } - out_info[out_len].codepoint = glyph_index; + if (unlikely (!make_room_for (num_in, num_out))) return false; - idx++; - out_len++; - } - /* Makes a copy of the glyph at idx to output and replace glyph_index */ - hb_glyph_info_t & output_glyph (hb_codepoint_t glyph_index) - { - if (unlikely (!make_room_for (0, 1))) return Crap (hb_glyph_info_t); + assert (idx + num_in <= len); - if (unlikely (idx == len && !out_len)) - return Crap (hb_glyph_info_t); + merge_clusters (idx, idx + num_in); - out_info[out_len] = idx < len ? info[idx] : out_info[out_len - 1]; - out_info[out_len].codepoint = glyph_index; + hb_glyph_info_t &orig_info = idx < len ? cur() : prev(); - out_len++; + hb_glyph_info_t *pinfo = &out_info[out_len]; + for (unsigned int i = 0; i < num_out; i++) + { + *pinfo = orig_info; + pinfo->codepoint = glyph_data[i]; + pinfo++; + } - return out_info[out_len - 1]; + idx += num_in; + out_len += num_out; + return true; } - void output_info (const hb_glyph_info_t &glyph_info) + + HB_NODISCARD bool replace_glyph (hb_codepoint_t glyph_index) + { return replace_glyphs (1, 1, &glyph_index); } + + /* Makes a copy of the glyph at idx to output and replace glyph_index */ + HB_NODISCARD bool output_glyph (hb_codepoint_t glyph_index) + { return replace_glyphs (0, 1, &glyph_index); } + + HB_NODISCARD bool output_info (const hb_glyph_info_t &glyph_info) { - if (unlikely (!make_room_for (0, 1))) return; + if (unlikely (!make_room_for (0, 1))) return false; out_info[out_len] = glyph_info; out_len++; + return true; } /* Copies glyph at idx to output but doesn't advance idx */ - void copy_glyph () + HB_NODISCARD bool copy_glyph () { - if (unlikely (!make_room_for (0, 1))) return; - - out_info[out_len] = info[idx]; - - out_len++; + /* Extra copy because cur()'s return can be freed within + * output_info() call if buffer reallocates. */ + return output_info (hb_glyph_info_t (cur())); } + /* Copies glyph at idx to output and advance idx. * If there's no output, just advance idx. */ - void - next_glyph () + HB_NODISCARD bool next_glyph () { if (have_output) { if (out_info != info || out_len != idx) { - if (unlikely (!make_room_for (1, 1))) return; + if (unlikely (!make_room_for (1, 1))) return false; out_info[out_len] = info[idx]; } out_len++; } idx++; + return true; } /* Copies n glyphs at idx to output and advance idx. * If there's no output, just advance idx. */ - void - next_glyphs (unsigned int n) + HB_NODISCARD bool next_glyphs (unsigned int n) { if (have_output) { if (out_info != info || out_len != idx) { - if (unlikely (!make_room_for (n, n))) return; + if (unlikely (!make_room_for (n, n))) return false; memmove (out_info + out_len, info + idx, n * sizeof (out_info[0])); } out_len += n; } idx += n; + return true; } /* Advance idx without copying to output. 
*/ void skip_glyph () { idx++; } @@ -329,14 +332,14 @@ struct hb_buffer_t /* Internal methods */ - HB_INTERNAL bool move_to (unsigned int i); /* i is output-buffer index. */ + HB_NODISCARD HB_INTERNAL bool move_to (unsigned int i); /* i is output-buffer index. */ - HB_INTERNAL bool enlarge (unsigned int size); + HB_NODISCARD HB_INTERNAL bool enlarge (unsigned int size); - bool ensure (unsigned int size) + HB_NODISCARD bool ensure (unsigned int size) { return likely (!size || size < allocated) ? true : enlarge (size); } - bool ensure_inplace (unsigned int size) + HB_NODISCARD bool ensure_inplace (unsigned int size) { return likely (!size || size < allocated); } void assert_glyphs () @@ -349,7 +352,7 @@ struct hb_buffer_t assert ((content_type == HB_BUFFER_CONTENT_TYPE_UNICODE) || (!len && (content_type == HB_BUFFER_CONTENT_TYPE_INVALID))); } - bool ensure_glyphs () + HB_NODISCARD bool ensure_glyphs () { if (unlikely (content_type != HB_BUFFER_CONTENT_TYPE_GLYPHS)) { @@ -360,7 +363,7 @@ struct hb_buffer_t } return true; } - bool ensure_unicode () + HB_NODISCARD bool ensure_unicode () { if (unlikely (content_type != HB_BUFFER_CONTENT_TYPE_UNICODE)) { @@ -372,8 +375,8 @@ struct hb_buffer_t return true; } - HB_INTERNAL bool make_room_for (unsigned int num_in, unsigned int num_out); - HB_INTERNAL bool shift_forward (unsigned int count); + HB_NODISCARD HB_INTERNAL bool make_room_for (unsigned int num_in, unsigned int num_out); + HB_NODISCARD HB_INTERNAL bool shift_forward (unsigned int count); typedef long scratch_buffer_t; HB_INTERNAL scratch_buffer_t *get_scratch_buffer (unsigned int *size); diff --git a/thirdparty/harfbuzz/src/hb-common.cc b/thirdparty/harfbuzz/src/hb-common.cc index ddbcaa064c..7bb878b217 100644 --- a/thirdparty/harfbuzz/src/hb-common.cc +++ b/thirdparty/harfbuzz/src/hb-common.cc @@ -675,8 +675,8 @@ hb_version_string () * Tests the library version against a minimum value, * as three integer components. * - * Return value: True if the library is equal to or greater than - * the test value, false otherwise + * Return value: %true if the library is equal to or greater than + * the test value, %false otherwise * * Since: 0.9.30 **/ @@ -1003,6 +1003,21 @@ parse_one_variation (const char **pp, const char *end, hb_variation_t *variation /** * hb_variation_from_string: + * @str: (array length=len) (element-type uint8_t): a string to parse + * @len: length of @str, or -1 if string is %NULL terminated + * @variation: (out): the #hb_variation_t to initialize with the parsed values + * + * Parses a string into a #hb_variation_t. + * + * The format for specifying variation settings follows. All valid CSS + * font-variation-settings values other than 'normal' and 'inherited' are also + * accepted, though, not documented below. + * + * The format is a tag, optionally followed by an equals sign, followed by a + * number. For example `wght=500`, or `slnt=-7.5`. + * + * Return value: + * %true if @str is successfully parsed, %false otherwise * * Since: 1.4.2 */ @@ -1029,6 +1044,13 @@ hb_variation_from_string (const char *str, int len, /** * hb_variation_to_string: + * @variation: an #hb_variation_t to convert + * @buf: (array length=size) (out): output string + * @size: the allocated size of @buf + * + * Converts an #hb_variation_t into a %NULL-terminated string in the format + * understood by hb_variation_from_string(). The client in responsible for + * allocating big enough size for @buf, 128 bytes is more than enough. 
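The filled-in `hb_variation_from_string()`/`hb_variation_to_string()` comments above give the `wght=500` syntax and the 128-byte buffer guidance; a round-trip sketch (not part of the diff) under exactly those assumptions:

```
#include <hb.h>
#include <stdio.h>

int main (void)
{
  hb_variation_t var;
  char buf[128];  /* "128 bytes is more than enough" per the comment above */

  if (hb_variation_from_string ("wght=500", -1, &var))
  {
    hb_variation_to_string (&var, buf, sizeof (buf));
    /* HB_UNTAG expands to the four bytes of the tag. */
    printf ("tag=%c%c%c%c value=%g -> \"%s\"\n",
            HB_UNTAG (var.tag), var.value, buf);
  }
  return 0;
}
```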
* * Since: 1.4.2 */ @@ -1055,9 +1077,11 @@ hb_variation_to_string (hb_variation_t *variation, /** * hb_color_get_alpha: - * color: a #hb_color_t we are interested in its channels. + * @color: an #hb_color_t we are interested in its channels. * - * Return value: Alpha channel value of the given color + * Fetches the alpha channel of the given @color. + * + * Return value: Alpha channel value * * Since: 2.1.0 */ @@ -1069,9 +1093,11 @@ uint8_t /** * hb_color_get_red: - * color: a #hb_color_t we are interested in its channels. + * @color: an #hb_color_t we are interested in its channels. + * + * Fetches the red channel of the given @color. * - * Return value: Red channel value of the given color + * Return value: Red channel value * * Since: 2.1.0 */ @@ -1083,9 +1109,11 @@ uint8_t /** * hb_color_get_green: - * color: a #hb_color_t we are interested in its channels. + * @color: an #hb_color_t we are interested in its channels. * - * Return value: Green channel value of the given color + * Fetches the green channel of the given @color. + * + * Return value: Green channel value * * Since: 2.1.0 */ @@ -1097,9 +1125,11 @@ uint8_t /** * hb_color_get_blue: - * color: a #hb_color_t we are interested in its channels. + * @color: an #hb_color_t we are interested in its channels. + * + * Fetches the blue channel of the given @color. * - * Return value: Blue channel value of the given color + * Return value: Blue channel value * * Since: 2.1.0 */ diff --git a/thirdparty/harfbuzz/src/hb-common.h b/thirdparty/harfbuzz/src/hb-common.h index efe185cdfd..532fd428cb 100644 --- a/thirdparty/harfbuzz/src/hb-common.h +++ b/thirdparty/harfbuzz/src/hb-common.h @@ -26,7 +26,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -135,7 +135,7 @@ typedef union _hb_var_int_t { /** * hb_tag_t: - * + * * Data type for tag identifiers. Tags are four * byte integers, each byte representing a character. * @@ -148,22 +148,48 @@ typedef uint32_t hb_tag_t; /** * HB_TAG: + * @c1: 1st character of the tag + * @c2: 2nd character of the tag + * @c3: 3rd character of the tag + * @c4: 4th character of the tag * - * Constructs an #hb_tag_t from four characters. + * Constructs an #hb_tag_t from four character literals. * **/ #define HB_TAG(c1,c2,c3,c4) ((hb_tag_t)((((uint32_t)(c1)&0xFF)<<24)|(((uint32_t)(c2)&0xFF)<<16)|(((uint32_t)(c3)&0xFF)<<8)|((uint32_t)(c4)&0xFF))) /** * HB_UNTAG: + * @tag: an #hb_tag_t + * + * Extracts four character literals from an #hb_tag_t. * - * Extracts the characters from an #hb_tag_t. + * Since: 0.6.0 * **/ #define HB_UNTAG(tag) (uint8_t)(((tag)>>24)&0xFF), (uint8_t)(((tag)>>16)&0xFF), (uint8_t)(((tag)>>8)&0xFF), (uint8_t)((tag)&0xFF) +/** + * HB_TAG_NONE: + * + * Unset #hb_tag_t. + */ #define HB_TAG_NONE HB_TAG(0,0,0,0) +/** + * HB_TAG_MAX: + * + * Maximum possible unsigned #hb_tag_t. + * + * Since: 0.9.26 + */ #define HB_TAG_MAX HB_TAG(0xff,0xff,0xff,0xff) +/** + * HB_TAG_MAX_SIGNED: + * + * Maximum possible signed #hb_tag_t. + * + * Since: 0.9.33 + */ #define HB_TAG_MAX_SIGNED HB_TAG(0x7f,0xff,0xff,0xff) /* len=-1 means str is NUL-terminated. */ @@ -263,6 +289,13 @@ hb_direction_to_string (hb_direction_t direction); /* hb_language_t */ +/** + * hb_language_t: + * + * Data type for languages. Each #hb_language_t corresponds to a BCP 47 + * language tag. 
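The newly documented `HB_TAG()`/`HB_UNTAG()` macros and the `hb_language_t` type above are typically used as in the following sketch; the `wght` tag and the `en-us` language are arbitrary sample values.

```
#include <hb.h>

static void
tag_and_language_demo (void)
{
  /* Four character literals packed into one 32-bit tag. */
  hb_tag_t wght = HB_TAG ('w','g','h','t');

  /* BCP 47 language tag, canonicalized by HarfBuzz. */
  hb_language_t lang = hb_language_from_string ("en-us", -1);
  const char *canonical = hb_language_to_string (lang);

  (void) wght; (void) canonical;
}
```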
+ * + */ typedef const struct hb_language_impl_t *hb_language_t; HB_EXTERN hb_language_t @@ -271,6 +304,13 @@ hb_language_from_string (const char *str, int len); HB_EXTERN const char * hb_language_to_string (hb_language_t language); +/** + * HB_LANGUAGE_INVALID: + * + * An unset #hb_language_t. + * + * Since: 0.6.0 + */ #define HB_LANGUAGE_INVALID ((hb_language_t) 0) HB_EXTERN hb_language_t @@ -279,160 +319,164 @@ hb_language_get_default (void); /** * hb_script_t: - * @HB_SCRIPT_COMMON: HB_TAG ('Z','y','y','y') - * @HB_SCRIPT_INHERITED: HB_TAG ('Z','i','n','h') - * @HB_SCRIPT_UNKNOWN: HB_TAG ('Z','z','z','z') - * @HB_SCRIPT_ARABIC - * @HB_SCRIPT_ARMENIAN - * @HB_SCRIPT_BENGALI - * @HB_SCRIPT_CYRILLIC - * @HB_SCRIPT_DEVANAGARI - * @HB_SCRIPT_GEORGIAN - * @HB_SCRIPT_GREEK - * @HB_SCRIPT_GUJARATI - * @HB_SCRIPT_GURMUKHI - * @HB_SCRIPT_HANGUL - * @HB_SCRIPT_HAN - * @HB_SCRIPT_HEBREW - * @HB_SCRIPT_HIRAGANA - * @HB_SCRIPT_KANNADA - * @HB_SCRIPT_KATAKANA - * @HB_SCRIPT_LAO - * @HB_SCRIPT_LATIN - * @HB_SCRIPT_MALAYALAM - * @HB_SCRIPT_ORIYA - * @HB_SCRIPT_TAMIL - * @HB_SCRIPT_TELUGU - * @HB_SCRIPT_THAI - * @HB_SCRIPT_TIBETAN - * @HB_SCRIPT_BOPOMOFO - * @HB_SCRIPT_BRAILLE - * @HB_SCRIPT_CANADIAN_SYLLABICS - * @HB_SCRIPT_CHEROKEE - * @HB_SCRIPT_ETHIOPIC - * @HB_SCRIPT_KHMER - * @HB_SCRIPT_MONGOLIAN - * @HB_SCRIPT_MYANMAR - * @HB_SCRIPT_OGHAM - * @HB_SCRIPT_RUNIC - * @HB_SCRIPT_SINHALA - * @HB_SCRIPT_SYRIAC - * @HB_SCRIPT_THAANA - * @HB_SCRIPT_YI - * @HB_SCRIPT_DESERET - * @HB_SCRIPT_GOTHIC - * @HB_SCRIPT_OLD_ITALIC - * @HB_SCRIPT_BUHID - * @HB_SCRIPT_HANUNOO - * @HB_SCRIPT_TAGALOG - * @HB_SCRIPT_TAGBANWA - * @HB_SCRIPT_CYPRIOT - * @HB_SCRIPT_LIMBU - * @HB_SCRIPT_LINEAR_B - * @HB_SCRIPT_OSMANYA - * @HB_SCRIPT_SHAVIAN - * @HB_SCRIPT_TAI_LE - * @HB_SCRIPT_UGARITIC - * @HB_SCRIPT_BUGINESE - * @HB_SCRIPT_COPTIC - * @HB_SCRIPT_GLAGOLITIC - * @HB_SCRIPT_KHAROSHTHI - * @HB_SCRIPT_NEW_TAI_LUE - * @HB_SCRIPT_OLD_PERSIAN - * @HB_SCRIPT_SYLOTI_NAGRI - * @HB_SCRIPT_TIFINAGH - * @HB_SCRIPT_BALINESE - * @HB_SCRIPT_CUNEIFORM - * @HB_SCRIPT_NKO - * @HB_SCRIPT_PHAGS_PA - * @HB_SCRIPT_PHOENICIAN - * @HB_SCRIPT_CARIAN - * @HB_SCRIPT_CHAM - * @HB_SCRIPT_KAYAH_LI - * @HB_SCRIPT_LEPCHA - * @HB_SCRIPT_LYCIAN - * @HB_SCRIPT_LYDIAN - * @HB_SCRIPT_OL_CHIKI - * @HB_SCRIPT_REJANG - * @HB_SCRIPT_SAURASHTRA - * @HB_SCRIPT_SUNDANESE - * @HB_SCRIPT_VAI - * @HB_SCRIPT_AVESTAN - * @HB_SCRIPT_BAMUM - * @HB_SCRIPT_EGYPTIAN_HIEROGLYPHS - * @HB_SCRIPT_IMPERIAL_ARAMAIC - * @HB_SCRIPT_INSCRIPTIONAL_PAHLAVI - * @HB_SCRIPT_INSCRIPTIONAL_PARTHIAN - * @HB_SCRIPT_JAVANESE - * @HB_SCRIPT_KAITHI - * @HB_SCRIPT_LISU - * @HB_SCRIPT_MEETEI_MAYEK - * @HB_SCRIPT_OLD_SOUTH_ARABIAN - * @HB_SCRIPT_OLD_TURKIC - * @HB_SCRIPT_SAMARITAN - * @HB_SCRIPT_TAI_THAM - * @HB_SCRIPT_TAI_VIET - * @HB_SCRIPT_BATAK - * @HB_SCRIPT_BRAHMI - * @HB_SCRIPT_MANDAIC - * @HB_SCRIPT_CHAKMA - * @HB_SCRIPT_MEROITIC_CURSIVE - * @HB_SCRIPT_MEROITIC_HIEROGLYPHS - * @HB_SCRIPT_MIAO - * @HB_SCRIPT_SHARADA - * @HB_SCRIPT_SORA_SOMPENG - * @HB_SCRIPT_TAKRI - * @HB_SCRIPT_BASSA_VAH - * @HB_SCRIPT_CAUCASIAN_ALBANIAN - * @HB_SCRIPT_DUPLOYAN - * @HB_SCRIPT_ELBASAN - * @HB_SCRIPT_GRANTHA - * @HB_SCRIPT_KHOJKI - * @HB_SCRIPT_KHUDAWADI - * @HB_SCRIPT_LINEAR_A - * @HB_SCRIPT_MAHAJANI - * @HB_SCRIPT_MANICHAEAN - * @HB_SCRIPT_MENDE_KIKAKUI - * @HB_SCRIPT_MODI - * @HB_SCRIPT_MRO - * @HB_SCRIPT_NABATAEAN - * @HB_SCRIPT_OLD_NORTH_ARABIAN - * @HB_SCRIPT_OLD_PERMIC - * @HB_SCRIPT_PAHAWH_HMONG - * @HB_SCRIPT_PALMYRENE - * @HB_SCRIPT_PAU_CIN_HAU - * @HB_SCRIPT_PSALTER_PAHLAVI - * @HB_SCRIPT_SIDDHAM - * 
@HB_SCRIPT_TIRHUTA - * @HB_SCRIPT_WARANG_CITI - * @HB_SCRIPT_AHOM - * @HB_SCRIPT_ANATOLIAN_HIEROGLYPHS - * @HB_SCRIPT_HATRAN - * @HB_SCRIPT_MULTANI - * @HB_SCRIPT_OLD_HUNGARIAN - * @HB_SCRIPT_SIGNWRITING - * @HB_SCRIPT_ADLAM - * @HB_SCRIPT_BHAIKSUKI - * @HB_SCRIPT_MARCHEN - * @HB_SCRIPT_OSAGE - * @HB_SCRIPT_TANGUT - * @HB_SCRIPT_NEWA - * @HB_SCRIPT_MASARAM_GONDI - * @HB_SCRIPT_NUSHU - * @HB_SCRIPT_SOYOMBO - * @HB_SCRIPT_ZANABAZAR_SQUARE - * @HB_SCRIPT_DOGRA - * @HB_SCRIPT_GUNJALA_GONDI - * @HB_SCRIPT_HANIFI_ROHINGYA - * @HB_SCRIPT_MAKASAR - * @HB_SCRIPT_MEDEFAIDRIN - * @HB_SCRIPT_OLD_SOGDIAN - * @HB_SCRIPT_SOGDIAN - * @HB_SCRIPT_ELYMAIC - * @HB_SCRIPT_NANDINAGARI - * @HB_SCRIPT_NYIAKENG_PUACHUE_HMONG - * @HB_SCRIPT_WANCHO - * @HB_SCRIPT_INVALID: #HB_TAG_NONE + * @HB_SCRIPT_COMMON: `Zyyy` + * @HB_SCRIPT_INHERITED: `Zinh` + * @HB_SCRIPT_UNKNOWN: `Zzzz` + * @HB_SCRIPT_ARABIC: `Arab` + * @HB_SCRIPT_ARMENIAN: `Armn` + * @HB_SCRIPT_BENGALI: `Beng` + * @HB_SCRIPT_CYRILLIC: `Cyrl` + * @HB_SCRIPT_DEVANAGARI: `Deva` + * @HB_SCRIPT_GEORGIAN: `Geor` + * @HB_SCRIPT_GREEK: `Grek` + * @HB_SCRIPT_GUJARATI: `Gujr` + * @HB_SCRIPT_GURMUKHI: `Guru` + * @HB_SCRIPT_HANGUL: `Hang` + * @HB_SCRIPT_HAN: `Hani` + * @HB_SCRIPT_HEBREW: `Hebr` + * @HB_SCRIPT_HIRAGANA: `Hira` + * @HB_SCRIPT_KANNADA: `Knda` + * @HB_SCRIPT_KATAKANA: `Kana` + * @HB_SCRIPT_LAO: `Laoo` + * @HB_SCRIPT_LATIN: `Latn` + * @HB_SCRIPT_MALAYALAM: `Mlym` + * @HB_SCRIPT_ORIYA: `Orya` + * @HB_SCRIPT_TAMIL: `Taml` + * @HB_SCRIPT_TELUGU: `Telu` + * @HB_SCRIPT_THAI: `Thai` + * @HB_SCRIPT_TIBETAN: `Tibt` + * @HB_SCRIPT_BOPOMOFO: `Bopo` + * @HB_SCRIPT_BRAILLE: `Brai` + * @HB_SCRIPT_CANADIAN_SYLLABICS: `Cans` + * @HB_SCRIPT_CHEROKEE: `Cher` + * @HB_SCRIPT_ETHIOPIC: `Ethi` + * @HB_SCRIPT_KHMER: `Khmr` + * @HB_SCRIPT_MONGOLIAN: `Mong` + * @HB_SCRIPT_MYANMAR: `Mymr` + * @HB_SCRIPT_OGHAM: `Ogam` + * @HB_SCRIPT_RUNIC: `Runr` + * @HB_SCRIPT_SINHALA: `Sinh` + * @HB_SCRIPT_SYRIAC: `Syrc` + * @HB_SCRIPT_THAANA: `Thaa` + * @HB_SCRIPT_YI: `Yiii` + * @HB_SCRIPT_DESERET: `Dsrt` + * @HB_SCRIPT_GOTHIC: `Goth` + * @HB_SCRIPT_OLD_ITALIC: `Ital` + * @HB_SCRIPT_BUHID: `Buhd` + * @HB_SCRIPT_HANUNOO: `Hano` + * @HB_SCRIPT_TAGALOG: `Tglg` + * @HB_SCRIPT_TAGBANWA: `Tagb` + * @HB_SCRIPT_CYPRIOT: `Cprt` + * @HB_SCRIPT_LIMBU: `Limb` + * @HB_SCRIPT_LINEAR_B: `Linb` + * @HB_SCRIPT_OSMANYA: `Osma` + * @HB_SCRIPT_SHAVIAN: `Shaw` + * @HB_SCRIPT_TAI_LE: `Tale` + * @HB_SCRIPT_UGARITIC: `Ugar` + * @HB_SCRIPT_BUGINESE: `Bugi` + * @HB_SCRIPT_COPTIC: `Copt` + * @HB_SCRIPT_GLAGOLITIC: `Glag` + * @HB_SCRIPT_KHAROSHTHI: `Khar` + * @HB_SCRIPT_NEW_TAI_LUE: `Talu` + * @HB_SCRIPT_OLD_PERSIAN: `Xpeo` + * @HB_SCRIPT_SYLOTI_NAGRI: `Sylo` + * @HB_SCRIPT_TIFINAGH: `Tfng` + * @HB_SCRIPT_BALINESE: `Bali` + * @HB_SCRIPT_CUNEIFORM: `Xsux` + * @HB_SCRIPT_NKO: `Nkoo` + * @HB_SCRIPT_PHAGS_PA: `Phag` + * @HB_SCRIPT_PHOENICIAN: `Phnx` + * @HB_SCRIPT_CARIAN: `Cari` + * @HB_SCRIPT_CHAM: `Cham` + * @HB_SCRIPT_KAYAH_LI: `Kali` + * @HB_SCRIPT_LEPCHA: `Lepc` + * @HB_SCRIPT_LYCIAN: `Lyci` + * @HB_SCRIPT_LYDIAN: `Lydi` + * @HB_SCRIPT_OL_CHIKI: `Olck` + * @HB_SCRIPT_REJANG: `Rjng` + * @HB_SCRIPT_SAURASHTRA: `Saur` + * @HB_SCRIPT_SUNDANESE: `Sund` + * @HB_SCRIPT_VAI: `Vaii` + * @HB_SCRIPT_AVESTAN: `Avst` + * @HB_SCRIPT_BAMUM: `Bamu` + * @HB_SCRIPT_EGYPTIAN_HIEROGLYPHS: `Egyp` + * @HB_SCRIPT_IMPERIAL_ARAMAIC: `Armi` + * @HB_SCRIPT_INSCRIPTIONAL_PAHLAVI: `Phli` + * @HB_SCRIPT_INSCRIPTIONAL_PARTHIAN: `Prti` + * @HB_SCRIPT_JAVANESE: `Java` + * @HB_SCRIPT_KAITHI: `Kthi` + * @HB_SCRIPT_LISU: `Lisu` + * @HB_SCRIPT_MEETEI_MAYEK: 
`Mtei` + * @HB_SCRIPT_OLD_SOUTH_ARABIAN: `Sarb` + * @HB_SCRIPT_OLD_TURKIC: `Orkh` + * @HB_SCRIPT_SAMARITAN: `Samr` + * @HB_SCRIPT_TAI_THAM: `Lana` + * @HB_SCRIPT_TAI_VIET: `Tavt` + * @HB_SCRIPT_BATAK: `Batk` + * @HB_SCRIPT_BRAHMI: `Brah` + * @HB_SCRIPT_MANDAIC: `Mand` + * @HB_SCRIPT_CHAKMA: `Cakm` + * @HB_SCRIPT_MEROITIC_CURSIVE: `Merc` + * @HB_SCRIPT_MEROITIC_HIEROGLYPHS: `Mero` + * @HB_SCRIPT_MIAO: `Plrd` + * @HB_SCRIPT_SHARADA: `Shrd` + * @HB_SCRIPT_SORA_SOMPENG: `Sora` + * @HB_SCRIPT_TAKRI: `Takr` + * @HB_SCRIPT_BASSA_VAH: `Bass`, Since: 0.9.30 + * @HB_SCRIPT_CAUCASIAN_ALBANIAN: `Aghb`, Since: 0.9.30 + * @HB_SCRIPT_DUPLOYAN: `Dupl`, Since: 0.9.30 + * @HB_SCRIPT_ELBASAN: `Elba`, Since: 0.9.30 + * @HB_SCRIPT_GRANTHA: `Gran`, Since: 0.9.30 + * @HB_SCRIPT_KHOJKI: `Khoj`, Since: 0.9.30 + * @HB_SCRIPT_KHUDAWADI: `Sind`, Since: 0.9.30 + * @HB_SCRIPT_LINEAR_A: `Lina`, Since: 0.9.30 + * @HB_SCRIPT_MAHAJANI: `Mahj`, Since: 0.9.30 + * @HB_SCRIPT_MANICHAEAN: `Mani`, Since: 0.9.30 + * @HB_SCRIPT_MENDE_KIKAKUI: `Mend`, Since: 0.9.30 + * @HB_SCRIPT_MODI: `Modi`, Since: 0.9.30 + * @HB_SCRIPT_MRO: `Mroo`, Since: 0.9.30 + * @HB_SCRIPT_NABATAEAN: `Nbat`, Since: 0.9.30 + * @HB_SCRIPT_OLD_NORTH_ARABIAN: `Narb`, Since: 0.9.30 + * @HB_SCRIPT_OLD_PERMIC: `Perm`, Since: 0.9.30 + * @HB_SCRIPT_PAHAWH_HMONG: `Hmng`, Since: 0.9.30 + * @HB_SCRIPT_PALMYRENE: `Palm`, Since: 0.9.30 + * @HB_SCRIPT_PAU_CIN_HAU: `Pauc`, Since: 0.9.30 + * @HB_SCRIPT_PSALTER_PAHLAVI: `Phlp`, Since: 0.9.30 + * @HB_SCRIPT_SIDDHAM: `Sidd`, Since: 0.9.30 + * @HB_SCRIPT_TIRHUTA: `Tirh`, Since: 0.9.30 + * @HB_SCRIPT_WARANG_CITI: `Wara`, Since: 0.9.30 + * @HB_SCRIPT_AHOM: `Ahom`, Since: 0.9.30 + * @HB_SCRIPT_ANATOLIAN_HIEROGLYPHS: `Hluw`, Since: 0.9.30 + * @HB_SCRIPT_HATRAN: `Hatr`, Since: 0.9.30 + * @HB_SCRIPT_MULTANI: `Mult`, Since: 0.9.30 + * @HB_SCRIPT_OLD_HUNGARIAN: `Hung`, Since: 0.9.30 + * @HB_SCRIPT_SIGNWRITING: `Sgnw`, Since: 0.9.30 + * @HB_SCRIPT_ADLAM: `Adlm`, Since: 1.3.0 + * @HB_SCRIPT_BHAIKSUKI: `Bhks`, Since: 1.3.0 + * @HB_SCRIPT_MARCHEN: `Marc`, Since: 1.3.0 + * @HB_SCRIPT_OSAGE: `Osge`, Since: 1.3.0 + * @HB_SCRIPT_TANGUT: `Tang`, Since: 1.3.0 + * @HB_SCRIPT_NEWA: `Newa`, Since: 1.3.0 + * @HB_SCRIPT_MASARAM_GONDI: `Gonm`, Since: 1.6.0 + * @HB_SCRIPT_NUSHU: `Nshu`, Since: 1.6.0 + * @HB_SCRIPT_SOYOMBO: `Soyo`, Since: 1.6.0 + * @HB_SCRIPT_ZANABAZAR_SQUARE: `Zanb`, Since: 1.6.0 + * @HB_SCRIPT_DOGRA: `Dogr`, Since: 1.8.0 + * @HB_SCRIPT_GUNJALA_GONDI: `Gong`, Since: 1.8.0 + * @HB_SCRIPT_HANIFI_ROHINGYA: `Rohg`, Since: 1.8.0 + * @HB_SCRIPT_MAKASAR: `Maka`, Since: 1.8.0 + * @HB_SCRIPT_MEDEFAIDRIN: `Medf`, Since: 1.8.0 + * @HB_SCRIPT_OLD_SOGDIAN: `Sogo`, Since: 1.8.0 + * @HB_SCRIPT_SOGDIAN: `Sogd`, Since: 1.8.0 + * @HB_SCRIPT_ELYMAIC: `Elym`, Since: 2.4.0 + * @HB_SCRIPT_NANDINAGARI: `Nand`, Since: 2.4.0 + * @HB_SCRIPT_NYIAKENG_PUACHUE_HMONG: `Hmnp`, Since: 2.4.0 + * @HB_SCRIPT_WANCHO: `Wcho`, Since: 2.4.0 + * @HB_SCRIPT_CHORASMIAN: `Chrs`, Since: 2.6.7 + * @HB_SCRIPT_DIVES_AKURU: `Diak`, Since: 2.6.7 + * @HB_SCRIPT_KHITAN_SMALL_SCRIPT: `Kits`, Since: 2.6.7 + * @HB_SCRIPT_YEZIDI: `Yezi`, Since: 2.6.7 + * @HB_SCRIPT_INVALID: No script set * * Data type for scripts. Each #hb_script_t's value is an #hb_tag_t corresponding * to the four-letter values defined by [ISO 15924](https://unicode.org/iso15924/). 
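With the per-value ISO 15924 codes now listed above, mapping a four-letter code to an `hb_script_t` and its natural direction is a one-liner each. A sketch, not part of the diff, where the `"Arab"`/`"Latn"` examples are assumptions:

```
#include <hb.h>

/* Maps an ISO 15924 code such as "Arab" or "Latn" to the natural
 * horizontal direction of that script. */
static hb_direction_t
direction_for_iso15924 (const char *code)
{
  hb_script_t script = hb_script_from_string (code, -1);
  return hb_script_get_horizontal_direction (script);
}
```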
@@ -441,208 +485,208 @@ hb_language_get_default (void); * **/ -/* https://unicode.org/iso15924/ */ /* https://docs.google.com/spreadsheets/d/1Y90M0Ie3MUJ6UVCRDOypOtijlMDLNNyyLk36T6iMu0o */ -/* Unicode Character Database property: Script (sc) */ typedef enum { - /*1.1*/ HB_SCRIPT_COMMON = HB_TAG ('Z','y','y','y'), - /*1.1*/ HB_SCRIPT_INHERITED = HB_TAG ('Z','i','n','h'), - /*5.0*/ HB_SCRIPT_UNKNOWN = HB_TAG ('Z','z','z','z'), - - /*1.1*/ HB_SCRIPT_ARABIC = HB_TAG ('A','r','a','b'), - /*1.1*/ HB_SCRIPT_ARMENIAN = HB_TAG ('A','r','m','n'), - /*1.1*/ HB_SCRIPT_BENGALI = HB_TAG ('B','e','n','g'), - /*1.1*/ HB_SCRIPT_CYRILLIC = HB_TAG ('C','y','r','l'), - /*1.1*/ HB_SCRIPT_DEVANAGARI = HB_TAG ('D','e','v','a'), - /*1.1*/ HB_SCRIPT_GEORGIAN = HB_TAG ('G','e','o','r'), - /*1.1*/ HB_SCRIPT_GREEK = HB_TAG ('G','r','e','k'), - /*1.1*/ HB_SCRIPT_GUJARATI = HB_TAG ('G','u','j','r'), - /*1.1*/ HB_SCRIPT_GURMUKHI = HB_TAG ('G','u','r','u'), - /*1.1*/ HB_SCRIPT_HANGUL = HB_TAG ('H','a','n','g'), - /*1.1*/ HB_SCRIPT_HAN = HB_TAG ('H','a','n','i'), - /*1.1*/ HB_SCRIPT_HEBREW = HB_TAG ('H','e','b','r'), - /*1.1*/ HB_SCRIPT_HIRAGANA = HB_TAG ('H','i','r','a'), - /*1.1*/ HB_SCRIPT_KANNADA = HB_TAG ('K','n','d','a'), - /*1.1*/ HB_SCRIPT_KATAKANA = HB_TAG ('K','a','n','a'), - /*1.1*/ HB_SCRIPT_LAO = HB_TAG ('L','a','o','o'), - /*1.1*/ HB_SCRIPT_LATIN = HB_TAG ('L','a','t','n'), - /*1.1*/ HB_SCRIPT_MALAYALAM = HB_TAG ('M','l','y','m'), - /*1.1*/ HB_SCRIPT_ORIYA = HB_TAG ('O','r','y','a'), - /*1.1*/ HB_SCRIPT_TAMIL = HB_TAG ('T','a','m','l'), - /*1.1*/ HB_SCRIPT_TELUGU = HB_TAG ('T','e','l','u'), - /*1.1*/ HB_SCRIPT_THAI = HB_TAG ('T','h','a','i'), - - /*2.0*/ HB_SCRIPT_TIBETAN = HB_TAG ('T','i','b','t'), - - /*3.0*/ HB_SCRIPT_BOPOMOFO = HB_TAG ('B','o','p','o'), - /*3.0*/ HB_SCRIPT_BRAILLE = HB_TAG ('B','r','a','i'), - /*3.0*/ HB_SCRIPT_CANADIAN_SYLLABICS = HB_TAG ('C','a','n','s'), - /*3.0*/ HB_SCRIPT_CHEROKEE = HB_TAG ('C','h','e','r'), - /*3.0*/ HB_SCRIPT_ETHIOPIC = HB_TAG ('E','t','h','i'), - /*3.0*/ HB_SCRIPT_KHMER = HB_TAG ('K','h','m','r'), - /*3.0*/ HB_SCRIPT_MONGOLIAN = HB_TAG ('M','o','n','g'), - /*3.0*/ HB_SCRIPT_MYANMAR = HB_TAG ('M','y','m','r'), - /*3.0*/ HB_SCRIPT_OGHAM = HB_TAG ('O','g','a','m'), - /*3.0*/ HB_SCRIPT_RUNIC = HB_TAG ('R','u','n','r'), - /*3.0*/ HB_SCRIPT_SINHALA = HB_TAG ('S','i','n','h'), - /*3.0*/ HB_SCRIPT_SYRIAC = HB_TAG ('S','y','r','c'), - /*3.0*/ HB_SCRIPT_THAANA = HB_TAG ('T','h','a','a'), - /*3.0*/ HB_SCRIPT_YI = HB_TAG ('Y','i','i','i'), - - /*3.1*/ HB_SCRIPT_DESERET = HB_TAG ('D','s','r','t'), - /*3.1*/ HB_SCRIPT_GOTHIC = HB_TAG ('G','o','t','h'), - /*3.1*/ HB_SCRIPT_OLD_ITALIC = HB_TAG ('I','t','a','l'), - - /*3.2*/ HB_SCRIPT_BUHID = HB_TAG ('B','u','h','d'), - /*3.2*/ HB_SCRIPT_HANUNOO = HB_TAG ('H','a','n','o'), - /*3.2*/ HB_SCRIPT_TAGALOG = HB_TAG ('T','g','l','g'), - /*3.2*/ HB_SCRIPT_TAGBANWA = HB_TAG ('T','a','g','b'), - - /*4.0*/ HB_SCRIPT_CYPRIOT = HB_TAG ('C','p','r','t'), - /*4.0*/ HB_SCRIPT_LIMBU = HB_TAG ('L','i','m','b'), - /*4.0*/ HB_SCRIPT_LINEAR_B = HB_TAG ('L','i','n','b'), - /*4.0*/ HB_SCRIPT_OSMANYA = HB_TAG ('O','s','m','a'), - /*4.0*/ HB_SCRIPT_SHAVIAN = HB_TAG ('S','h','a','w'), - /*4.0*/ HB_SCRIPT_TAI_LE = HB_TAG ('T','a','l','e'), - /*4.0*/ HB_SCRIPT_UGARITIC = HB_TAG ('U','g','a','r'), - - /*4.1*/ HB_SCRIPT_BUGINESE = HB_TAG ('B','u','g','i'), - /*4.1*/ HB_SCRIPT_COPTIC = HB_TAG ('C','o','p','t'), - /*4.1*/ HB_SCRIPT_GLAGOLITIC = HB_TAG ('G','l','a','g'), - /*4.1*/ HB_SCRIPT_KHAROSHTHI = HB_TAG ('K','h','a','r'), - /*4.1*/ HB_SCRIPT_NEW_TAI_LUE = 
HB_TAG ('T','a','l','u'), - /*4.1*/ HB_SCRIPT_OLD_PERSIAN = HB_TAG ('X','p','e','o'), - /*4.1*/ HB_SCRIPT_SYLOTI_NAGRI = HB_TAG ('S','y','l','o'), - /*4.1*/ HB_SCRIPT_TIFINAGH = HB_TAG ('T','f','n','g'), - - /*5.0*/ HB_SCRIPT_BALINESE = HB_TAG ('B','a','l','i'), - /*5.0*/ HB_SCRIPT_CUNEIFORM = HB_TAG ('X','s','u','x'), - /*5.0*/ HB_SCRIPT_NKO = HB_TAG ('N','k','o','o'), - /*5.0*/ HB_SCRIPT_PHAGS_PA = HB_TAG ('P','h','a','g'), - /*5.0*/ HB_SCRIPT_PHOENICIAN = HB_TAG ('P','h','n','x'), - - /*5.1*/ HB_SCRIPT_CARIAN = HB_TAG ('C','a','r','i'), - /*5.1*/ HB_SCRIPT_CHAM = HB_TAG ('C','h','a','m'), - /*5.1*/ HB_SCRIPT_KAYAH_LI = HB_TAG ('K','a','l','i'), - /*5.1*/ HB_SCRIPT_LEPCHA = HB_TAG ('L','e','p','c'), - /*5.1*/ HB_SCRIPT_LYCIAN = HB_TAG ('L','y','c','i'), - /*5.1*/ HB_SCRIPT_LYDIAN = HB_TAG ('L','y','d','i'), - /*5.1*/ HB_SCRIPT_OL_CHIKI = HB_TAG ('O','l','c','k'), - /*5.1*/ HB_SCRIPT_REJANG = HB_TAG ('R','j','n','g'), - /*5.1*/ HB_SCRIPT_SAURASHTRA = HB_TAG ('S','a','u','r'), - /*5.1*/ HB_SCRIPT_SUNDANESE = HB_TAG ('S','u','n','d'), - /*5.1*/ HB_SCRIPT_VAI = HB_TAG ('V','a','i','i'), - - /*5.2*/ HB_SCRIPT_AVESTAN = HB_TAG ('A','v','s','t'), - /*5.2*/ HB_SCRIPT_BAMUM = HB_TAG ('B','a','m','u'), - /*5.2*/ HB_SCRIPT_EGYPTIAN_HIEROGLYPHS = HB_TAG ('E','g','y','p'), - /*5.2*/ HB_SCRIPT_IMPERIAL_ARAMAIC = HB_TAG ('A','r','m','i'), - /*5.2*/ HB_SCRIPT_INSCRIPTIONAL_PAHLAVI = HB_TAG ('P','h','l','i'), - /*5.2*/ HB_SCRIPT_INSCRIPTIONAL_PARTHIAN = HB_TAG ('P','r','t','i'), - /*5.2*/ HB_SCRIPT_JAVANESE = HB_TAG ('J','a','v','a'), - /*5.2*/ HB_SCRIPT_KAITHI = HB_TAG ('K','t','h','i'), - /*5.2*/ HB_SCRIPT_LISU = HB_TAG ('L','i','s','u'), - /*5.2*/ HB_SCRIPT_MEETEI_MAYEK = HB_TAG ('M','t','e','i'), - /*5.2*/ HB_SCRIPT_OLD_SOUTH_ARABIAN = HB_TAG ('S','a','r','b'), - /*5.2*/ HB_SCRIPT_OLD_TURKIC = HB_TAG ('O','r','k','h'), - /*5.2*/ HB_SCRIPT_SAMARITAN = HB_TAG ('S','a','m','r'), - /*5.2*/ HB_SCRIPT_TAI_THAM = HB_TAG ('L','a','n','a'), - /*5.2*/ HB_SCRIPT_TAI_VIET = HB_TAG ('T','a','v','t'), - - /*6.0*/ HB_SCRIPT_BATAK = HB_TAG ('B','a','t','k'), - /*6.0*/ HB_SCRIPT_BRAHMI = HB_TAG ('B','r','a','h'), - /*6.0*/ HB_SCRIPT_MANDAIC = HB_TAG ('M','a','n','d'), - - /*6.1*/ HB_SCRIPT_CHAKMA = HB_TAG ('C','a','k','m'), - /*6.1*/ HB_SCRIPT_MEROITIC_CURSIVE = HB_TAG ('M','e','r','c'), - /*6.1*/ HB_SCRIPT_MEROITIC_HIEROGLYPHS = HB_TAG ('M','e','r','o'), - /*6.1*/ HB_SCRIPT_MIAO = HB_TAG ('P','l','r','d'), - /*6.1*/ HB_SCRIPT_SHARADA = HB_TAG ('S','h','r','d'), - /*6.1*/ HB_SCRIPT_SORA_SOMPENG = HB_TAG ('S','o','r','a'), - /*6.1*/ HB_SCRIPT_TAKRI = HB_TAG ('T','a','k','r'), + HB_SCRIPT_COMMON = HB_TAG ('Z','y','y','y'), /*1.1*/ + HB_SCRIPT_INHERITED = HB_TAG ('Z','i','n','h'), /*1.1*/ + HB_SCRIPT_UNKNOWN = HB_TAG ('Z','z','z','z'), /*5.0*/ + + HB_SCRIPT_ARABIC = HB_TAG ('A','r','a','b'), /*1.1*/ + HB_SCRIPT_ARMENIAN = HB_TAG ('A','r','m','n'), /*1.1*/ + HB_SCRIPT_BENGALI = HB_TAG ('B','e','n','g'), /*1.1*/ + HB_SCRIPT_CYRILLIC = HB_TAG ('C','y','r','l'), /*1.1*/ + HB_SCRIPT_DEVANAGARI = HB_TAG ('D','e','v','a'), /*1.1*/ + HB_SCRIPT_GEORGIAN = HB_TAG ('G','e','o','r'), /*1.1*/ + HB_SCRIPT_GREEK = HB_TAG ('G','r','e','k'), /*1.1*/ + HB_SCRIPT_GUJARATI = HB_TAG ('G','u','j','r'), /*1.1*/ + HB_SCRIPT_GURMUKHI = HB_TAG ('G','u','r','u'), /*1.1*/ + HB_SCRIPT_HANGUL = HB_TAG ('H','a','n','g'), /*1.1*/ + HB_SCRIPT_HAN = HB_TAG ('H','a','n','i'), /*1.1*/ + HB_SCRIPT_HEBREW = HB_TAG ('H','e','b','r'), /*1.1*/ + HB_SCRIPT_HIRAGANA = HB_TAG ('H','i','r','a'), /*1.1*/ + HB_SCRIPT_KANNADA = HB_TAG ('K','n','d','a'), /*1.1*/ + 
HB_SCRIPT_KATAKANA = HB_TAG ('K','a','n','a'), /*1.1*/ + HB_SCRIPT_LAO = HB_TAG ('L','a','o','o'), /*1.1*/ + HB_SCRIPT_LATIN = HB_TAG ('L','a','t','n'), /*1.1*/ + HB_SCRIPT_MALAYALAM = HB_TAG ('M','l','y','m'), /*1.1*/ + HB_SCRIPT_ORIYA = HB_TAG ('O','r','y','a'), /*1.1*/ + HB_SCRIPT_TAMIL = HB_TAG ('T','a','m','l'), /*1.1*/ + HB_SCRIPT_TELUGU = HB_TAG ('T','e','l','u'), /*1.1*/ + HB_SCRIPT_THAI = HB_TAG ('T','h','a','i'), /*1.1*/ + + HB_SCRIPT_TIBETAN = HB_TAG ('T','i','b','t'), /*2.0*/ + + HB_SCRIPT_BOPOMOFO = HB_TAG ('B','o','p','o'), /*3.0*/ + HB_SCRIPT_BRAILLE = HB_TAG ('B','r','a','i'), /*3.0*/ + HB_SCRIPT_CANADIAN_SYLLABICS = HB_TAG ('C','a','n','s'), /*3.0*/ + HB_SCRIPT_CHEROKEE = HB_TAG ('C','h','e','r'), /*3.0*/ + HB_SCRIPT_ETHIOPIC = HB_TAG ('E','t','h','i'), /*3.0*/ + HB_SCRIPT_KHMER = HB_TAG ('K','h','m','r'), /*3.0*/ + HB_SCRIPT_MONGOLIAN = HB_TAG ('M','o','n','g'), /*3.0*/ + HB_SCRIPT_MYANMAR = HB_TAG ('M','y','m','r'), /*3.0*/ + HB_SCRIPT_OGHAM = HB_TAG ('O','g','a','m'), /*3.0*/ + HB_SCRIPT_RUNIC = HB_TAG ('R','u','n','r'), /*3.0*/ + HB_SCRIPT_SINHALA = HB_TAG ('S','i','n','h'), /*3.0*/ + HB_SCRIPT_SYRIAC = HB_TAG ('S','y','r','c'), /*3.0*/ + HB_SCRIPT_THAANA = HB_TAG ('T','h','a','a'), /*3.0*/ + HB_SCRIPT_YI = HB_TAG ('Y','i','i','i'), /*3.0*/ + + HB_SCRIPT_DESERET = HB_TAG ('D','s','r','t'), /*3.1*/ + HB_SCRIPT_GOTHIC = HB_TAG ('G','o','t','h'), /*3.1*/ + HB_SCRIPT_OLD_ITALIC = HB_TAG ('I','t','a','l'), /*3.1*/ + + HB_SCRIPT_BUHID = HB_TAG ('B','u','h','d'), /*3.2*/ + HB_SCRIPT_HANUNOO = HB_TAG ('H','a','n','o'), /*3.2*/ + HB_SCRIPT_TAGALOG = HB_TAG ('T','g','l','g'), /*3.2*/ + HB_SCRIPT_TAGBANWA = HB_TAG ('T','a','g','b'), /*3.2*/ + + HB_SCRIPT_CYPRIOT = HB_TAG ('C','p','r','t'), /*4.0*/ + HB_SCRIPT_LIMBU = HB_TAG ('L','i','m','b'), /*4.0*/ + HB_SCRIPT_LINEAR_B = HB_TAG ('L','i','n','b'), /*4.0*/ + HB_SCRIPT_OSMANYA = HB_TAG ('O','s','m','a'), /*4.0*/ + HB_SCRIPT_SHAVIAN = HB_TAG ('S','h','a','w'), /*4.0*/ + HB_SCRIPT_TAI_LE = HB_TAG ('T','a','l','e'), /*4.0*/ + HB_SCRIPT_UGARITIC = HB_TAG ('U','g','a','r'), /*4.0*/ + + HB_SCRIPT_BUGINESE = HB_TAG ('B','u','g','i'), /*4.1*/ + HB_SCRIPT_COPTIC = HB_TAG ('C','o','p','t'), /*4.1*/ + HB_SCRIPT_GLAGOLITIC = HB_TAG ('G','l','a','g'), /*4.1*/ + HB_SCRIPT_KHAROSHTHI = HB_TAG ('K','h','a','r'), /*4.1*/ + HB_SCRIPT_NEW_TAI_LUE = HB_TAG ('T','a','l','u'), /*4.1*/ + HB_SCRIPT_OLD_PERSIAN = HB_TAG ('X','p','e','o'), /*4.1*/ + HB_SCRIPT_SYLOTI_NAGRI = HB_TAG ('S','y','l','o'), /*4.1*/ + HB_SCRIPT_TIFINAGH = HB_TAG ('T','f','n','g'), /*4.1*/ + + HB_SCRIPT_BALINESE = HB_TAG ('B','a','l','i'), /*5.0*/ + HB_SCRIPT_CUNEIFORM = HB_TAG ('X','s','u','x'), /*5.0*/ + HB_SCRIPT_NKO = HB_TAG ('N','k','o','o'), /*5.0*/ + HB_SCRIPT_PHAGS_PA = HB_TAG ('P','h','a','g'), /*5.0*/ + HB_SCRIPT_PHOENICIAN = HB_TAG ('P','h','n','x'), /*5.0*/ + + HB_SCRIPT_CARIAN = HB_TAG ('C','a','r','i'), /*5.1*/ + HB_SCRIPT_CHAM = HB_TAG ('C','h','a','m'), /*5.1*/ + HB_SCRIPT_KAYAH_LI = HB_TAG ('K','a','l','i'), /*5.1*/ + HB_SCRIPT_LEPCHA = HB_TAG ('L','e','p','c'), /*5.1*/ + HB_SCRIPT_LYCIAN = HB_TAG ('L','y','c','i'), /*5.1*/ + HB_SCRIPT_LYDIAN = HB_TAG ('L','y','d','i'), /*5.1*/ + HB_SCRIPT_OL_CHIKI = HB_TAG ('O','l','c','k'), /*5.1*/ + HB_SCRIPT_REJANG = HB_TAG ('R','j','n','g'), /*5.1*/ + HB_SCRIPT_SAURASHTRA = HB_TAG ('S','a','u','r'), /*5.1*/ + HB_SCRIPT_SUNDANESE = HB_TAG ('S','u','n','d'), /*5.1*/ + HB_SCRIPT_VAI = HB_TAG ('V','a','i','i'), /*5.1*/ + + HB_SCRIPT_AVESTAN = HB_TAG ('A','v','s','t'), /*5.2*/ + HB_SCRIPT_BAMUM = HB_TAG ('B','a','m','u'), /*5.2*/ + 
HB_SCRIPT_EGYPTIAN_HIEROGLYPHS = HB_TAG ('E','g','y','p'), /*5.2*/ + HB_SCRIPT_IMPERIAL_ARAMAIC = HB_TAG ('A','r','m','i'), /*5.2*/ + HB_SCRIPT_INSCRIPTIONAL_PAHLAVI = HB_TAG ('P','h','l','i'), /*5.2*/ + HB_SCRIPT_INSCRIPTIONAL_PARTHIAN = HB_TAG ('P','r','t','i'), /*5.2*/ + HB_SCRIPT_JAVANESE = HB_TAG ('J','a','v','a'), /*5.2*/ + HB_SCRIPT_KAITHI = HB_TAG ('K','t','h','i'), /*5.2*/ + HB_SCRIPT_LISU = HB_TAG ('L','i','s','u'), /*5.2*/ + HB_SCRIPT_MEETEI_MAYEK = HB_TAG ('M','t','e','i'), /*5.2*/ + HB_SCRIPT_OLD_SOUTH_ARABIAN = HB_TAG ('S','a','r','b'), /*5.2*/ + HB_SCRIPT_OLD_TURKIC = HB_TAG ('O','r','k','h'), /*5.2*/ + HB_SCRIPT_SAMARITAN = HB_TAG ('S','a','m','r'), /*5.2*/ + HB_SCRIPT_TAI_THAM = HB_TAG ('L','a','n','a'), /*5.2*/ + HB_SCRIPT_TAI_VIET = HB_TAG ('T','a','v','t'), /*5.2*/ + + HB_SCRIPT_BATAK = HB_TAG ('B','a','t','k'), /*6.0*/ + HB_SCRIPT_BRAHMI = HB_TAG ('B','r','a','h'), /*6.0*/ + HB_SCRIPT_MANDAIC = HB_TAG ('M','a','n','d'), /*6.0*/ + + HB_SCRIPT_CHAKMA = HB_TAG ('C','a','k','m'), /*6.1*/ + HB_SCRIPT_MEROITIC_CURSIVE = HB_TAG ('M','e','r','c'), /*6.1*/ + HB_SCRIPT_MEROITIC_HIEROGLYPHS = HB_TAG ('M','e','r','o'), /*6.1*/ + HB_SCRIPT_MIAO = HB_TAG ('P','l','r','d'), /*6.1*/ + HB_SCRIPT_SHARADA = HB_TAG ('S','h','r','d'), /*6.1*/ + HB_SCRIPT_SORA_SOMPENG = HB_TAG ('S','o','r','a'), /*6.1*/ + HB_SCRIPT_TAKRI = HB_TAG ('T','a','k','r'), /*6.1*/ /* * Since: 0.9.30 */ - /*7.0*/ HB_SCRIPT_BASSA_VAH = HB_TAG ('B','a','s','s'), - /*7.0*/ HB_SCRIPT_CAUCASIAN_ALBANIAN = HB_TAG ('A','g','h','b'), - /*7.0*/ HB_SCRIPT_DUPLOYAN = HB_TAG ('D','u','p','l'), - /*7.0*/ HB_SCRIPT_ELBASAN = HB_TAG ('E','l','b','a'), - /*7.0*/ HB_SCRIPT_GRANTHA = HB_TAG ('G','r','a','n'), - /*7.0*/ HB_SCRIPT_KHOJKI = HB_TAG ('K','h','o','j'), - /*7.0*/ HB_SCRIPT_KHUDAWADI = HB_TAG ('S','i','n','d'), - /*7.0*/ HB_SCRIPT_LINEAR_A = HB_TAG ('L','i','n','a'), - /*7.0*/ HB_SCRIPT_MAHAJANI = HB_TAG ('M','a','h','j'), - /*7.0*/ HB_SCRIPT_MANICHAEAN = HB_TAG ('M','a','n','i'), - /*7.0*/ HB_SCRIPT_MENDE_KIKAKUI = HB_TAG ('M','e','n','d'), - /*7.0*/ HB_SCRIPT_MODI = HB_TAG ('M','o','d','i'), - /*7.0*/ HB_SCRIPT_MRO = HB_TAG ('M','r','o','o'), - /*7.0*/ HB_SCRIPT_NABATAEAN = HB_TAG ('N','b','a','t'), - /*7.0*/ HB_SCRIPT_OLD_NORTH_ARABIAN = HB_TAG ('N','a','r','b'), - /*7.0*/ HB_SCRIPT_OLD_PERMIC = HB_TAG ('P','e','r','m'), - /*7.0*/ HB_SCRIPT_PAHAWH_HMONG = HB_TAG ('H','m','n','g'), - /*7.0*/ HB_SCRIPT_PALMYRENE = HB_TAG ('P','a','l','m'), - /*7.0*/ HB_SCRIPT_PAU_CIN_HAU = HB_TAG ('P','a','u','c'), - /*7.0*/ HB_SCRIPT_PSALTER_PAHLAVI = HB_TAG ('P','h','l','p'), - /*7.0*/ HB_SCRIPT_SIDDHAM = HB_TAG ('S','i','d','d'), - /*7.0*/ HB_SCRIPT_TIRHUTA = HB_TAG ('T','i','r','h'), - /*7.0*/ HB_SCRIPT_WARANG_CITI = HB_TAG ('W','a','r','a'), - - /*8.0*/ HB_SCRIPT_AHOM = HB_TAG ('A','h','o','m'), - /*8.0*/ HB_SCRIPT_ANATOLIAN_HIEROGLYPHS = HB_TAG ('H','l','u','w'), - /*8.0*/ HB_SCRIPT_HATRAN = HB_TAG ('H','a','t','r'), - /*8.0*/ HB_SCRIPT_MULTANI = HB_TAG ('M','u','l','t'), - /*8.0*/ HB_SCRIPT_OLD_HUNGARIAN = HB_TAG ('H','u','n','g'), - /*8.0*/ HB_SCRIPT_SIGNWRITING = HB_TAG ('S','g','n','w'), + HB_SCRIPT_BASSA_VAH = HB_TAG ('B','a','s','s'), /*7.0*/ + HB_SCRIPT_CAUCASIAN_ALBANIAN = HB_TAG ('A','g','h','b'), /*7.0*/ + HB_SCRIPT_DUPLOYAN = HB_TAG ('D','u','p','l'), /*7.0*/ + HB_SCRIPT_ELBASAN = HB_TAG ('E','l','b','a'), /*7.0*/ + HB_SCRIPT_GRANTHA = HB_TAG ('G','r','a','n'), /*7.0*/ + HB_SCRIPT_KHOJKI = HB_TAG ('K','h','o','j'), /*7.0*/ + HB_SCRIPT_KHUDAWADI = HB_TAG ('S','i','n','d'), /*7.0*/ + HB_SCRIPT_LINEAR_A = HB_TAG 
('L','i','n','a'), /*7.0*/ + HB_SCRIPT_MAHAJANI = HB_TAG ('M','a','h','j'), /*7.0*/ + HB_SCRIPT_MANICHAEAN = HB_TAG ('M','a','n','i'), /*7.0*/ + HB_SCRIPT_MENDE_KIKAKUI = HB_TAG ('M','e','n','d'), /*7.0*/ + HB_SCRIPT_MODI = HB_TAG ('M','o','d','i'), /*7.0*/ + HB_SCRIPT_MRO = HB_TAG ('M','r','o','o'), /*7.0*/ + HB_SCRIPT_NABATAEAN = HB_TAG ('N','b','a','t'), /*7.0*/ + HB_SCRIPT_OLD_NORTH_ARABIAN = HB_TAG ('N','a','r','b'), /*7.0*/ + HB_SCRIPT_OLD_PERMIC = HB_TAG ('P','e','r','m'), /*7.0*/ + HB_SCRIPT_PAHAWH_HMONG = HB_TAG ('H','m','n','g'), /*7.0*/ + HB_SCRIPT_PALMYRENE = HB_TAG ('P','a','l','m'), /*7.0*/ + HB_SCRIPT_PAU_CIN_HAU = HB_TAG ('P','a','u','c'), /*7.0*/ + HB_SCRIPT_PSALTER_PAHLAVI = HB_TAG ('P','h','l','p'), /*7.0*/ + HB_SCRIPT_SIDDHAM = HB_TAG ('S','i','d','d'), /*7.0*/ + HB_SCRIPT_TIRHUTA = HB_TAG ('T','i','r','h'), /*7.0*/ + HB_SCRIPT_WARANG_CITI = HB_TAG ('W','a','r','a'), /*7.0*/ + + HB_SCRIPT_AHOM = HB_TAG ('A','h','o','m'), /*8.0*/ + HB_SCRIPT_ANATOLIAN_HIEROGLYPHS = HB_TAG ('H','l','u','w'), /*8.0*/ + HB_SCRIPT_HATRAN = HB_TAG ('H','a','t','r'), /*8.0*/ + HB_SCRIPT_MULTANI = HB_TAG ('M','u','l','t'), /*8.0*/ + HB_SCRIPT_OLD_HUNGARIAN = HB_TAG ('H','u','n','g'), /*8.0*/ + HB_SCRIPT_SIGNWRITING = HB_TAG ('S','g','n','w'), /*8.0*/ /* * Since 1.3.0 */ - /*9.0*/ HB_SCRIPT_ADLAM = HB_TAG ('A','d','l','m'), - /*9.0*/ HB_SCRIPT_BHAIKSUKI = HB_TAG ('B','h','k','s'), - /*9.0*/ HB_SCRIPT_MARCHEN = HB_TAG ('M','a','r','c'), - /*9.0*/ HB_SCRIPT_OSAGE = HB_TAG ('O','s','g','e'), - /*9.0*/ HB_SCRIPT_TANGUT = HB_TAG ('T','a','n','g'), - /*9.0*/ HB_SCRIPT_NEWA = HB_TAG ('N','e','w','a'), + HB_SCRIPT_ADLAM = HB_TAG ('A','d','l','m'), /*9.0*/ + HB_SCRIPT_BHAIKSUKI = HB_TAG ('B','h','k','s'), /*9.0*/ + HB_SCRIPT_MARCHEN = HB_TAG ('M','a','r','c'), /*9.0*/ + HB_SCRIPT_OSAGE = HB_TAG ('O','s','g','e'), /*9.0*/ + HB_SCRIPT_TANGUT = HB_TAG ('T','a','n','g'), /*9.0*/ + HB_SCRIPT_NEWA = HB_TAG ('N','e','w','a'), /*9.0*/ /* * Since 1.6.0 */ - /*10.0*/HB_SCRIPT_MASARAM_GONDI = HB_TAG ('G','o','n','m'), - /*10.0*/HB_SCRIPT_NUSHU = HB_TAG ('N','s','h','u'), - /*10.0*/HB_SCRIPT_SOYOMBO = HB_TAG ('S','o','y','o'), - /*10.0*/HB_SCRIPT_ZANABAZAR_SQUARE = HB_TAG ('Z','a','n','b'), + HB_SCRIPT_MASARAM_GONDI = HB_TAG ('G','o','n','m'), /*10.0*/ + HB_SCRIPT_NUSHU = HB_TAG ('N','s','h','u'), /*10.0*/ + HB_SCRIPT_SOYOMBO = HB_TAG ('S','o','y','o'), /*10.0*/ + HB_SCRIPT_ZANABAZAR_SQUARE = HB_TAG ('Z','a','n','b'), /*10.0*/ /* * Since 1.8.0 */ - /*11.0*/HB_SCRIPT_DOGRA = HB_TAG ('D','o','g','r'), - /*11.0*/HB_SCRIPT_GUNJALA_GONDI = HB_TAG ('G','o','n','g'), - /*11.0*/HB_SCRIPT_HANIFI_ROHINGYA = HB_TAG ('R','o','h','g'), - /*11.0*/HB_SCRIPT_MAKASAR = HB_TAG ('M','a','k','a'), - /*11.0*/HB_SCRIPT_MEDEFAIDRIN = HB_TAG ('M','e','d','f'), - /*11.0*/HB_SCRIPT_OLD_SOGDIAN = HB_TAG ('S','o','g','o'), - /*11.0*/HB_SCRIPT_SOGDIAN = HB_TAG ('S','o','g','d'), + HB_SCRIPT_DOGRA = HB_TAG ('D','o','g','r'), /*11.0*/ + HB_SCRIPT_GUNJALA_GONDI = HB_TAG ('G','o','n','g'), /*11.0*/ + HB_SCRIPT_HANIFI_ROHINGYA = HB_TAG ('R','o','h','g'), /*11.0*/ + HB_SCRIPT_MAKASAR = HB_TAG ('M','a','k','a'), /*11.0*/ + HB_SCRIPT_MEDEFAIDRIN = HB_TAG ('M','e','d','f'), /*11.0*/ + HB_SCRIPT_OLD_SOGDIAN = HB_TAG ('S','o','g','o'), /*11.0*/ + HB_SCRIPT_SOGDIAN = HB_TAG ('S','o','g','d'), /*11.0*/ /* * Since 2.4.0 */ - /*12.0*/HB_SCRIPT_ELYMAIC = HB_TAG ('E','l','y','m'), - /*12.0*/HB_SCRIPT_NANDINAGARI = HB_TAG ('N','a','n','d'), - /*12.0*/HB_SCRIPT_NYIAKENG_PUACHUE_HMONG = HB_TAG ('H','m','n','p'), - /*12.0*/HB_SCRIPT_WANCHO = HB_TAG 
('W','c','h','o'), + HB_SCRIPT_ELYMAIC = HB_TAG ('E','l','y','m'), /*12.0*/ + HB_SCRIPT_NANDINAGARI = HB_TAG ('N','a','n','d'), /*12.0*/ + HB_SCRIPT_NYIAKENG_PUACHUE_HMONG = HB_TAG ('H','m','n','p'), /*12.0*/ + HB_SCRIPT_WANCHO = HB_TAG ('W','c','h','o'), /*12.0*/ /* * Since 2.6.7 */ - /*13.0*/HB_SCRIPT_CHORASMIAN = HB_TAG ('C','h','r','s'), - /*13.0*/HB_SCRIPT_DIVES_AKURU = HB_TAG ('D','i','a','k'), - /*13.0*/HB_SCRIPT_KHITAN_SMALL_SCRIPT = HB_TAG ('K','i','t','s'), - /*13.0*/HB_SCRIPT_YEZIDI = HB_TAG ('Y','e','z','i'), + HB_SCRIPT_CHORASMIAN = HB_TAG ('C','h','r','s'), /*13.0*/ + HB_SCRIPT_DIVES_AKURU = HB_TAG ('D','i','a','k'), /*13.0*/ + HB_SCRIPT_KHITAN_SMALL_SCRIPT = HB_TAG ('K','i','t','s'), /*13.0*/ + HB_SCRIPT_YEZIDI = HB_TAG ('Y','e','z','i'), /*13.0*/ /* No script set. */ - HB_SCRIPT_INVALID = HB_TAG_NONE, + HB_SCRIPT_INVALID = HB_TAG_NONE, + + /*< private >*/ /* Dummy values to ensure any hb_tag_t value can be passed/stored as hb_script_t * without risking undefined behavior. We have two, for historical reasons. @@ -687,19 +731,33 @@ typedef struct hb_user_data_key_t { char unused; } hb_user_data_key_t; +/** + * hb_destroy_func_t: + * @user_data: the data to be destroyed + * + * A virtual method for destroy user-data callbacks. + * + */ typedef void (*hb_destroy_func_t) (void *user_data); /* Font features and variations. */ /** - * HB_FEATURE_GLOBAL_START + * HB_FEATURE_GLOBAL_START: + * + * Special setting for #hb_feature_t.start to apply the feature from the start + * of the buffer. * * Since: 2.0.0 */ #define HB_FEATURE_GLOBAL_START 0 + /** - * HB_FEATURE_GLOBAL_END + * HB_FEATURE_GLOBAL_END: + * + * Special setting for #hb_feature_t.end to apply the feature from to the end + * of the buffer. * * Since: 2.0.0 */ @@ -717,7 +775,7 @@ typedef void (*hb_destroy_func_t) (void *user_data); * The #hb_feature_t is the structure that holds information about requested * feature application. The feature will be applied with the given value to all * glyphs which are in clusters between @start (inclusive) and @end (exclusive). - * Setting start to @HB_FEATURE_GLOBAL_START and end to @HB_FEATURE_GLOBAL_END + * Setting start to #HB_FEATURE_GLOBAL_START and end to #HB_FEATURE_GLOBAL_END * specifies that the feature always applies to the entire buffer. */ typedef struct hb_feature_t { @@ -741,8 +799,8 @@ hb_feature_to_string (hb_feature_t *feature, * @value: The value of the variation axis * * Data type for holding variation data. Registered OpenType - * variation-axis tags are listed at - * https://docs.microsoft.com/en-us/typography/opentype/spec/dvaraxisreg + * variation-axis tags are listed in + * [OpenType Axis Tag Registry](https://docs.microsoft.com/en-us/typography/opentype/spec/dvaraxisreg). * * Since: 1.4.2 */ @@ -769,6 +827,17 @@ hb_variation_to_string (hb_variation_t *variation, */ typedef uint32_t hb_color_t; +/** + * HB_COLOR: + * @b: blue channel value + * @g: green channel value + * @r: red channel value + * @a: alpha channel value + * + * Constructs an #hb_color_t from four integers. 
+ * + * Since: 2.1.0 + */ #define HB_COLOR(b,g,r,a) ((hb_color_t) HB_TAG ((b),(g),(r),(a))) HB_EXTERN uint8_t diff --git a/thirdparty/harfbuzz/src/hb-coretext.cc b/thirdparty/harfbuzz/src/hb-coretext.cc index 7b6b2bd5ef..461bd20e65 100644 --- a/thirdparty/harfbuzz/src/hb-coretext.cc +++ b/thirdparty/harfbuzz/src/hb-coretext.cc @@ -34,7 +34,6 @@ #include "hb-coretext.h" #include "hb-aat-layout.hh" -#include <math.h> /** @@ -190,7 +189,10 @@ create_ct_font (CGFontRef cg_font, CGFloat font_size) * reconfiguring the cascade list causes CoreText crashes. For details, see * crbug.com/549610 */ // 0x00070000 stands for "kCTVersionNumber10_10", see CoreText.h +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" if (&CTGetCoreTextVersion != nullptr && CTGetCoreTextVersion() < 0x00070000) { +#pragma GCC diagnostic pop CFStringRef fontName = CTFontCopyPostScriptName (ct_font); bool isEmojiFont = CFStringCompare (fontName, CFSTR("AppleColorEmoji"), 0) == kCFCompareEqualTo; CFRelease (fontName); @@ -346,7 +348,7 @@ retry: const hb_coretext_font_data_t *data = font->data.coretext; if (unlikely (!data)) return nullptr; - if (fabs (CTFontGetSize ((CTFontRef) data) - (CGFloat) font->ptem) > .5) + if (fabs (CTFontGetSize ((CTFontRef) data) - (CGFloat) font->ptem) > (CGFloat) .5) { /* XXX-MT-bug * Note that evaluating condition above can be dangerous if another thread @@ -402,7 +404,7 @@ hb_coretext_font_create (CTFontRef ct_font) } /** - * hb_coretext_face_get_ct_font: + * hb_coretext_font_get_ct_font: * @font: #hb_font_t to work upon * * Fetches the CTFontRef associated with the specified @@ -858,7 +860,7 @@ resize_and_retry: buffer->len = 0; uint32_t status_and = ~0, status_or = 0; - double advances_so_far = 0; + CGFloat advances_so_far = 0; /* For right-to-left runs, CoreText returns the glyphs positioned such that * any trailing whitespace is to the left of (0,0). Adjust coordinate system * to fix for that. Test with any RTL string with trailing spaces. 
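The new `HB_COLOR()` documentation above, together with the channel accessors documented earlier in the `hb-common.cc` hunks, suggests the following usage sketch; the BGRA value is an arbitrary example, not upstream code.

```
#include <hb.h>
#include <stdint.h>

static void
color_demo (void)
{
  /* Channels are passed in b, g, r, a order, matching HB_COLOR(). */
  hb_color_t c = HB_COLOR (0x00, 0xA5, 0xFF, 0x80);

  uint8_t r = hb_color_get_red (c);    /* 0xFF */
  uint8_t g = hb_color_get_green (c);  /* 0xA5 */
  uint8_t b = hb_color_get_blue (c);   /* 0x00 */
  uint8_t a = hb_color_get_alpha (c);  /* 0x80 */

  (void) r; (void) g; (void) b; (void) a;
}
```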
@@ -880,10 +882,10 @@ resize_and_retry: status_or |= run_status; status_and &= run_status; DEBUG_MSG (CORETEXT, run, "CTRunStatus: %x", run_status); - double run_advance = CTRunGetTypographicBounds (run, range_all, nullptr, nullptr, nullptr); + CGFloat run_advance = CTRunGetTypographicBounds (run, range_all, nullptr, nullptr, nullptr); if (HB_DIRECTION_IS_VERTICAL (buffer->props.direction)) run_advance = -run_advance; - DEBUG_MSG (CORETEXT, run, "Run advance: %g", run_advance); + DEBUG_MSG (CORETEXT, run, "Run advance: %g", (double) run_advance); /* CoreText does automatic font fallback (AKA "cascading") for characters * not supported by the requested font, and provides no way to turn it off, @@ -1062,7 +1064,7 @@ resize_and_retry: hb_position_t x_offset = (positions[0].x - advances_so_far) * x_mult; for (unsigned int j = 0; j < num_glyphs; j++) { - double advance; + CGFloat advance; if (likely (j + 1 < num_glyphs)) advance = positions[j + 1].x - positions[j].x; else /* last glyph */ @@ -1078,7 +1080,7 @@ resize_and_retry: hb_position_t y_offset = (positions[0].y - advances_so_far) * y_mult; for (unsigned int j = 0; j < num_glyphs; j++) { - double advance; + CGFloat advance; if (likely (j + 1 < num_glyphs)) advance = positions[j + 1].y - positions[j].y; else /* last glyph */ diff --git a/thirdparty/harfbuzz/src/hb-deprecated.h b/thirdparty/harfbuzz/src/hb-deprecated.h index 43f89a4c4e..5f19125789 100644 --- a/thirdparty/harfbuzz/src/hb-deprecated.h +++ b/thirdparty/harfbuzz/src/hb-deprecated.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -53,11 +53,50 @@ HB_BEGIN_DECLS #ifndef HB_DISABLE_DEPRECATED +/** + * HB_SCRIPT_CANADIAN_ABORIGINAL: + * + * Use #HB_SCRIPT_CANADIAN_SYLLABICS instead: + * + * Deprecated: 0.9.20 + */ #define HB_SCRIPT_CANADIAN_ABORIGINAL HB_SCRIPT_CANADIAN_SYLLABICS +/** + * HB_BUFFER_FLAGS_DEFAULT: + * + * Use #HB_BUFFER_FLAG_DEFAULT instead. + * + * Deprecated: 0.9.20 + */ #define HB_BUFFER_FLAGS_DEFAULT HB_BUFFER_FLAG_DEFAULT +/** + * HB_BUFFER_SERIALIZE_FLAGS_DEFAULT: + * + * Use #HB_BUFFER_SERIALIZE_FLAG_DEFAULT instead. + * + * Deprecated: 0.9.20 + */ #define HB_BUFFER_SERIALIZE_FLAGS_DEFAULT HB_BUFFER_SERIALIZE_FLAG_DEFAULT +/** + * hb_font_get_glyph_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @unicode: The Unicode code point to query + * @variation_selector: The variation-selector code point to query + * @glyph: (out): The glyph ID retrieved + * @user_data: User data pointer passed by the caller + * + * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. + * + * This method should retrieve the glyph ID for a specified Unicode code point + * font, with an optional variation selector. + * + * Return value: %true if data found, %false otherwise + * Deprecated: 1.2.3 + * + **/ typedef hb_bool_t (*hb_font_get_glyph_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t unicode, hb_codepoint_t variation_selector, hb_codepoint_t *glyph, @@ -73,6 +112,11 @@ hb_set_invert (hb_set_t *set); /** * hb_unicode_eastasian_width_func_t: + * @ufuncs: A Unicode-functions structure + * @unicode: The code point to query + * @user_data: User data pointer passed by the caller + * + * A virtual method for the #hb_unicode_funcs_t structure. 
* * Deprecated: 2.0.0 */ @@ -82,12 +126,12 @@ typedef unsigned int (*hb_unicode_eastasian_width_func_t) (hb_unicode_funcs_t /** * hb_unicode_funcs_set_eastasian_width_func: - * @ufuncs: a Unicode function structure - * @func: (closure user_data) (destroy destroy) (scope notified): - * @user_data: - * @destroy: + * @ufuncs: a Unicode-function structure + * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign + * @user_data: Data to pass to @func + * @destroy: (nullable): The function to call when @user_data is not needed anymore * - * + * Sets the implementation function for #hb_unicode_eastasian_width_func_t. * * Since: 0.9.2 * Deprecated: 2.0.0 @@ -99,6 +143,10 @@ hb_unicode_funcs_set_eastasian_width_func (hb_unicode_funcs_t *ufuncs, /** * hb_unicode_eastasian_width: + * @ufuncs: a Unicode-function structure + * @unicode: The code point to query + * + * Don't use. Not used by HarfBuzz. * * Since: 0.9.2 * Deprecated: 2.0.0 @@ -112,7 +160,7 @@ hb_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs, * hb_unicode_decompose_compatibility_func_t: * @ufuncs: a Unicode function structure * @u: codepoint to decompose - * @decomposed: address of codepoint array (of length %HB_UNICODE_MAX_DECOMPOSITION_LEN) to write decomposition into + * @decomposed: address of codepoint array (of length #HB_UNICODE_MAX_DECOMPOSITION_LEN) to write decomposition into * @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_compatibility_func() * * Fully decompose @u to its Unicode compatibility decomposition. The codepoints of the decomposition will be written to @decomposed. @@ -120,7 +168,7 @@ hb_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs, * * If @u has no compatibility decomposition, zero should be returned. * - * The Unicode standard guarantees that a buffer of length %HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints will always be sufficient for any + * The Unicode standard guarantees that a buffer of length #HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints will always be sufficient for any * compatibility decomposition plus an terminating value of 0. Consequently, @decompose must be allocated by the caller to be at least this length. Implementations * of this function type must ensure that they do not write past the provided array. * @@ -144,10 +192,12 @@ typedef unsigned int (*hb_unicode_decompose_compatibility_func_t) (hb_unicode_ /** * hb_unicode_funcs_set_decompose_compatibility_func: - * @ufuncs: a Unicode function structure - * @func: (closure user_data) (destroy destroy) (scope notified): - * @user_data: - * @destroy: + * @ufuncs: A Unicode-functions structure + * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign + * @user_data: Data to pass to @func + * @destroy: (nullable): The function to call when @user_data is not needed anymore + * + * Sets the implementation function for #hb_unicode_decompose_compatibility_func_t. * * * @@ -165,16 +215,25 @@ hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *decomposed); +/** + * hb_font_get_glyph_v_kerning_func_t: + * + * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. + * + * This method should retrieve the kerning-adjustment value for a glyph-pair in + * the specified font, for vertical text segments. + * + **/ typedef hb_font_get_glyph_kerning_func_t hb_font_get_glyph_v_kerning_func_t; /** * hb_font_funcs_set_glyph_v_kerning_func: - * @ffuncs: font functions. 
- * @func: (closure user_data) (destroy destroy) (scope notified): - * @user_data: - * @destroy: + * @ffuncs: A font-function structure + * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign + * @user_data: Data to pass to @func + * @destroy: (nullable): The function to call when @user_data is not needed anymore * - * + * Sets the implementation function for #hb_font_get_glyph_v_kerning_func_t. * * Since: 0.9.2 * Deprecated: 2.0.0 diff --git a/thirdparty/harfbuzz/src/hb-directwrite.cc b/thirdparty/harfbuzz/src/hb-directwrite.cc index 92c956c032..a07302159c 100644 --- a/thirdparty/harfbuzz/src/hb-directwrite.cc +++ b/thirdparty/harfbuzz/src/hb-directwrite.cc @@ -957,6 +957,8 @@ _hb_directwrite_font_release (void *data) * hb_directwrite_face_create: * @font_face: a DirectWrite IDWriteFontFace object. * + * Constructs a new face object from the specified DirectWrite IDWriteFontFace. + * * Return value: #hb_face_t object corresponding to the given input * * Since: 2.4.0 @@ -974,6 +976,8 @@ hb_directwrite_face_create (IDWriteFontFace *font_face) * hb_directwrite_face_get_font_face: * @face: a #hb_face_t object * +* Gets the DirectWrite IDWriteFontFace associated with @face. +* * Return value: DirectWrite IDWriteFontFace object corresponding to the given input * * Since: 2.5.0 diff --git a/thirdparty/harfbuzz/src/hb-dispatch.hh b/thirdparty/harfbuzz/src/hb-dispatch.hh index 7eace86e54..4b2b65a8de 100644 --- a/thirdparty/harfbuzz/src/hb-dispatch.hh +++ b/thirdparty/harfbuzz/src/hb-dispatch.hh @@ -38,7 +38,6 @@ template <typename Context, typename Return=hb_empty_t, unsigned int MaxDebugDepth=0> struct hb_dispatch_context_t { - hb_dispatch_context_t () : debug_depth (0) {} private: /* https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern */ const Context* thiz () const { return static_cast<const Context *> (this); } @@ -54,7 +53,7 @@ struct hb_dispatch_context_t { return obj.dispatch (thiz (), hb_forward<Ts> (ds)...); } static return_t no_dispatch_return_value () { return Context::default_return_value (); } static bool stop_sublookup_iteration (const return_t r HB_UNUSED) { return false; } - unsigned debug_depth; + unsigned debug_depth = 0; }; diff --git a/thirdparty/harfbuzz/src/hb-draw.h b/thirdparty/harfbuzz/src/hb-draw.h index 98eccf4c0c..bddc876399 100644 --- a/thirdparty/harfbuzz/src/hb-draw.h +++ b/thirdparty/harfbuzz/src/hb-draw.h @@ -22,7 +22,7 @@ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif diff --git a/thirdparty/harfbuzz/src/hb-face.cc b/thirdparty/harfbuzz/src/hb-face.cc index 33a788e7c5..61bd4af7b1 100644 --- a/thirdparty/harfbuzz/src/hb-face.cc +++ b/thirdparty/harfbuzz/src/hb-face.cc @@ -89,8 +89,8 @@ DEFINE_NULL_INSTANCE (hb_face_t) = nullptr, /* destroy */ 0, /* index */ - HB_ATOMIC_INT_INIT (1000), /* upem */ - HB_ATOMIC_INT_INIT (0), /* num_glyphs */ + 1000, /* upem */ + 0, /* num_glyphs */ /* Zero for the rest is fine. 
*/ }; @@ -100,7 +100,7 @@ DEFINE_NULL_INSTANCE (hb_face_t) = * hb_face_create_for_tables: * @reference_table_func: (closure user_data) (destroy destroy) (scope notified): Table-referencing function * @user_data: A pointer to the user data - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * * Variant of hb_face_create(), built for those cases where it is more * convenient to provide data for individual tables instead of the whole font @@ -235,7 +235,7 @@ hb_face_create (hb_blob_t *blob, * * Fetches the singleton empty face object. * - * Return value: (transfer full) The empty face object + * Return value: (transfer full): The empty face object * * Since: 0.9.2 **/ @@ -299,7 +299,7 @@ hb_face_destroy (hb_face_t *face) * @face: A face object * @key: The user-data key to set * @data: A pointer to the user data - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the given face object. @@ -360,7 +360,7 @@ hb_face_make_immutable (hb_face_t *face) * * Tests whether the given face object is immutable. * - * Return value: True is @face is immutable, false otherwise + * Return value: %true is @face is immutable, %false otherwise * * Since: 0.9.2 **/ @@ -756,7 +756,7 @@ hb_face_builder_add_table (hb_face_t *face, hb_tag_t tag, hb_blob_t *blob) hb_face_builder_data_t *data = (hb_face_builder_data_t *) face->user_data; hb_face_builder_data_t::table_entry_t *entry = data->tables.push (); - if (data->tables.in_error()) + if (unlikely (data->tables.in_error())) return false; entry->tag = tag; diff --git a/thirdparty/harfbuzz/src/hb-face.h b/thirdparty/harfbuzz/src/hb-face.h index 3b18f7eef9..6ef2f8b886 100644 --- a/thirdparty/harfbuzz/src/hb-face.h +++ b/thirdparty/harfbuzz/src/hb-face.h @@ -24,7 +24,7 @@ * Red Hat Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -58,6 +58,19 @@ HB_EXTERN hb_face_t * hb_face_create (hb_blob_t *blob, unsigned int index); +/** + * hb_reference_table_func_t: + * @face: an #hb_face_t to reference table for + * @tag: the tag of the table to reference + * @user_data: User data pointer passed by the caller + * + * Callback function for hb_face_create_for_tables(). 
+ * + * Return value: (transfer full): A pointer to the @tag table within @face + * + * Since: 0.9.2 + */ + typedef hb_blob_t * (*hb_reference_table_func_t) (hb_face_t *face, hb_tag_t tag, void *user_data); /* calls destroy() when not needing user_data anymore */ diff --git a/thirdparty/harfbuzz/src/hb-face.hh b/thirdparty/harfbuzz/src/hb-face.hh index f1b472ccf3..765f272858 100644 --- a/thirdparty/harfbuzz/src/hb-face.hh +++ b/thirdparty/harfbuzz/src/hb-face.hh @@ -81,7 +81,7 @@ struct hb_face_t return blob; } - HB_PURE_FUNC unsigned int get_upem () const + unsigned int get_upem () const { unsigned int ret = upem.get_relaxed (); if (unlikely (!ret)) diff --git a/thirdparty/harfbuzz/src/hb-font.cc b/thirdparty/harfbuzz/src/hb-font.cc index 5c8357ff28..37a0e7fe85 100644 --- a/thirdparty/harfbuzz/src/hb-font.cc +++ b/thirdparty/harfbuzz/src/hb-font.cc @@ -628,7 +628,7 @@ hb_font_funcs_destroy (hb_font_funcs_t *ffuncs) * @ffuncs: The font-functions structure * @key: The user-data key to set * @data: A pointer to the user data set - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified font-functions structure. @@ -690,7 +690,7 @@ hb_font_funcs_make_immutable (hb_font_funcs_t *ffuncs) * * Tests whether a font-functions structure is immutable. * - * Return value: %true if @ffuncs is immutable, false otherwise + * Return value: %true if @ffuncs is immutable, %false otherwise * * Since: 0.9.2 **/ @@ -753,10 +753,10 @@ hb_font_t::has_func (unsigned int i) * @font: #hb_font_t to work upon * @extents: (out): The font extents retrieved * - * Fetches the extents for a specified font, in horizontal + * Fetches the extents for a specified font, for horizontal * text segments. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 1.1.3 **/ @@ -772,10 +772,10 @@ hb_font_get_h_extents (hb_font_t *font, * @font: #hb_font_t to work upon * @extents: (out): The font extents retrieved * - * Fetches the extents for a specified font, in vertical + * Fetches the extents for a specified font, for vertical * text segments. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 1.1.3 **/ @@ -790,7 +790,7 @@ hb_font_get_v_extents (hb_font_t *font, * hb_font_get_glyph: * @font: #hb_font_t to work upon * @unicode: The Unicode code point to query - * @variation_selector: (optional): A variation-selector code point + * @variation_selector: A variation-selector code point * @glyph: (out): The glyph ID retrieved * * Fetches the glyph ID for a Unicode code point in the specified @@ -799,7 +799,7 @@ hb_font_get_v_extents (hb_font_t *font, * If @variation_selector is 0, calls hb_font_get_nominal_glyph(); * otherwise calls hb_font_get_variation_glyph(). * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -827,7 +827,7 @@ hb_font_get_glyph (hb_font_t *font, * for code points modified by variation selectors. For variation-selector * support, user hb_font_get_variation_glyph() or use hb_font_get_glyph(). 
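The split described above, nominal lookup versus variation-selector lookup, in a small sketch; the code points are arbitrary and error handling is elided:

```
#include <hb.h>

/* Looks up U+0041 'A' alone and then U+0041 followed by variation
 * selector U+FE00.  `font` is any live hb_font_t. */
static void
lookup_glyphs (hb_font_t *font)
{
  hb_codepoint_t gid = 0;

  /* Nominal lookup: no variation selector involved. */
  if (hb_font_get_nominal_glyph (font, 0x0041u, &gid))
    { /* gid now holds the glyph ID for 'A' */ }

  /* With a non-zero variation selector, hb_font_get_glyph() dispatches to
   * the variation-glyph path; passing 0 falls back to the nominal path. */
  if (hb_font_get_glyph (font, 0x0041u, 0xFE00u, &gid))
    { /* gid is the variation-specific glyph, if the font has one */ }
}
```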
* - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 1.2.3 **/ @@ -841,11 +841,17 @@ hb_font_get_nominal_glyph (hb_font_t *font, /** * hb_font_get_nominal_glyphs: - * @font: a font. - * + * @font: #hb_font_t to work upon + * @count: number of code points to query + * @first_unicode: The first Unicode code point to query + * @unicode_stride: The stride between successive code points + * @first_glyph: (out): The first glyph ID retrieved + * @glyph_stride: The stride between successive glyph IDs * + * Fetches the nominal glyph IDs for a sequence of Unicode code points. Glyph + * IDs must be returned in a #hb_codepoint_t output parameter. * - * Return value: + * Return value: the number of code points processed * * Since: 2.6.3 **/ @@ -873,7 +879,7 @@ hb_font_get_nominal_glyphs (hb_font_t *font, * by the specified variation-selector code point, in the specified * font. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 1.2.3 **/ @@ -931,7 +937,7 @@ hb_font_get_glyph_v_advance (hb_font_t *font, * @first_glyph: The first glyph ID to query * @glyph_stride: The stride between successive glyph IDs * @first_advance: (out): The first advance retrieved - * @advance_stride: (out): The stride between successive advances + * @advance_stride: The stride between successive advances * * Fetches the advances for a sequence of glyph IDs in the specified * font, for horizontal text segments. @@ -983,7 +989,7 @@ hb_font_get_glyph_v_advances (hb_font_t* font, * Fetches the (X,Y) coordinates of the origin for a glyph ID * in the specified font, for horizontal text segments. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1006,7 +1012,7 @@ hb_font_get_glyph_h_origin (hb_font_t *font, * Fetches the (X,Y) coordinates of the origin for a glyph ID * in the specified font, for vertical text segments. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1026,7 +1032,7 @@ hb_font_get_glyph_v_origin (hb_font_t *font, * @right_glyph: The glyph ID of the right glyph in the glyph pair * * Fetches the kerning-adjustment value for a glyph-pair in - * the specified font, in horizontal text segments. + * the specified font, for horizontal text segments. * * <note>It handles legacy kerning only (as returned by the corresponding * #hb_font_funcs_t function).</note> @@ -1051,7 +1057,7 @@ hb_font_get_glyph_h_kerning (hb_font_t *font, * @bottom_glyph: The glyph ID of the bottom glyph in the glyph pair * * Fetches the kerning-adjustment value for a glyph-pair in - * the specified font, in vertical text segments. + * the specified font, for vertical text segments. * * <note>It handles legacy kerning only (as returned by the corresponding * #hb_font_funcs_t function).</note> @@ -1079,7 +1085,7 @@ hb_font_get_glyph_v_kerning (hb_font_t *font, * Fetches the #hb_glyph_extents_t data for a glyph ID * in the specified font. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1102,7 +1108,7 @@ hb_font_get_glyph_extents (hb_font_t *font, * Fetches the (x,y) coordinates of a specified contour-point index * in the specified glyph, within the specified font. 
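The stride-based batch API documented earlier in this hunk, `hb_font_get_glyph_h_advances()`, takes byte strides between successive elements, so tightly packed arrays simply pass `sizeof` of the element type. A minimal sketch:

```
#include <hb.h>

/* Fetches horizontal advances for a run of glyphs in one call.
 * `advances` must have room for `count` entries. */
static void
get_run_advances (hb_font_t *font,
                  const hb_codepoint_t *glyphs, unsigned count,
                  hb_position_t *advances)
{
  hb_font_get_glyph_h_advances (font, count,
                                glyphs,   sizeof (hb_codepoint_t), /* glyph stride, bytes   */
                                advances, sizeof (hb_position_t)); /* advance stride, bytes */
}
```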
* - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1125,7 +1131,7 @@ hb_font_get_glyph_contour_point (hb_font_t *font, * * Fetches the glyph-name string for a glyph ID in the specified @font. * - * Return value: %true if data found, zero otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1149,7 +1155,7 @@ hb_font_get_glyph_name (hb_font_t *font, * * <note>Note: @len == -1 means the name string is null-terminated.</note> * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1169,7 +1175,7 @@ hb_font_get_glyph_from_name (hb_font_t *font, * hb_font_get_extents_for_direction: * @font: #hb_font_t to work upon * @direction: The direction of the text segment - * @extents: (out): The #hb_glyph_extents_t retrieved + * @extents: (out): The #hb_font_extents_t retrieved * * Fetches the extents for a font in a text segment of the * specified direction. @@ -1364,7 +1370,7 @@ hb_font_get_glyph_kerning_for_direction (hb_font_t *font, * Calls the appropriate direction-specific variant (horizontal * or vertical) depending on the value of @direction. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1393,7 +1399,7 @@ hb_font_get_glyph_extents_for_origin (hb_font_t *font, * Calls the appropriate direction-specific variant (horizontal * or vertical) depending on the value of @direction. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1444,7 +1450,7 @@ hb_font_glyph_to_string (hb_font_t *font, * * <note>Note: @len == -1 means the string is null-terminated.</note> * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.2 **/ @@ -1664,12 +1670,12 @@ hb_font_destroy (hb_font_t *font) * @font: #hb_font_t to work upon * @key: The user-data key * @data: A pointer to the user data - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified font object. * - * Return value: + * Return value: %true if success, %false otherwise * * Since: 0.9.2 **/ @@ -1728,7 +1734,7 @@ hb_font_make_immutable (hb_font_t *font) * * Tests whether a font object is immutable. * - * Return value: %true if @font is immutable, false otherwise + * Return value: %true if @font is immutable, %false otherwise * * Since: 0.9.2 **/ @@ -1828,9 +1834,9 @@ hb_font_get_face (hb_font_t *font) /** * hb_font_set_funcs: * @font: #hb_font_t to work upon - * @klass: (closure font_data) (destroy destroy) (scope notified): + * @klass: (closure font_data) (destroy destroy) (scope notified): The font-functions structure. * @font_data: Data to attach to @font - * @destroy: (optional): The function to call when @font_data is not needed anymore + * @destroy: (nullable): The function to call when @font_data is not needed anymore * * Replaces the font-functions structure attached to a font, updating * the font's user-data with @font-data and the @destroy callback. 
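A sketch of the ownership pattern `hb_font_set_funcs()` documents above: build an `hb_font_funcs_t`, register callbacks, then attach it together with backend data. `my_backend_t` and the identity glyph mapping are placeholders for illustration, not anything HarfBuzz provides:

```
#include <hb.h>

/* Hypothetical font_data for a custom backend; only carries a glyph count. */
struct my_backend_t { unsigned num_glyphs; };

/* hb_font_get_nominal_glyph_func_t: toy code point -> glyph ID mapping. */
static hb_bool_t
my_nominal_glyph (hb_font_t *font, void *font_data,
                  hb_codepoint_t unicode, hb_codepoint_t *glyph,
                  void *user_data)
{
  (void) font; (void) user_data;
  const struct my_backend_t *be = (const struct my_backend_t *) font_data;
  if (unicode >= be->num_glyphs) return 0;   /* not mapped */
  *glyph = unicode;                          /* identity mapping, for illustration */
  return 1;
}

static void
install_backend (hb_font_t *font, struct my_backend_t *be)
{
  hb_font_funcs_t *ffuncs = hb_font_funcs_create ();
  hb_font_funcs_set_nominal_glyph_func (ffuncs, my_nominal_glyph, NULL, NULL);
  hb_font_funcs_make_immutable (ffuncs);
  /* @font_data rides along with the font; @destroy (here NULL) would be
   * called when it is no longer needed. */
  hb_font_set_funcs (font, ffuncs, be, NULL);
  hb_font_funcs_destroy (ffuncs);            /* the font holds its own reference */
}
```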
@@ -1867,7 +1873,7 @@ hb_font_set_funcs (hb_font_t *font, * hb_font_set_funcs_data: * @font: #hb_font_t to work upon * @font_data: (destroy destroy) (scope notified): Data to attach to @font - * @destroy: (optional): The function to call when @font_data is not needed anymore + * @destroy: (nullable): The function to call when @font_data is not needed anymore * * Replaces the user data attached to a font, updating the font's * @destroy callback. @@ -2212,10 +2218,14 @@ hb_font_get_var_coords_normalized (hb_font_t *font, #ifdef HB_EXPERIMENTAL_API /** * hb_font_get_var_coords_design: + * @font: #hb_font_t to work upon + * @length: (out): number of coordinates * * Return value is valid as long as variation coordinates of the font * are not modified. * + * Return value: coordinates array + * * Since: EXPERIMENTAL */ const float * @@ -2319,7 +2329,7 @@ hb_font_get_variation_glyph_trampoline (hb_font_t *font, * @ffuncs: The font-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): callback function * @user_data: data to pass to @func - * @destroy: (optional): function to call when @user_data is not needed anymore + * @destroy: (nullable): function to call when @user_data is not needed anymore * * Deprecated. Use hb_font_funcs_set_nominal_glyph_func() and * hb_font_funcs_set_variation_glyph_func() instead. diff --git a/thirdparty/harfbuzz/src/hb-font.h b/thirdparty/harfbuzz/src/hb-font.h index 05f6c03f47..15dc126523 100644 --- a/thirdparty/harfbuzz/src/hb-font.h +++ b/thirdparty/harfbuzz/src/hb-font.h @@ -24,7 +24,7 @@ * Red Hat Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -37,7 +37,12 @@ HB_BEGIN_DECLS - +/** + * hb_font_t: + * + * Data type for holding fonts. + * + */ typedef struct hb_font_t hb_font_t; @@ -141,6 +146,16 @@ typedef struct hb_glyph_extents_t { /* func types */ +/** + * hb_font_get_font_extents_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @extents: (out): The font extents retrieved + * @user_data: User data pointer passed by the caller + * + * This method should retrieve the extents for a font. + * + **/ typedef hb_bool_t (*hb_font_get_font_extents_func_t) (hb_font_t *font, void *font_data, hb_font_extents_t *extents, void *user_data); @@ -150,7 +165,7 @@ typedef hb_bool_t (*hb_font_get_font_extents_func_t) (hb_font_t *font, void *fon * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * - * This method should retrieve the extents for a font, in horizontal-direction + * This method should retrieve the extents for a font, for horizontal-direction * text segments. Extents must be returned in an #hb_glyph_extents output * parameter. * @@ -162,7 +177,7 @@ typedef hb_font_get_font_extents_func_t hb_font_get_font_h_extents_func_t; * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * - * This method should retrieve the extents for a font, in vertical-direction + * This method should retrieve the extents for a font, for vertical-direction * text segments. Extents must be returned in an #hb_glyph_extents output * parameter. 
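One possible implementation of the font-extents callback documented above, filling the public `hb_font_extents_t` fields with placeholder metrics and registering it on a font-functions structure:

```
#include <hb.h>

/* hb_font_get_font_extents_func_t: report whatever the backend knows.
 * The numbers below are placeholders for illustration only. */
static hb_bool_t
my_h_extents (hb_font_t *font, void *font_data,
              hb_font_extents_t *extents, void *user_data)
{
  (void) font; (void) font_data; (void) user_data;
  extents->ascender  = 800;
  extents->descender = -200;
  extents->line_gap  = 0;
  return 1;
}

static void
set_extents_func (hb_font_funcs_t *ffuncs)
{
  hb_font_funcs_set_font_h_extents_func (ffuncs, my_h_extents, NULL, NULL);
}
```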
* @@ -172,12 +187,19 @@ typedef hb_font_get_font_extents_func_t hb_font_get_font_v_extents_func_t; /** * hb_font_get_nominal_glyph_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @unicode: The Unicode code point to query + * @glyph: (out): The glyph ID retrieved + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the nominal glyph ID for a specified Unicode code * point. Glyph IDs must be returned in a #hb_codepoint_t output parameter. * + * Return value: %true if data found, %false otherwise + * **/ typedef hb_bool_t (*hb_font_get_nominal_glyph_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t unicode, @@ -186,6 +208,12 @@ typedef hb_bool_t (*hb_font_get_nominal_glyph_func_t) (hb_font_t *font, void *fo /** * hb_font_get_variation_glyph_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @unicode: The Unicode code point to query + * @variation_selector: The variation-selector code point to query + * @glyph: (out): The glyph ID retrieved + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * @@ -193,6 +221,8 @@ typedef hb_bool_t (*hb_font_get_nominal_glyph_func_t) (hb_font_t *font, void *fo * followed by a specified Variation Selector code point. Glyph IDs must be * returned in a #hb_codepoint_t output parameter. * + * Return value: %true if data found, %false otherwise + * **/ typedef hb_bool_t (*hb_font_get_variation_glyph_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t unicode, hb_codepoint_t variation_selector, @@ -202,12 +232,22 @@ typedef hb_bool_t (*hb_font_get_variation_glyph_func_t) (hb_font_t *font, void * /** * hb_font_get_nominal_glyphs_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @count: number of code points to query + * @first_unicode: The first Unicode code point to query + * @unicode_stride: The stride between successive code points + * @first_glyph: (out): The first glyph ID retrieved + * @glyph_stride: The stride between successive glyph IDs + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the nominal glyph IDs for a sequence of * Unicode code points. Glyph IDs must be returned in a #hb_codepoint_t * output parameter. + * + * Return value: the number of code points processed * **/ typedef unsigned int (*hb_font_get_nominal_glyphs_func_t) (hb_font_t *font, void *font_data, @@ -220,12 +260,18 @@ typedef unsigned int (*hb_font_get_nominal_glyphs_func_t) (hb_font_t *font, void /** * hb_font_get_glyph_advance_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @glyph: The glyph ID to query + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the advance for a specified glyph. The * method must return an #hb_position_t. 
* + * Return value: The advance of @glyph within @font + * **/ typedef hb_position_t (*hb_font_get_glyph_advance_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t glyph, @@ -257,6 +303,14 @@ typedef hb_font_get_glyph_advance_func_t hb_font_get_glyph_v_advance_func_t; /** * hb_font_get_glyph_advances_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @count: The number of glyph IDs in the sequence queried + * @first_glyph: The first glyph ID to query + * @glyph_stride: The stride between successive glyph IDs + * @first_advance: (out): The first advance retrieved + * @advance_stride: The stride between successive advances + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * @@ -295,12 +349,20 @@ typedef hb_font_get_glyph_advances_func_t hb_font_get_glyph_v_advances_func_t; /** * hb_font_get_glyph_origin_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @glyph: The glyph ID to query + * @x: (out): The X coordinate of the origin + * @y: (out): The Y coordinate of the origin + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the (X,Y) coordinates (in font units) of the * origin for a glyph. Each coordinate must be returned in an #hb_position_t * output parameter. + * + * Return value: %true if data found, %false otherwise * **/ typedef hb_bool_t (*hb_font_get_glyph_origin_func_t) (hb_font_t *font, void *font_data, @@ -314,7 +376,7 @@ typedef hb_bool_t (*hb_font_get_glyph_origin_func_t) (hb_font_t *font, void *fon * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the (X,Y) coordinates (in font units) of the - * origin for a glyph, in horizontal-direction text segments. Each + * origin for a glyph, for horizontal-direction text segments. Each * coordinate must be returned in an #hb_position_t output parameter. * **/ @@ -326,25 +388,53 @@ typedef hb_font_get_glyph_origin_func_t hb_font_get_glyph_h_origin_func_t; * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the (X,Y) coordinates (in font units) of the - * origin for a glyph, in vertical-direction text segments. Each coordinate + * origin for a glyph, for vertical-direction text segments. Each coordinate * must be returned in an #hb_position_t output parameter. * **/ typedef hb_font_get_glyph_origin_func_t hb_font_get_glyph_v_origin_func_t; +/** + * hb_font_get_glyph_kerning_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @first_glyph: The glyph ID of the first glyph in the glyph pair + * @second_glyph: The glyph ID of the second glyph in the glyph pair + * @user_data: User data pointer passed by the caller + * + * This method should retrieve the kerning-adjustment value for a glyph-pair in + * the specified font, for horizontal text segments. + * + **/ typedef hb_position_t (*hb_font_get_glyph_kerning_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t first_glyph, hb_codepoint_t second_glyph, void *user_data); +/** + * hb_font_get_glyph_h_kerning_func_t: + * + * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. + * + * This method should retrieve the kerning-adjustment value for a glyph-pair in + * the specified font, for horizontal text segments. 
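For the legacy kerning callback documented above, a sketch with a single hard-coded pair; the glyph IDs and the -120 adjustment are made-up values:

```
#include <hb.h>

/* hb_font_get_glyph_kerning_func_t: legacy pair kerning. */
static hb_position_t
my_h_kerning (hb_font_t *font, void *font_data,
              hb_codepoint_t first_glyph, hb_codepoint_t second_glyph,
              void *user_data)
{
  (void) font; (void) font_data; (void) user_data;
  if (first_glyph == 36 && second_glyph == 57)   /* e.g. 'A' + 'V' in some font */
    return -120;                                 /* negative = tighten the pair */
  return 0;
}

static void
set_kerning_func (hb_font_funcs_t *ffuncs)
{
  hb_font_funcs_set_glyph_h_kerning_func (ffuncs, my_h_kerning, NULL, NULL);
}
```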
+ * + **/ typedef hb_font_get_glyph_kerning_func_t hb_font_get_glyph_h_kerning_func_t; /** * hb_font_get_glyph_extents_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @glyph: The glyph ID to query + * @extents: (out): The #hb_glyph_extents_t retrieved + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the extents for a specified glyph. Extents must be * returned in an #hb_glyph_extents output parameter. + * + * Return value: %true if data found, %false otherwise * **/ typedef hb_bool_t (*hb_font_get_glyph_extents_func_t) (hb_font_t *font, void *font_data, @@ -354,6 +444,13 @@ typedef hb_bool_t (*hb_font_get_glyph_extents_func_t) (hb_font_t *font, void *fo /** * hb_font_get_glyph_contour_point_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @glyph: The glyph ID to query + * @point_index: The contour-point index to query + * @x: (out): The X value retrieved for the contour point + * @y: (out): The Y value retrieved for the contour point + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * @@ -361,6 +458,8 @@ typedef hb_bool_t (*hb_font_get_glyph_extents_func_t) (hb_font_t *font, void *fo * specified contour point in a glyph. Each coordinate must be returned as * an #hb_position_t output parameter. * + * Return value: %true if data found, %false otherwise + * **/ typedef hb_bool_t (*hb_font_get_glyph_contour_point_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t glyph, unsigned int point_index, @@ -370,12 +469,20 @@ typedef hb_bool_t (*hb_font_get_glyph_contour_point_func_t) (hb_font_t *font, vo /** * hb_font_get_glyph_name_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @glyph: The glyph ID to query + * @name: (out) (array length=size): Name string retrieved for the glyph ID + * @size: Length of the glyph-name string retrieved + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the glyph name that corresponds to a * glyph ID. The name should be returned in a string output parameter. * + * Return value: %true if data found, %false otherwise + * **/ typedef hb_bool_t (*hb_font_get_glyph_name_func_t) (hb_font_t *font, void *font_data, hb_codepoint_t glyph, @@ -384,12 +491,20 @@ typedef hb_bool_t (*hb_font_get_glyph_name_func_t) (hb_font_t *font, void *font_ /** * hb_font_get_glyph_from_name_func_t: + * @font: #hb_font_t to work upon + * @font_data: @font user data pointer + * @name: (array length=len): The name string to query + * @len: The length of the name queried + * @glyph: (out): The glyph ID retrieved + * @user_data: User data pointer passed by the caller * * A virtual method for the #hb_font_funcs_t of an #hb_font_t object. * * This method should retrieve the glyph ID that corresponds to a glyph-name * string. 
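The glyph-name callbacks above back the public `hb_font_get_glyph_name()` / `hb_font_get_glyph_from_name()` pair; a short round-trip sketch, with error handling elided:

```
#include <hb.h>
#include <stdio.h>

/* Round-trips a glyph ID through its name.  `font` is any live hb_font_t. */
static void
print_glyph_name (hb_font_t *font, hb_codepoint_t gid)
{
  char name[64];
  if (hb_font_get_glyph_name (font, gid, name, sizeof (name)))
  {
    hb_codepoint_t back = 0;
    /* len == -1 means the name string is null-terminated. */
    if (hb_font_get_glyph_from_name (font, name, -1, &back) && back == gid)
      printf ("glyph %u is named '%s'\n", (unsigned) gid, name);
  }
}
```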
* + * Return value: %true if data found, %false otherwise + * **/ typedef hb_bool_t (*hb_font_get_glyph_from_name_func_t) (hb_font_t *font, void *font_data, const char *name, int len, /* -1 means nul-terminated */ @@ -404,7 +519,7 @@ typedef hb_bool_t (*hb_font_get_glyph_from_name_func_t) (hb_font_t *font, void * * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_font_h_extents_func_t. * @@ -420,7 +535,7 @@ hb_font_funcs_set_font_h_extents_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_font_v_extents_func_t. * @@ -436,7 +551,7 @@ hb_font_funcs_set_font_v_extents_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_nominal_glyph_func_t. * @@ -452,7 +567,7 @@ hb_font_funcs_set_nominal_glyph_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_nominal_glyphs_func_t. * @@ -468,7 +583,7 @@ hb_font_funcs_set_nominal_glyphs_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_variation_glyph_func_t. * @@ -484,7 +599,7 @@ hb_font_funcs_set_variation_glyph_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_h_advance_func_t. 
* @@ -500,7 +615,7 @@ hb_font_funcs_set_glyph_h_advance_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_v_advance_func_t. * @@ -516,7 +631,7 @@ hb_font_funcs_set_glyph_v_advance_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_h_advances_func_t. * @@ -532,7 +647,7 @@ hb_font_funcs_set_glyph_h_advances_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_v_advances_func_t. * @@ -548,7 +663,7 @@ hb_font_funcs_set_glyph_v_advances_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_h_origin_func_t. * @@ -564,7 +679,7 @@ hb_font_funcs_set_glyph_h_origin_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_v_origin_func_t. * @@ -577,12 +692,12 @@ hb_font_funcs_set_glyph_v_origin_func (hb_font_funcs_t *ffuncs, /** * hb_font_funcs_set_glyph_h_kerning_func: - * @ffuncs: font functions. - * @func: (closure user_data) (destroy destroy) (scope notified): - * @user_data: - * @destroy: - * + * @ffuncs: A font-function structure + * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign + * @user_data: Data to pass to @func + * @destroy: (nullable): The function to call when @user_data is not needed anymore * + * Sets the implementation function for #hb_font_get_glyph_h_kerning_func_t. 
* * Since: 0.9.2 **/ @@ -596,7 +711,7 @@ hb_font_funcs_set_glyph_h_kerning_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_extents_func_t. * @@ -612,7 +727,7 @@ hb_font_funcs_set_glyph_extents_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_contour_point_func_t. * @@ -628,7 +743,7 @@ hb_font_funcs_set_glyph_contour_point_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_name_func_t. * @@ -644,7 +759,7 @@ hb_font_funcs_set_glyph_name_func (hb_font_funcs_t *ffuncs, * @ffuncs: A font-function structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_font_get_glyph_from_name_func_t. * diff --git a/thirdparty/harfbuzz/src/hb-ft.cc b/thirdparty/harfbuzz/src/hb-ft.cc index ab7d6146ce..b82c1a67bd 100644 --- a/thirdparty/harfbuzz/src/hb-ft.cc +++ b/thirdparty/harfbuzz/src/hb-ft.cc @@ -84,7 +84,7 @@ struct hb_ft_font_t bool symbol; /* Whether selected cmap is symbol cmap. */ bool unref; /* Whether to destroy ft_face when done. */ - mutable hb_atomic_int_t cached_x_scale; + mutable int cached_x_scale; mutable hb_advance_cache_t advance_cache; }; @@ -101,7 +101,7 @@ _hb_ft_font_create (FT_Face ft_face, bool symbol, bool unref) ft_font->load_flags = FT_LOAD_DEFAULT | FT_LOAD_NO_HINTING; - ft_font->cached_x_scale.set_relaxed (0); + ft_font->cached_x_scale = 0; ft_font->advance_cache.init (); return ft_font; @@ -179,13 +179,13 @@ hb_ft_font_get_load_flags (hb_font_t *font) } /** - * hb_ft_get_face: + * hb_ft_font_get_face: * @font: #hb_font_t to work upon * * Fetches the FT_Face associated with the specified #hb_font_t * font object. * - * Return value: the FT_Face found + * Return value: (nullable): the FT_Face found or %NULL * * Since: 0.9.2 **/ @@ -202,11 +202,12 @@ hb_ft_font_get_face (hb_font_t *font) /** * hb_ft_font_lock_face: - * @font: - * + * @font: #hb_font_t to work upon * + * Gets the FT_Face associated with @font, This face will be kept around until + * you call hb_ft_font_unlock_face(). 
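A usage sketch of the lock/unlock pair described above; anything that touches the `FT_Face` directly goes between the two calls:

```
#include <hb.h>
#include <hb-ft.h>
#include <ft2build.h>
#include FT_FREETYPE_H

/* Borrows the FT_Face behind an hb-ft font; the face stays valid until
 * hb_ft_font_unlock_face() is called. */
static long
get_units_per_em (hb_font_t *font)
{
  FT_Face ft_face = hb_ft_font_lock_face (font);
  if (!ft_face)
    return 0;                         /* not an hb-ft font, or failure */
  long upem = ft_face->units_per_EM;  /* any direct FreeType access goes here */
  hb_ft_font_unlock_face (font);
  return upem;
}
```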
* - * Return value: + * Return value: (nullable): the FT_Face associated with @font or %NULL * Since: 2.6.5 **/ FT_Face @@ -224,11 +225,10 @@ hb_ft_font_lock_face (hb_font_t *font) /** * hb_ft_font_unlock_face: - * @font: - * + * @font: #hb_font_t to work upon * + * Releases an FT_Face previously obtained with hb_ft_font_lock_face(). * - * Return value: * Since: 2.6.5 **/ void @@ -335,10 +335,10 @@ hb_ft_get_glyph_h_advances (hb_font_t* font, void* font_data, int load_flags = ft_font->load_flags; int mult = font->x_scale < 0 ? -1 : +1; - if (font->x_scale != ft_font->cached_x_scale.get ()) + if (font->x_scale != ft_font->cached_x_scale) { ft_font->advance_cache.clear (); - ft_font->cached_x_scale.set (font->x_scale); + ft_font->cached_x_scale = font->x_scale; } for (unsigned int i = 0; i < count; i++) @@ -661,7 +661,7 @@ _hb_ft_reference_table (hb_face_t *face HB_UNUSED, hb_tag_t tag, void *user_data /** * hb_ft_face_create: * @ft_face: (destroy destroy) (scope notified): FT_Face to work upon - * @destroy: (optional): A callback to call when the face object is not needed anymore + * @destroy: (nullable): A callback to call when the face object is not needed anymore * * Creates an #hb_face_t face object from the specified FT_Face. * @@ -771,13 +771,13 @@ hb_ft_face_create_cached (FT_Face ft_face) /** * hb_ft_font_create: * @ft_face: (destroy destroy) (scope notified): FT_Face to work upon - * @destroy: (optional): A callback to call when the font object is not needed anymore + * @destroy: (nullable): A callback to call when the font object is not needed anymore * * Creates an #hb_font_t font object from the specified FT_Face. * * <note>Note: You must set the face size on @ft_face before calling - * hb_ft_font_create() on it. Otherwise, HarfBuzz will not pick up - * the face size.</note> + * hb_ft_font_create() on it. HarfBuzz assumes size is always set and will + * access `size` member of FT_Face unconditionally.</note> * * This variant of the function does not provide any life-cycle management. * @@ -814,7 +814,7 @@ hb_ft_font_create (FT_Face ft_face, } /** - * hb_ft_font_has_changed: + * hb_ft_font_changed: * @font: #hb_font_t to work upon * * Refreshes the state of @font when the underlying FT_Face has changed. @@ -884,8 +884,8 @@ hb_ft_font_changed (hb_font_t *font) * Creates an #hb_font_t font object from the specified FT_Face. * * <note>Note: You must set the face size on @ft_face before calling - * hb_ft_font_create_references() on it. Otherwise, HarfBuzz will not pick up - * the face size.</note> + * hb_ft_font_create_referenced() on it. HarfBuzz assumes size is always set + * and will access `size` member of FT_Face unconditionally.</note> * * This is the preferred variant of the hb_ft_font_create* * function family, because it calls FT_Reference_Face() on @ft_face, diff --git a/thirdparty/harfbuzz/src/hb-gdi.cc b/thirdparty/harfbuzz/src/hb-gdi.cc index 3a67cef160..dc4659c7f6 100644 --- a/thirdparty/harfbuzz/src/hb-gdi.cc +++ b/thirdparty/harfbuzz/src/hb-gdi.cc @@ -70,6 +70,8 @@ fail: * hb_gdi_face_create: * @hfont: a HFONT object. * + * Constructs a new face object from the specified GDI HFONT. 
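Related to the hb-ft notes above: the size must be set on the `FT_Face` before `hb_ft_font_create_referenced()` is called. A sketch of that order; the 16 pt at 72 dpi size is an arbitrary example:

```
#include <hb.h>
#include <hb-ft.h>
#include <ft2build.h>
#include FT_FREETYPE_H

/* Creates an hb_font_t from a font file via FreeType. */
static hb_font_t *
load_ft_font (FT_Library library, const char *path)
{
  FT_Face ft_face = NULL;
  if (FT_New_Face (library, path, 0, &ft_face))
    return NULL;

  /* Set the size first; hb-ft reads FT_Face::size unconditionally. */
  FT_Set_Char_Size (ft_face, 16 * 64, 16 * 64, 72, 72);

  /* The referenced variant keeps ft_face alive for the font's lifetime. */
  hb_font_t *font = hb_ft_font_create_referenced (ft_face);
  FT_Done_Face (ft_face);   /* drop our reference; HarfBuzz holds its own */
  return font;
}
```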
+ * * Return value: #hb_face_t object corresponding to the given input * * Since: 2.6.0 diff --git a/thirdparty/harfbuzz/src/hb-gobject-structs.h b/thirdparty/harfbuzz/src/hb-gobject-structs.h index 6fad8d7019..63467f80df 100644 --- a/thirdparty/harfbuzz/src/hb-gobject-structs.h +++ b/thirdparty/harfbuzz/src/hb-gobject-structs.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_GOBJECT_H_IN +#if !defined(HB_GOBJECT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-gobject.h> instead." #endif @@ -40,47 +40,22 @@ HB_BEGIN_DECLS /* Object types */ -/** - * hb_gobject_blob_get_type: - * - * Since: 0.9.2 - **/ HB_EXTERN GType hb_gobject_blob_get_type (void); #define HB_GOBJECT_TYPE_BLOB (hb_gobject_blob_get_type ()) -/** - * hb_gobject_buffer_get_type: - * - * Since: 0.9.2 - **/ HB_EXTERN GType hb_gobject_buffer_get_type (void); #define HB_GOBJECT_TYPE_BUFFER (hb_gobject_buffer_get_type ()) -/** - * hb_gobject_face_get_type: - * - * Since: 0.9.2 - **/ HB_EXTERN GType hb_gobject_face_get_type (void); #define HB_GOBJECT_TYPE_FACE (hb_gobject_face_get_type ()) -/** - * hb_gobject_font_get_type: - * - * Since: 0.9.2 - **/ HB_EXTERN GType hb_gobject_font_get_type (void); #define HB_GOBJECT_TYPE_FONT (hb_gobject_font_get_type ()) -/** - * hb_gobject_font_funcs_get_type: - * - * Since: 0.9.2 - **/ HB_EXTERN GType hb_gobject_font_funcs_get_type (void); #define HB_GOBJECT_TYPE_FONT_FUNCS (hb_gobject_font_funcs_get_type ()) @@ -97,11 +72,6 @@ HB_EXTERN GType hb_gobject_shape_plan_get_type (void); #define HB_GOBJECT_TYPE_SHAPE_PLAN (hb_gobject_shape_plan_get_type ()) -/** - * hb_gobject_unicode_funcs_get_type: - * - * Since: 0.9.2 - **/ HB_EXTERN GType hb_gobject_unicode_funcs_get_type (void); #define HB_GOBJECT_TYPE_UNICODE_FUNCS (hb_gobject_unicode_funcs_get_type ()) diff --git a/thirdparty/harfbuzz/src/hb-graphite2.cc b/thirdparty/harfbuzz/src/hb-graphite2.cc index d8a72dc2f1..9dafe654c8 100644 --- a/thirdparty/harfbuzz/src/hb-graphite2.cc +++ b/thirdparty/harfbuzz/src/hb-graphite2.cc @@ -195,6 +195,11 @@ _hb_graphite2_shaper_font_data_destroy (hb_graphite2_font_data_t *data HB_UNUSED #ifndef HB_DISABLE_DEPRECATED /** * hb_graphite2_font_get_gr_font: + * @font: An #hb_font_t + * + * Always returns %NULL. Use hb_graphite2_face_get_gr_face() instead. + * + * Return value: (nullable): Graphite2 font associated with @font. 
* * Since: 0.9.10 * Deprecated: 1.4.2 @@ -284,7 +289,7 @@ _hb_graphite2_shape (hb_shape_plan_t *shape_plan HB_UNUSED, return true; } - buffer->ensure (glyph_count); + (void) buffer->ensure (glyph_count); scratch = buffer->get_scratch_buffer (&scratch_size); while ((DIV_CEIL (sizeof (hb_graphite2_cluster_t) * buffer->len, sizeof (*scratch)) + DIV_CEIL (sizeof (hb_codepoint_t) * glyph_count, sizeof (*scratch))) > scratch_size) diff --git a/thirdparty/harfbuzz/src/hb-iter.hh b/thirdparty/harfbuzz/src/hb-iter.hh index 981c5c218c..f7018150e4 100644 --- a/thirdparty/harfbuzz/src/hb-iter.hh +++ b/thirdparty/harfbuzz/src/hb-iter.hh @@ -922,7 +922,7 @@ HB_FUNCOBJ (hb_none); template <typename C, typename V, hb_requires (hb_is_iterable (C))> inline void -hb_fill (C& c, const V &v) +hb_fill (C&& c, const V &v) { for (auto i = hb_iter (c); i; i++) *i = v; diff --git a/thirdparty/harfbuzz/src/hb-machinery.hh b/thirdparty/harfbuzz/src/hb-machinery.hh index 54bc60d4c8..3bd5a979b0 100644 --- a/thirdparty/harfbuzz/src/hb-machinery.hh +++ b/thirdparty/harfbuzz/src/hb-machinery.hh @@ -80,6 +80,11 @@ static inline Type& StructAfter(TObject &X) * Size checking */ +/* Size signifying variable-sized array */ +#ifndef HB_VAR_ARRAY +#define HB_VAR_ARRAY 1 +#endif + /* Check _assertion in a method environment */ #define _DEFINE_INSTANCE_ASSERTION1(_line, _assertion) \ void _instance_assertion_on_line_##_line () const \ diff --git a/thirdparty/harfbuzz/src/hb-map.cc b/thirdparty/harfbuzz/src/hb-map.cc index f898bd8f92..f115da2bb8 100644 --- a/thirdparty/harfbuzz/src/hb-map.cc +++ b/thirdparty/harfbuzz/src/hb-map.cc @@ -117,7 +117,7 @@ hb_map_destroy (hb_map_t *map) * @map: A map * @key: The user-data key to set * @data: A pointer to the user data to set - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified map. @@ -162,7 +162,7 @@ hb_map_get_user_data (hb_map_t *map, * * Tests whether memory allocation for a set was successful. * - * Return value: %true if allocation succeeded, false otherwise + * Return value: %true if allocation succeeded, %false otherwise * * Since: 1.7.7 **/ @@ -230,7 +230,7 @@ hb_map_del (hb_map_t *map, * * Tests whether @key is an element of @map. * - * Return value: %true if @key is found in @map, false otherwise + * Return value: %true if @key is found in @map, %false otherwise * * Since: 1.7.7 **/ @@ -253,6 +253,9 @@ hb_map_has (const hb_map_t *map, void hb_map_clear (hb_map_t *map) { + if (unlikely (hb_object_is_immutable (map))) + return; + return map->clear (); } diff --git a/thirdparty/harfbuzz/src/hb-map.h b/thirdparty/harfbuzz/src/hb-map.h index 0c19ac8fb5..6a45a7bdd5 100644 --- a/thirdparty/harfbuzz/src/hb-map.h +++ b/thirdparty/harfbuzz/src/hb-map.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -36,7 +36,11 @@ HB_BEGIN_DECLS -/* +/** + * HB_MAP_VALUE_INVALID: + * + * Unset #hb_map_t value. 
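A small round trip through `hb_map_t` using the `HB_MAP_VALUE_INVALID` sentinel documented above:

```
#include <hb.h>

static void
map_demo (void)
{
  hb_map_t *map = hb_map_create ();
  hb_map_set (map, 10, 42);

  hb_codepoint_t v       = hb_map_get (map, 10);  /* 42 */
  hb_codepoint_t missing = hb_map_get (map, 11);  /* HB_MAP_VALUE_INVALID */
  (void) v;

  if (missing == HB_MAP_VALUE_INVALID && !hb_map_has (map, 11))
    hb_map_del (map, 10);

  hb_map_destroy (map);
}
```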
+ * * Since: 1.7.7 */ #define HB_MAP_VALUE_INVALID ((hb_codepoint_t) -1) diff --git a/thirdparty/harfbuzz/src/hb-map.hh b/thirdparty/harfbuzz/src/hb-map.hh index 92c1bd67e5..84fe1d549b 100644 --- a/thirdparty/harfbuzz/src/hb-map.hh +++ b/thirdparty/harfbuzz/src/hb-map.hh @@ -97,8 +97,6 @@ struct hb_hashmap_t void reset () { - if (unlikely (hb_object_is_immutable (this))) - return; successful = true; clear (); } @@ -171,8 +169,6 @@ struct hb_hashmap_t void clear () { - if (unlikely (hb_object_is_immutable (this))) - return; if (items) for (auto &_ : hb_iter (items, mask + 1)) _.clear (); @@ -181,6 +177,7 @@ struct hb_hashmap_t } bool is_empty () const { return population == 0; } + explicit operator bool () const { return !is_empty (); } unsigned int get_population () const { return population; } diff --git a/thirdparty/harfbuzz/src/hb-meta.hh b/thirdparty/harfbuzz/src/hb-meta.hh index 4c0898b1b7..e40d9fd178 100644 --- a/thirdparty/harfbuzz/src/hb-meta.hh +++ b/thirdparty/harfbuzz/src/hb-meta.hh @@ -49,6 +49,10 @@ template <bool b> using hb_bool_constant = hb_integral_constant<bool, b>; using hb_true_type = hb_bool_constant<true>; using hb_false_type = hb_bool_constant<false>; +/* Static-assert as expression. */ +template <bool cond> struct static_assert_expr; +template <> struct static_assert_expr<true> : hb_false_type {}; +#define static_assert_expr(C) static_assert_expr<C>::value /* Basic type SFINAE. */ @@ -220,6 +224,8 @@ struct hb_reference_wrapper<T&> }; +/* Type traits */ + template <typename T> using hb_is_integral = hb_bool_constant< hb_is_same (hb_decay<T>, char) || @@ -292,6 +298,15 @@ template <> struct hb_int_max<unsigned long long> : hb_integral_constant<unsigne #define hb_int_max(T) hb_int_max<T>::value +/* Class traits. */ + +#define HB_DELETE_COPY_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete +#define HB_DELETE_CREATE_COPY_ASSIGN(TypeName) \ + TypeName() = delete; \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete template <typename T, typename> struct _hb_is_destructible : hb_false_type {}; diff --git a/thirdparty/harfbuzz/src/hb-mutex.hh b/thirdparty/harfbuzz/src/hb-mutex.hh index 56392d049b..2fc8d7ee58 100644 --- a/thirdparty/harfbuzz/src/hb-mutex.hh +++ b/thirdparty/harfbuzz/src/hb-mutex.hh @@ -73,24 +73,6 @@ typedef CRITICAL_SECTION hb_mutex_impl_t; #define hb_mutex_impl_finish(M) DeleteCriticalSection (M) -#elif !defined(HB_NO_MT) && defined(HAVE_INTEL_ATOMIC_PRIMITIVES) - -#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_YIELD) -# include <sched.h> -# define HB_SCHED_YIELD() sched_yield () -#else -# define HB_SCHED_YIELD() HB_STMT_START {} HB_STMT_END -#endif - -/* This actually is not a totally awful implementation. */ -typedef volatile int hb_mutex_impl_t; -#define HB_MUTEX_IMPL_INIT 0 -#define hb_mutex_impl_init(M) *(M) = 0 -#define hb_mutex_impl_lock(M) HB_STMT_START { while (__sync_lock_test_and_set((M), 1)) HB_SCHED_YIELD (); } HB_STMT_END -#define hb_mutex_impl_unlock(M) __sync_lock_release (M) -#define hb_mutex_impl_finish(M) HB_STMT_START {} HB_STMT_END - - #elif defined(HB_NO_MT) typedef int hb_mutex_impl_t; diff --git a/thirdparty/harfbuzz/src/hb-object.hh b/thirdparty/harfbuzz/src/hb-object.hh index 39845a70e7..f3048b1c3e 100644 --- a/thirdparty/harfbuzz/src/hb-object.hh +++ b/thirdparty/harfbuzz/src/hb-object.hh @@ -140,9 +140,7 @@ struct hb_lockable_set_t * Reference-count. 
*/ -#define HB_REFERENCE_COUNT_INERT_VALUE 0 -#define HB_REFERENCE_COUNT_POISON_VALUE -0x0000DEAD -#define HB_REFERENCE_COUNT_INIT {HB_ATOMIC_INT_INIT (HB_REFERENCE_COUNT_INERT_VALUE)} +#define HB_REFERENCE_COUNT_INIT {0} struct hb_reference_count_t { @@ -152,9 +150,9 @@ struct hb_reference_count_t int get_relaxed () const { return ref_count.get_relaxed (); } int inc () const { return ref_count.inc (); } int dec () const { return ref_count.dec (); } - void fini () { ref_count.set_relaxed (HB_REFERENCE_COUNT_POISON_VALUE); } + void fini () { ref_count.set_relaxed (-0x0000DEAD); } - bool is_inert () const { return ref_count.get_relaxed () == HB_REFERENCE_COUNT_INERT_VALUE; } + bool is_inert () const { return !ref_count.get_relaxed (); } bool is_valid () const { return ref_count.get_relaxed () > 0; } }; @@ -197,15 +195,10 @@ struct hb_user_data_array_t struct hb_object_header_t { hb_reference_count_t ref_count; - mutable hb_atomic_int_t writable; + mutable hb_atomic_int_t writable = 0; hb_atomic_ptr_t<hb_user_data_array_t> user_data; }; -#define HB_OBJECT_HEADER_STATIC \ - { \ - HB_REFERENCE_COUNT_INIT, \ - HB_ATOMIC_INT_INIT (false), \ - HB_ATOMIC_PTR_INIT (nullptr) \ - } +#define HB_OBJECT_HEADER_STATIC {} /* diff --git a/thirdparty/harfbuzz/src/hb-open-file.hh b/thirdparty/harfbuzz/src/hb-open-file.hh index ac13dd23c3..54c07ff13c 100644 --- a/thirdparty/harfbuzz/src/hb-open-file.hh +++ b/thirdparty/harfbuzz/src/hb-open-file.hh @@ -48,7 +48,7 @@ namespace OT { */ struct OpenTypeFontFile; -struct OffsetTable; +struct OpenTypeOffsetTable; struct TTCHeader; @@ -78,7 +78,7 @@ typedef struct TableRecord DEFINE_SIZE_STATIC (16); } OpenTypeTable; -typedef struct OffsetTable +typedef struct OpenTypeOffsetTable { friend struct OpenTypeFontFile; @@ -218,7 +218,7 @@ struct TTCHeaderVersion1 Tag ttcTag; /* TrueType Collection ID string: 'ttcf' */ FixedVersion<>version; /* Version of the TTC Header (1.0), * 0x00010000u */ - LArrayOf<LOffsetTo<OffsetTable>> + LArrayOf<LOffsetTo<OpenTypeOffsetTable>> table; /* Array of offsets to the OffsetTable for each font * from the beginning of the file */ public: diff --git a/thirdparty/harfbuzz/src/hb-open-type.hh b/thirdparty/harfbuzz/src/hb-open-type.hh index 99634b76f0..dc0ae1d989 100644 --- a/thirdparty/harfbuzz/src/hb-open-type.hh +++ b/thirdparty/harfbuzz/src/hb-open-type.hh @@ -53,14 +53,19 @@ namespace OT { */ /* Integer types in big-endian order and no alignment requirement */ -template <typename Type, unsigned int Size> +template <typename Type, + unsigned int Size = sizeof (Type)> struct IntType { typedef Type type; - typedef hb_conditional<hb_is_signed (Type), signed, unsigned> wide_type; - IntType& operator = (wide_type i) { v = i; return *this; } - operator wide_type () const { return v; } + IntType () = default; + explicit constexpr IntType (Type V) : v {V} {} + IntType& operator = (Type i) { v = i; return *this; } + /* For reason we define cast out operator for signed/unsigned, instead of Type, see: + * https://github.com/harfbuzz/harfbuzz/pull/2875/commits/09836013995cab2b9f07577a179ad7b024130467 */ + operator hb_conditional<hb_is_signed (Type), signed, unsigned> () const { return v; } + bool operator == (const IntType &o) const { return (Type) v == (Type) o.v; } bool operator != (const IntType &o) const { return !(*this == o); } @@ -80,14 +85,21 @@ struct IntType return pb->cmp (*pa); } - template <typename Type2> + template <typename Type2, + hb_enable_if (hb_is_integral (Type2) && + sizeof (Type2) < sizeof (int) && + sizeof (Type) < sizeof 
(int))> int cmp (Type2 a) const { Type b = v; - if (sizeof (Type) < sizeof (int) && sizeof (Type2) < sizeof (int)) - return (int) a - (int) b; - else - return a < b ? -1 : a == b ? 0 : +1; + return (int) a - (int) b; + } + template <typename Type2, + hb_enable_if (hb_is_convertible (Type2, Type))> + int cmp (Type2 a) const + { + Type b = v; + return a < b ? -1 : a == b ? 0 : +1; } bool sanitize (hb_sanitize_context_t *c) const { @@ -100,12 +112,12 @@ struct IntType DEFINE_SIZE_STATIC (Size); }; -typedef IntType<uint8_t, 1> HBUINT8; /* 8-bit unsigned integer. */ -typedef IntType<int8_t, 1> HBINT8; /* 8-bit signed integer. */ -typedef IntType<uint16_t, 2> HBUINT16; /* 16-bit unsigned integer. */ -typedef IntType<int16_t, 2> HBINT16; /* 16-bit signed integer. */ -typedef IntType<uint32_t, 4> HBUINT32; /* 32-bit unsigned integer. */ -typedef IntType<int32_t, 4> HBINT32; /* 32-bit signed integer. */ +typedef IntType<uint8_t> HBUINT8; /* 8-bit unsigned integer. */ +typedef IntType<int8_t> HBINT8; /* 8-bit signed integer. */ +typedef IntType<uint16_t> HBUINT16; /* 16-bit unsigned integer. */ +typedef IntType<int16_t> HBINT16; /* 16-bit signed integer. */ +typedef IntType<uint32_t> HBUINT32; /* 32-bit unsigned integer. */ +typedef IntType<int32_t> HBINT32; /* 32-bit signed integer. */ /* Note: we cannot defined a signed HBINT24 because there's no corresponding C type. * Works for unsigned, but not signed, since we rely on compiler for sign-extension. */ typedef IntType<uint32_t, 3> HBUINT24; /* 24-bit unsigned integer. */ @@ -163,8 +175,8 @@ struct Tag : HBUINT32 { Tag& operator = (hb_tag_t i) { HBUINT32::operator= (i); return *this; } /* What the char* converters return is NOT nul-terminated. Print using "%.4s" */ - operator const char* () const { return reinterpret_cast<const char *> (&this->v); } - operator char* () { return reinterpret_cast<char *> (&this->v); } + operator const char* () const { return reinterpret_cast<const char *> (this); } + operator char* () { return reinterpret_cast<char *> (this); } public: DEFINE_SIZE_STATIC (4); }; diff --git a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh index e5286cd792..864a27f458 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh +++ b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh @@ -183,7 +183,7 @@ struct CFFIndex else { serialize_header(c, + it | hb_map ([] (const byte_str_t &_) { return _.length; })); - for (const byte_str_t &_ : +it) + for (const auto &_ : +it) _.copy (c); } return_trace (true); diff --git a/thirdparty/harfbuzz/src/hb-ot-cff1-table.cc b/thirdparty/harfbuzz/src/hb-ot-cff1-table.cc index 66b9c8c907..3298fa35ae 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cff1-table.cc +++ b/thirdparty/harfbuzz/src/hb-ot-cff1-table.cc @@ -426,7 +426,7 @@ bool OT::cff1::accelerator_t::get_extents (hb_font_t *font, hb_codepoint_t glyph else { extents->x_bearing = font->em_scalef_x (bounds.min.x.to_real ()); - extents->width = font->em_scalef_x (bounds.max.x.to_real () - bounds.min.x.to_real ()); + extents->width = font->em_scalef_x (bounds.max.x.to_real ()) - extents->x_bearing; } if (bounds.min.y >= bounds.max.y) { @@ -436,7 +436,7 @@ bool OT::cff1::accelerator_t::get_extents (hb_font_t *font, hb_codepoint_t glyph else { extents->y_bearing = font->em_scalef_y (bounds.max.y.to_real ()); - extents->height = font->em_scalef_y (bounds.min.y.to_real () - bounds.max.y.to_real ()); + extents->height = font->em_scalef_y (bounds.min.y.to_real ()) - extents->y_bearing; } return true; diff --git 
a/thirdparty/harfbuzz/src/hb-ot-cff2-table.cc b/thirdparty/harfbuzz/src/hb-ot-cff2-table.cc index ac0feeee21..879b7cdb23 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cff2-table.cc +++ b/thirdparty/harfbuzz/src/hb-ot-cff2-table.cc @@ -127,7 +127,7 @@ bool OT::cff2::accelerator_t::get_extents (hb_font_t *font, else { extents->x_bearing = font->em_scalef_x (param.min_x.to_real ()); - extents->width = font->em_scalef_x (param.max_x.to_real () - param.min_x.to_real ()); + extents->width = font->em_scalef_x (param.max_x.to_real ()) - extents->x_bearing; } if (param.min_y >= param.max_y) { @@ -137,7 +137,7 @@ bool OT::cff2::accelerator_t::get_extents (hb_font_t *font, else { extents->y_bearing = font->em_scalef_y (param.max_y.to_real ()); - extents->height = font->em_scalef_y (param.min_y.to_real () - param.max_y.to_real ()); + extents->height = font->em_scalef_y (param.min_y.to_real ()) - extents->y_bearing; } return true; diff --git a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh index cc48379bb8..878e02ff17 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh @@ -95,7 +95,7 @@ struct CmapSubtableFormat4 HBUINT16 *endCode = c->start_embed<HBUINT16> (); hb_codepoint_t prev_endcp = 0xFFFF; - for (const hb_item_type<Iterator> _ : +it) + for (const auto& _ : +it) { if (prev_endcp != 0xFFFF && prev_endcp + 1u != _.first) { @@ -131,7 +131,7 @@ struct CmapSubtableFormat4 HBUINT16 *startCode = c->start_embed<HBUINT16> (); hb_codepoint_t prev_cp = 0xFFFF; - for (const hb_item_type<Iterator> _ : +it) + for (const auto& _ : +it) { if (prev_cp == 0xFFFF || prev_cp + 1u != _.first) { @@ -170,7 +170,7 @@ struct CmapSubtableFormat4 if ((char *)idDelta - (char *)startCode != (int) segcount * (int) HBINT16::static_size) return nullptr; - for (const hb_item_type<Iterator> _ : +it) + for (const auto& _ : +it) { if (_.first == startCode[i]) { @@ -696,7 +696,7 @@ struct CmapSubtableFormat12 : CmapSubtableLongSegmented<CmapSubtableFormat12> hb_codepoint_t startCharCode = 0xFFFF, endCharCode = 0xFFFF; hb_codepoint_t glyphID = 0; - for (const hb_item_type<Iterator> _ : +it) + for (const auto& _ : +it) { if (startCharCode == 0xFFFF) { diff --git a/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh index aaa1c37c64..e285acec3d 100644 --- a/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh @@ -455,8 +455,8 @@ struct IndexSubtableRecord unsigned int old_cbdt_prime_length = bitmap_size_context->cbdt_prime->length; // Set to invalid state to indicate filling glyphs is not yet started. 
- if (unlikely (!records->resize (records->length + 1))) - return_trace (c->serializer->check_success (false)); + if (unlikely (!c->serializer->check_success (records->resize (records->length + 1)))) + return_trace (false); (*records)[records->length - 1].firstGlyphIndex = 1; (*records)[records->length - 1].lastGlyphIndex = 0; @@ -567,8 +567,8 @@ struct IndexSubtableArray hb_vector_t<hb_pair_t<hb_codepoint_t, const IndexSubtableRecord*>> lookup; build_lookup (c, bitmap_size_context, &lookup); - if (unlikely (lookup.in_error ())) - return c->serializer->check_success (false); + if (unlikely (!c->serializer->propagate_error (lookup))) + return false; bitmap_size_context->size = 0; bitmap_size_context->num_tables = 0; diff --git a/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh index 92a49bb4f4..e2a1ff4662 100644 --- a/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh @@ -214,7 +214,7 @@ struct COLR if (unlikely (!old_record)) return hb_pair_t<bool, BaseGlyphRecord> (false, Null (BaseGlyphRecord)); - BaseGlyphRecord new_record; + BaseGlyphRecord new_record = {}; new_record.glyphId = new_gid; new_record.numLayers = old_record->numLayers; return hb_pair_t<bool, BaseGlyphRecord> (true, new_record); diff --git a/thirdparty/harfbuzz/src/hb-ot-color.cc b/thirdparty/harfbuzz/src/hb-ot-color.cc index 0e7203a88b..4170b71317 100644 --- a/thirdparty/harfbuzz/src/hb-ot-color.cc +++ b/thirdparty/harfbuzz/src/hb-ot-color.cc @@ -37,9 +37,6 @@ #include "hb-ot-color-sbix-table.hh" #include "hb-ot-color-svg-table.hh" -#include <stdlib.h> -#include <string.h> - /** * SECTION:hb-ot-color @@ -64,7 +61,7 @@ * * Tests whether a face includes a `CPAL` color-palette table. * - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.1.0 */ @@ -195,7 +192,7 @@ hb_ot_color_palette_get_colors (hb_face_t *face, * * Tests whether a face includes any `COLR` color layers. * - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.1.0 */ @@ -242,7 +239,7 @@ hb_ot_color_glyph_get_layers (hb_face_t *face, * * Tests whether a face includes any `SVG` glyph images. * - * Return value: true if data found, false otherwise. + * Return value: %true if data found, %false otherwise. * * Since: 2.1.0 */ @@ -280,7 +277,7 @@ hb_ot_color_glyph_reference_svg (hb_face_t *face, hb_codepoint_t glyph) * * Tests whether a face has PNG glyph images (either in `CBDT` or `sbix` tables). * - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.1.0 */ diff --git a/thirdparty/harfbuzz/src/hb-ot-color.h b/thirdparty/harfbuzz/src/hb-ot-color.h index 4f37a4386f..c23ce4de44 100644 --- a/thirdparty/harfbuzz/src/hb-ot-color.h +++ b/thirdparty/harfbuzz/src/hb-ot-color.h @@ -26,7 +26,7 @@ * Google Author(s): Sascha Brawer, Behdad Esfahbod */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -66,6 +66,8 @@ hb_ot_color_palette_color_get_name_id (hb_face_t *face, * @HB_OT_COLOR_PALETTE_FLAG_USABLE_WITH_DARK_BACKGROUND: Flag indicating that the color * palette is appropriate to use when displaying the font on a dark background such as black. * + * Flags that describe the properties of color palette. 
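The color-layer API touched above can be exercised with a short loop over `hb_ot_color_glyph_get_layers()`; the buffer size of 16 is arbitrary:

```
#include <hb.h>
#include <hb-ot.h>
#include <stdio.h>

/* Walks the COLR layers of one glyph. */
static void
dump_layers (hb_face_t *face, hb_codepoint_t glyph)
{
  if (!hb_ot_color_has_layers (face))
    return;

  hb_ot_color_layer_t layers[16];
  unsigned offset = 0;
  for (;;)
  {
    unsigned count = 16;   /* in: capacity, out: layers written */
    unsigned total = hb_ot_color_glyph_get_layers (face, glyph, offset, &count, layers);
    for (unsigned i = 0; i < count; i++)
      printf ("layer %u: glyph %u, palette color index %u\n",
              offset + i, (unsigned) layers[i].glyph, layers[i].color_index);
    offset += count;
    if (!count || offset >= total)
      break;
  }
}
```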
+ * * Since: 2.1.0 */ typedef enum { /*< flags >*/ @@ -95,6 +97,8 @@ hb_ot_color_has_layers (hb_face_t *face); /** * hb_ot_color_layer_t: + * @glyph: the glyph ID of the layer + * @color_index: the palette color index of the layer * * Pairs of glyph and color index. * diff --git a/thirdparty/harfbuzz/src/hb-ot-deprecated.h b/thirdparty/harfbuzz/src/hb-ot-deprecated.h index 2e75deef2d..ce6b6fef11 100644 --- a/thirdparty/harfbuzz/src/hb-ot-deprecated.h +++ b/thirdparty/harfbuzz/src/hb-ot-deprecated.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -41,6 +41,13 @@ HB_BEGIN_DECLS /* https://github.com/harfbuzz/harfbuzz/issues/1734 */ +/** + * HB_MATH_GLYPH_PART_FLAG_EXTENDER: + * + * Use #HB_OT_MATH_GLYPH_PART_FLAG_EXTENDER instead. + * + * Deprecated: 2.5.1 + */ #define HB_MATH_GLYPH_PART_FLAG_EXTENDER HB_OT_MATH_GLYPH_PART_FLAG_EXTENDER @@ -71,6 +78,8 @@ hb_ot_tag_from_language (hb_language_t language); /** * HB_OT_VAR_NO_AXIS_INDEX: * + * Do not use. + * * Since: 1.4.2 * Deprecated: 2.2.0 */ @@ -78,6 +87,13 @@ hb_ot_tag_from_language (hb_language_t language); /** * hb_ot_var_axis_t: + * @tag: axis tag + * @name_id: axis name identifier + * @min_value: minimum value of the axis + * @default_value: default value of the axis + * @max_value: maximum value of the axis + * + * Use #hb_ot_var_axis_info_t instead. * * Since: 1.4.2 * Deprecated: 2.2.0 diff --git a/thirdparty/harfbuzz/src/hb-ot-font.h b/thirdparty/harfbuzz/src/hb-ot-font.h index 80eaa54b1a..e7959d1ae2 100644 --- a/thirdparty/harfbuzz/src/hb-ot-font.h +++ b/thirdparty/harfbuzz/src/hb-ot-font.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod, Roozbeh Pournader */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." 
#endif diff --git a/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh b/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh index 5470bd96da..5352156f02 100644 --- a/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh @@ -186,7 +186,7 @@ struct glyf | hb_map (&SubsetGlyph::padded_size) ; - if (c->serializer->in_error ()) return_trace (false); + if (unlikely (c->serializer->in_error ())) return_trace (false); return_trace (c->serializer->check_success (_add_loca_and_head (c->plan, padded_offsets))); } @@ -944,9 +944,9 @@ struct glyf return; } extents->x_bearing = font->em_scalef_x (min_x); - extents->width = font->em_scalef_x (max_x - min_x); + extents->width = font->em_scalef_x (max_x) - extents->x_bearing; extents->y_bearing = font->em_scalef_y (max_y); - extents->height = font->em_scalef_y (min_y - max_y); + extents->height = font->em_scalef_y (min_y) - extents->y_bearing; } protected: diff --git a/thirdparty/harfbuzz/src/hb-ot-head-table.hh b/thirdparty/harfbuzz/src/hb-ot-head-table.hh index 5613a96dbf..ac588e3af6 100644 --- a/thirdparty/harfbuzz/src/hb-ot-head-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-head-table.hh @@ -43,7 +43,7 @@ namespace OT { struct head { - friend struct OffsetTable; + friend struct OpenTypeOffsetTable; static constexpr hb_tag_t tableTag = HB_OT_TAG_head; diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh index 6ab950a322..0ba7e3c061 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh @@ -1128,7 +1128,7 @@ struct Lookup out->lookupType = lookupType; out->lookupFlag = lookupFlag; - const hb_set_t *glyphset = c->plan->glyphset (); + const hb_set_t *glyphset = c->plan->glyphset_gsub (); unsigned int lookup_type = get_type (); + hb_iter (get_subtables <TSubTable> ()) | hb_filter ([this, glyphset, lookup_type] (const OffsetTo<TSubTable> &_) { return (this+_).intersects (glyphset, lookup_type); }) @@ -1251,8 +1251,9 @@ struct CoverageFormat1 { /* TODO Speed up, using hb_set_next() and bsearch()? */ unsigned int count = glyphArray.len; + const HBGlyphID *arr = glyphArray.arrayZ; for (unsigned int i = 0; i < count; i++) - if (glyphs->has (glyphArray[i])) + if (glyphs->has (arr[i])) return true; return false; } @@ -1356,18 +1357,21 @@ struct CoverageFormat2 bool intersects (const hb_set_t *glyphs) const { /* TODO Speed up, using hb_set_next() and bsearch()? */ - unsigned int count = rangeRecord.len; - for (unsigned int i = 0; i < count; i++) - if (rangeRecord[i].intersects (glyphs)) + /* TODO(iter) Rewrite as dagger. */ + unsigned count = rangeRecord.len; + const RangeRecord *arr = rangeRecord.arrayZ; + for (unsigned i = 0; i < count; i++) + if (arr[i].intersects (glyphs)) return true; return false; } bool intersects_coverage (const hb_set_t *glyphs, unsigned int index) const { - unsigned int i; - unsigned int count = rangeRecord.len; - for (i = 0; i < count; i++) { - const RangeRecord &range = rangeRecord[i]; + /* TODO(iter) Rewrite as dagger. 
*/ + unsigned count = rangeRecord.len; + const RangeRecord *arr = rangeRecord.arrayZ; + for (unsigned i = 0; i < count; i++) { + const RangeRecord &range = arr[i]; if (range.value <= index && index < (unsigned int) range.value + (range.last - range.first) && range.intersects (glyphs)) @@ -1502,7 +1506,7 @@ struct Coverage bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto it = @@ -1729,7 +1733,7 @@ struct ClassDefFormat1 hb_map_t *klass_map = nullptr /*OUT*/) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->_glyphset_gsub; + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; hb_sorted_vector_t<HBGlyphID> glyphs; @@ -1784,7 +1788,7 @@ struct ClassDefFormat1 } template <typename set_t> - bool collect_class (set_t *glyphs, unsigned int klass) const + bool collect_class (set_t *glyphs, unsigned klass) const { unsigned int count = classValue.len; for (unsigned int i = 0; i < count; i++) @@ -1802,7 +1806,7 @@ struct ClassDefFormat1 if (classValue[iter - start]) return true; return false; } - bool intersects_class (const hb_set_t *glyphs, unsigned int klass) const + bool intersects_class (const hb_set_t *glyphs, uint16_t klass) const { unsigned int count = classValue.len; if (klass == 0) @@ -1815,8 +1819,12 @@ struct ClassDefFormat1 if (hb_set_next (glyphs, &g)) return true; /* Fall through. */ } + /* TODO Speed up, using set overlap first? */ + /* TODO(iter) Rewrite as dagger. */ + HBUINT16 k {klass}; + const HBUINT16 *arr = classValue.arrayZ; for (unsigned int i = 0; i < count; i++) - if (classValue[i] == klass && glyphs->has (startGlyph + i)) + if (arr[i] == k && glyphs->has (startGlyph + i)) return true; return false; } @@ -1898,7 +1906,7 @@ struct ClassDefFormat2 hb_map_t *klass_map = nullptr /*OUT*/) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->_glyphset_gsub; + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; hb_sorted_vector_t<HBGlyphID> glyphs; @@ -1961,11 +1969,14 @@ struct ClassDefFormat2 /* TODO Speed up, using hb_set_next() and bsearch()? */ unsigned int count = rangeRecord.len; for (unsigned int i = 0; i < count; i++) - if (rangeRecord[i].intersects (glyphs)) + { + const auto& range = rangeRecord[i]; + if (range.intersects (glyphs) && range.value) return true; + } return false; } - bool intersects_class (const hb_set_t *glyphs, unsigned int klass) const + bool intersects_class (const hb_set_t *glyphs, uint16_t klass) const { unsigned int count = rangeRecord.len; if (klass == 0) @@ -1984,8 +1995,12 @@ struct ClassDefFormat2 return true; /* Fall through. */ } + /* TODO Speed up, using set overlap first? */ + /* TODO(iter) Rewrite as dagger. 
*/ + HBUINT16 k {klass}; + const RangeRecord *arr = rangeRecord.arrayZ; for (unsigned int i = 0; i < count; i++) - if (rangeRecord[i].value == klass && rangeRecord[i].intersects (glyphs)) + if (arr[i].value == k && arr[i].intersects (glyphs)) return true; return false; } diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh index 2217d298fb..f523e35c00 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh @@ -566,6 +566,26 @@ struct AnchorMatrix return_trace (true); } + bool subset (hb_subset_context_t *c, + unsigned cols, + const hb_map_t *klass_mapping) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->start_embed (*this); + + auto indexes = + + hb_range (rows * cols) + | hb_filter ([=] (unsigned index) { return klass_mapping->has (index % cols); }) + ; + + out->serialize (c->serializer, + (unsigned) rows, + this, + c->plan->layout_variation_idx_map, + indexes); + return_trace (true); + } + bool sanitize (hb_sanitize_context_t *c, unsigned int cols) const { TRACE_SANITIZE (this); @@ -755,7 +775,7 @@ struct SinglePosFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto it = @@ -870,7 +890,7 @@ struct SinglePosFormat2 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; unsigned sub_length = valueFormat.get_len (); @@ -1129,7 +1149,7 @@ struct PairSet if (unlikely (!c->serializer->extend_min (out))) return_trace (false); out->len = 0; - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; unsigned len1 = valueFormats[0].get_len (); @@ -1250,7 +1270,7 @@ struct PairPosFormat1 { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -1441,7 +1461,7 @@ struct PairPosFormat2 }) ; - const hb_set_t &glyphset = *c->plan->_glyphset_gsub; + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto it = @@ -1728,7 +1748,7 @@ struct CursivePosFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -1904,7 +1924,7 @@ struct MarkBasePosFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -2025,10 +2045,37 @@ typedef AnchorMatrix LigatureAttach; /* component-major-- * mark-minor-- * ordered by class--zero-based. 
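Several of the Coverage/ClassDef loops above are rewritten to read `glyphArray.arrayZ` / `rangeRecord.arrayZ` into a local pointer before iterating, which appears intended to avoid the length-checked `operator[]` on every pass. A generic sketch of that hoisting pattern, with a hypothetical `Array` stand-in rather than the real HarfBuzz types:

```
// Pattern used above: hoist the unchecked array pointer out of a hot loop.
#include <vector>

struct Array
{
  std::vector<int> v;
  unsigned len () const { return (unsigned) v.size (); }
  const int *arrayZ () const { return v.data (); }
  int operator[] (unsigned i) const { return i < v.size () ? v[i] : 0; }  // bounds-checked access
};

static bool has_match_checked (const Array &a, int needle)
{
  for (unsigned i = 0; i < a.len (); i++)
    if (a[i] == needle) return true;   // bounds check on every iteration
  return false;
}

static bool has_match_raw (const Array &a, int needle)
{
  unsigned count = a.len ();
  const int *arr = a.arrayZ ();        // hoisted once; loop uses the raw pointer
  for (unsigned i = 0; i < count; i++)
    if (arr[i] == needle) return true;
  return false;
}
```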
*/ -typedef OffsetListOf<LigatureAttach> LigatureArray; - /* Array of LigatureAttach - * tables ordered by - * LigatureCoverage Index */ +/* Array of LigatureAttach tables ordered by LigatureCoverage Index */ +struct LigatureArray : OffsetListOf<LigatureAttach> +{ + template <typename Iterator, + hb_requires (hb_is_iterator (Iterator))> + bool subset (hb_subset_context_t *c, + Iterator coverage, + unsigned class_count, + const hb_map_t *klass_mapping) const + { + TRACE_SUBSET (this); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); + + auto *out = c->serializer->start_embed (this); + if (unlikely (!c->serializer->extend_min (out))) return_trace (false); + + for (const auto _ : + hb_zip (coverage, *this) + | hb_filter (glyphset, hb_first)) + { + auto *matrix = out->serialize_append (c->serializer); + if (unlikely (!matrix)) return_trace (false); + + matrix->serialize_subset (c, + _.second, + this, + class_count, + klass_mapping); + } + return_trace (this->len); + } +}; struct MarkLigPosFormat1 { @@ -2130,8 +2177,56 @@ struct MarkLigPosFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - // TODO(subset) - return_trace (false); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); + const hb_map_t &glyph_map = *c->plan->glyph_map; + + auto *out = c->serializer->start_embed (*this); + if (unlikely (!c->serializer->extend_min (out))) return_trace (false); + out->format = format; + + hb_map_t klass_mapping; + Markclass_closure_and_remap_indexes (this+markCoverage, this+markArray, glyphset, &klass_mapping); + + if (!klass_mapping.get_population ()) return_trace (false); + out->classCount = klass_mapping.get_population (); + + auto mark_iter = + + hb_zip (this+markCoverage, this+markArray) + | hb_filter (glyphset, hb_first) + ; + + auto new_mark_coverage = + + mark_iter + | hb_map_retains_sorting (hb_first) + | hb_map_retains_sorting (glyph_map) + ; + + if (!out->markCoverage.serialize (c->serializer, out) + .serialize (c->serializer, new_mark_coverage)) + return_trace (false); + + out->markArray.serialize (c->serializer, out) + .serialize (c->serializer, + &klass_mapping, + c->plan->layout_variation_idx_map, + &(this+markArray), + + mark_iter + | hb_map (hb_second)); + + auto new_ligature_coverage = + + hb_iter (this + ligatureCoverage) + | hb_filter (glyphset) + | hb_map_retains_sorting (glyph_map) + ; + + if (!out->ligatureCoverage.serialize (c->serializer, out) + .serialize (c->serializer, new_ligature_coverage)) + return_trace (false); + + out->ligatureArray.serialize_subset (c, ligatureArray, this, + hb_iter (this+ligatureCoverage), classCount, &klass_mapping); + + return_trace (true); } bool sanitize (hb_sanitize_context_t *c) const @@ -2164,6 +2259,7 @@ struct MarkLigPosFormat1 DEFINE_SIZE_STATIC (12); }; + struct MarkLigPos { template <typename context_t, typename ...Ts> @@ -2288,7 +2384,7 @@ struct MarkMarkPosFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh index 2f41d67819..5f10ecb7ee 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh @@ -356,7 +356,7 @@ struct Sequence bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const 
hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; if (!intersects (&glyphset)) return_trace (false); @@ -447,7 +447,7 @@ struct MultipleSubstFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -582,7 +582,7 @@ struct AlternateSet bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto it = @@ -682,7 +682,7 @@ struct AlternateSubstFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -840,7 +840,7 @@ struct Ligature bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; if (!intersects (&glyphset) || !glyphset.has (ligGlyph)) return_trace (false); @@ -1058,7 +1058,7 @@ struct LigatureSubstFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh index cb95e6dcd5..36a95ead15 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh @@ -89,7 +89,7 @@ struct hb_closure_context_t : bool is_lookup_done (unsigned int lookup_index) { - if (done_lookups->in_error ()) + if (unlikely (done_lookups->in_error ())) return true; /* Have we visited this lookup with the current set of glyphs? 
*/ @@ -146,7 +146,6 @@ struct hb_closure_lookups_context_t : if (is_lookup_visited (lookup_index)) return; - set_lookup_visited (lookup_index); nesting_level_left--; recurse_func (this, lookup_index); nesting_level_left++; @@ -163,10 +162,10 @@ struct hb_closure_lookups_context_t : bool is_lookup_visited (unsigned lookup_index) { - if (lookup_count++ > HB_MAX_LOOKUP_INDICES) + if (unlikely (lookup_count++ > HB_MAX_LOOKUP_INDICES)) return true; - if (visited_lookups->in_error ()) + if (unlikely (visited_lookups->in_error ())) return true; return visited_lookups->has (lookup_index); @@ -660,7 +659,7 @@ struct hb_ot_apply_context_t : void replace_glyph (hb_codepoint_t glyph_index) const { _set_glyph_props (glyph_index); - buffer->replace_glyph (glyph_index); + (void) buffer->replace_glyph (glyph_index); } void replace_glyph_inplace (hb_codepoint_t glyph_index) const { @@ -671,13 +670,13 @@ struct hb_ot_apply_context_t : unsigned int class_guess) const { _set_glyph_props (glyph_index, class_guess, true); - buffer->replace_glyph (glyph_index); + (void) buffer->replace_glyph (glyph_index); } void output_glyph_for_component (hb_codepoint_t glyph_index, unsigned int class_guess) const { _set_glyph_props (glyph_index, class_guess, false, true); - buffer->output_glyph (glyph_index); + (void) buffer->output_glyph (glyph_index); } }; @@ -1044,7 +1043,7 @@ static inline bool ligate_input (hb_ot_apply_context_t *c, hb_min (this_comp, last_num_components); _hb_glyph_info_set_lig_props_for_mark (&buffer->cur(), lig_id, new_lig_comp); } - buffer->next_glyph (); + (void) buffer->next_glyph (); } last_lig_id = _hb_glyph_info_get_lig_id (&buffer->cur()); @@ -1188,7 +1187,7 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c, /* Don't recurse to ourself at same position. * Note that this test is too naive, it doesn't catch longer loops. */ - if (idx == 0 && lookupRecord[i].lookupListIndex == c->lookup_index) + if (unlikely (idx == 0 && lookupRecord[i].lookupListIndex == c->lookup_index)) continue; if (unlikely (!buffer->move_to (match_positions[idx]))) @@ -1226,7 +1225,8 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c, * mean that n match positions where removed, as there might * have been marks and default-ignorables in the sequence. We * should instead drop match positions between current-position - * and current-position + n instead. + * and current-position + n instead. Though, am not sure which + * one is better. Both cases have valid uses. Sigh. * * It should be possible to construct tests for both of these cases. */ @@ -1272,7 +1272,7 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c, match_positions[next] += delta; } - buffer->move_to (end); + (void) buffer->move_to (end); return_trace (true); } @@ -1389,9 +1389,11 @@ struct Rule lookup_context); } - void closure_lookups (hb_closure_lookups_context_t *c) const + void closure_lookups (hb_closure_lookups_context_t *c, + ContextClosureLookupContext &lookup_context) const { if (unlikely (c->lookup_limit_exceeded ())) return; + if (!intersects (c->glyphs, lookup_context)) return; const UnsizedArrayOf<LookupRecord> &lookupRecord = StructAfter<UnsizedArrayOf<LookupRecord>> (inputZ.as_array (inputCount ? 
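The `(void) buffer->replace_glyph (...)` / `next_glyph ()` / `move_to (...)` casts above discard return values that these buffer calls now report, keeping warning-as-error builds quiet while error handling stays centralized on the buffer. A small sketch of that convention, with a hypothetical `buffer_t` rather than `hb_buffer_t`:

```
// Sketch of the "(void) buffer->next_glyph ()" pattern above.
#include <cstdio>

struct buffer_t
{
  unsigned len = 0, capacity = 2;
  bool in_error = false;
  [[nodiscard]] bool next_glyph ()     // reports failure instead of silently overflowing
  {
    if (len == capacity) { in_error = true; return false; }
    len++; return true;
  }
};

static void copy_run (buffer_t *buffer, unsigned n)
{
  for (unsigned i = 0; i < n; i++)
    (void) buffer->next_glyph ();      // deliberate discard: the error state is checked once, below
  if (buffer->in_error)
    std::printf ("buffer ran out of space after %u glyphs\n", buffer->len);
}

int main () { buffer_t b; copy_run (&b, 3); return 0; }
```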
inputCount - 1 : 0)); @@ -1521,14 +1523,13 @@ struct RuleSet ; } - void closure_lookups (hb_closure_lookups_context_t *c) const + void closure_lookups (hb_closure_lookups_context_t *c, + ContextClosureLookupContext &lookup_context) const { if (unlikely (c->lookup_limit_exceeded ())) return; - - return + hb_iter (rule) | hb_map (hb_add (this)) - | hb_apply ([&] (const Rule &_) { _.closure_lookups (c); }) + | hb_apply ([&] (const Rule &_) { _.closure_lookups (c, lookup_context); }) ; } @@ -1647,9 +1648,16 @@ struct ContextFormat1 void closure_lookups (hb_closure_lookups_context_t *c) const { - + hb_iter (ruleSet) + struct ContextClosureLookupContext lookup_context = { + {intersects_glyph}, + nullptr + }; + + + hb_zip (this+coverage, ruleSet) + | hb_filter (*c->glyphs, hb_first) + | hb_map (hb_second) | hb_map (hb_add (this)) - | hb_apply ([&] (const RuleSet &_) { _.closure_lookups (c); }) + | hb_apply ([&] (const RuleSet &_) { _.closure_lookups (c, lookup_context); }) ; } @@ -1700,7 +1708,7 @@ struct ContextFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -1791,10 +1799,24 @@ struct ContextFormat2 void closure_lookups (hb_closure_lookups_context_t *c) const { + if (!(this+coverage).intersects (c->glyphs)) + return; + + const ClassDef &class_def = this+classDef; + + struct ContextClosureLookupContext lookup_context = { + {intersects_class}, + &class_def + }; + + hb_iter (ruleSet) | hb_map (hb_add (this)) - | hb_apply ([&] (const RuleSet &_) { _.closure_lookups (c); }) - ; + | hb_enumerate + | hb_filter ([&] (const hb_pair_t<unsigned, const RuleSet &> p) + { return class_def.intersects_class (c->glyphs, p.first); }) + | hb_map (hb_second) + | hb_apply ([&] (const RuleSet & _) + { _.closure_lookups (c, lookup_context); }); } void collect_variation_indices (hb_collect_variation_indices_context_t *c) const {} @@ -1860,8 +1882,8 @@ struct ContextFormat2 const hb_map_t *lookup_map = c->table_tag == HB_OT_TAG_GSUB ? c->plan->gsub_lookups : c->plan->gpos_lookups; bool ret = true; int non_zero_index = 0, index = 0; - for (const hb_pair_t<unsigned, const OffsetTo<RuleSet>&> _ : + hb_enumerate (ruleSet) - | hb_filter (klass_map, hb_first)) + for (const auto& _ : + hb_enumerate (ruleSet) + | hb_filter (klass_map, hb_first)) { auto *o = out->ruleSet.serialize_append (c->serializer); if (unlikely (!o)) @@ -1945,6 +1967,8 @@ struct ContextFormat3 void closure_lookups (hb_closure_lookups_context_t *c) const { + if (!intersects (c->glyphs)) + return; const LookupRecord *lookupRecord = &StructAfter<LookupRecord> (coverageZ.as_array (glyphCount)); recurse_lookups (c, lookupCount, lookupRecord); } @@ -2010,6 +2034,7 @@ struct ContextFormat3 for (const OffsetTo<Coverage>& offset : coverages) { + /* TODO(subset) This looks like should not be necessary to write this way. 
*/ auto *o = c->serializer->allocate_size<OffsetTo<Coverage>> (OffsetTo<Coverage>::static_size); if (unlikely (!o)) return_trace (false); if (!o->serialize_subset (c, offset, this)) return_trace (false); @@ -2238,9 +2263,11 @@ struct ChainRule lookup_context); } - void closure_lookups (hb_closure_lookups_context_t *c) const + void closure_lookups (hb_closure_lookups_context_t *c, + ChainContextClosureLookupContext &lookup_context) const { if (unlikely (c->lookup_limit_exceeded ())) return; + if (!intersects (c->glyphs, lookup_context)) return; const HeadlessArrayOf<HBUINT16> &input = StructAfter<HeadlessArrayOf<HBUINT16>> (backtrack); const ArrayOf<HBUINT16> &lookahead = StructAfter<ArrayOf<HBUINT16>> (input); @@ -2296,11 +2323,7 @@ struct ChainRule { c->copy (len); for (const auto g : it) - { - HBUINT16 gid; - gid = g; - c->copy (gid); - } + c->copy ((HBUINT16) g); } ChainRule* copy (hb_serialize_context_t *c, @@ -2328,12 +2351,19 @@ struct ChainRule | hb_map (mapping)); const ArrayOf<LookupRecord> &lookupRecord = StructAfter<ArrayOf<LookupRecord>> (lookahead); - HBUINT16 lookupCount; - lookupCount = lookupRecord.len; - if (!c->copy (lookupCount)) return_trace (nullptr); - for (unsigned i = 0; i < (unsigned) lookupCount; i++) + HBUINT16* lookupCount = c->embed (&(lookupRecord.len)); + if (!lookupCount) return_trace (nullptr); + + for (unsigned i = 0; i < lookupRecord.len; i++) + { + if (!lookup_map->has (lookupRecord[i].lookupListIndex)) + { + (*lookupCount)--; + continue; + } if (!c->copy (lookupRecord[i], lookup_map)) return_trace (nullptr); + } return_trace (out); } @@ -2351,7 +2381,7 @@ struct ChainRule if (!backtrack_map) { - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); if (!hb_all (backtrack, glyphset) || !hb_all (input, glyphset) || !hb_all (lookahead, glyphset)) @@ -2424,14 +2454,14 @@ struct ChainRuleSet ; } - void closure_lookups (hb_closure_lookups_context_t *c) const + void closure_lookups (hb_closure_lookups_context_t *c, + ChainContextClosureLookupContext &lookup_context) const { if (unlikely (c->lookup_limit_exceeded ())) return; - return + hb_iter (rule) | hb_map (hb_add (this)) - | hb_apply ([&] (const ChainRule &_) { _.closure_lookups (c); }) + | hb_apply ([&] (const ChainRule &_) { _.closure_lookups (c, lookup_context); }) ; } @@ -2552,9 +2582,16 @@ struct ChainContextFormat1 void closure_lookups (hb_closure_lookups_context_t *c) const { - + hb_iter (ruleSet) + struct ChainContextClosureLookupContext lookup_context = { + {intersects_glyph}, + {nullptr, nullptr, nullptr} + }; + + + hb_zip (this+coverage, ruleSet) + | hb_filter (*c->glyphs, hb_first) + | hb_map (hb_second) | hb_map (hb_add (this)) - | hb_apply ([&] (const ChainRuleSet &_) { _.closure_lookups (c); }) + | hb_apply ([&] (const ChainRuleSet &_) { _.closure_lookups (c, lookup_context); }) ; } @@ -2604,7 +2641,7 @@ struct ChainContextFormat1 bool subset (hb_subset_context_t *c) const { TRACE_SUBSET (this); - const hb_set_t &glyphset = *c->plan->glyphset (); + const hb_set_t &glyphset = *c->plan->glyphset_gsub (); const hb_map_t &glyph_map = *c->plan->glyph_map; auto *out = c->serializer->start_embed (*this); @@ -2701,9 +2738,28 @@ struct ChainContextFormat2 void closure_lookups (hb_closure_lookups_context_t *c) const { + if (!(this+coverage).intersects (c->glyphs)) + return; + + const ClassDef &backtrack_class_def = this+backtrackClassDef; + const ClassDef &input_class_def = this+inputClassDef; + const ClassDef &lookahead_class_def = 
this+lookaheadClassDef; + + struct ChainContextClosureLookupContext lookup_context = { + {intersects_class}, + {&backtrack_class_def, + &input_class_def, + &lookahead_class_def} + }; + + hb_iter (ruleSet) | hb_map (hb_add (this)) - | hb_apply ([&] (const ChainRuleSet &_) { _.closure_lookups (c); }) + | hb_enumerate + | hb_filter([&] (unsigned klass) + { return input_class_def.intersects_class (c->glyphs, klass); }, hb_first) + | hb_map (hb_second) + | hb_apply ([&] (const ChainRuleSet &_) + { _.closure_lookups (c, lookup_context); }) ; } @@ -2779,24 +2835,23 @@ struct ChainContextFormat2 out->coverage.serialize_subset (c, coverage, this); hb_map_t backtrack_klass_map; - out->backtrackClassDef.serialize_subset (c, backtrackClassDef, this, &backtrack_klass_map); - if (unlikely (!c->serializer->check_success (!backtrack_klass_map.in_error ()))) - return_trace (false); - - // subset inputClassDef based on glyphs survived in Coverage subsetting hb_map_t input_klass_map; - out->inputClassDef.serialize_subset (c, inputClassDef, this, &input_klass_map); - if (unlikely (!c->serializer->check_success (!input_klass_map.in_error ()))) - return_trace (false); - hb_map_t lookahead_klass_map; + + out->backtrackClassDef.serialize_subset (c, backtrackClassDef, this, &backtrack_klass_map); + // TODO: subset inputClassDef based on glyphs survived in Coverage subsetting + out->inputClassDef.serialize_subset (c, inputClassDef, this, &input_klass_map); out->lookaheadClassDef.serialize_subset (c, lookaheadClassDef, this, &lookahead_klass_map); - if (unlikely (!c->serializer->check_success (!lookahead_klass_map.in_error ()))) + + if (unlikely (!c->serializer->propagate_error (backtrack_klass_map, + input_klass_map, + lookahead_klass_map))) return_trace (false); - unsigned non_zero_index = 0, index = 0; + int non_zero_index = -1, index = 0; bool ret = true; const hb_map_t *lookup_map = c->table_tag == HB_OT_TAG_GSUB ? 
c->plan->gsub_lookups : c->plan->gpos_lookups; + auto last_non_zero = c->serializer->snapshot (); for (const OffsetTo<ChainRuleSet>& _ : + hb_enumerate (ruleSet) | hb_filter (input_klass_map, hb_first) | hb_map (hb_second)) @@ -2812,19 +2867,20 @@ struct ChainContextFormat2 &backtrack_klass_map, &input_klass_map, &lookahead_klass_map)) + { + last_non_zero = c->serializer->snapshot (); non_zero_index = index; + } index++; } if (!ret) return_trace (ret); - //prune empty trailing ruleSets - --index; - while (index > non_zero_index) - { - out->ruleSet.pop (); - index--; + // prune empty trailing ruleSets + if (index > non_zero_index) { + c->serializer->revert (last_non_zero); + out->ruleSet.len = non_zero_index + 1; } return_trace (bool (out->ruleSet)); @@ -2908,6 +2964,9 @@ struct ChainContextFormat3 void closure_lookups (hb_closure_lookups_context_t *c) const { + if (!intersects (c->glyphs)) + return; + const OffsetArrayOf<Coverage> &input = StructAfter<OffsetArrayOf<Coverage>> (backtrack); const OffsetArrayOf<Coverage> &lookahead = StructAfter<OffsetArrayOf<Coverage>> (input); const ArrayOf<LookupRecord> &lookup = StructAfter<ArrayOf<LookupRecord>> (lookahead); @@ -2986,13 +3045,16 @@ struct ChainContextFormat3 TRACE_SERIALIZE (this); auto *out = c->serializer->start_embed<OffsetArrayOf<Coverage>> (); - if (unlikely (!c->serializer->allocate_size<HBUINT16> (HBUINT16::static_size))) return_trace (false); + if (unlikely (!c->serializer->allocate_size<HBUINT16> (HBUINT16::static_size))) + return_trace (false); - + it - | hb_apply (subset_offset_array (c, *out, base)) - ; + for (auto& offset : it) { + auto *o = out->serialize_append (c->serializer); + if (unlikely (!o) || !o->serialize_subset (c, offset, base)) + return_trace (false); + } - return_trace (out->len); + return_trace (true); } bool subset (hb_subset_context_t *c) const @@ -3113,6 +3175,24 @@ struct ExtensionFormat1 extensionLookupType != T::SubTable::Extension); } + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + + auto *out = c->serializer->start_embed (this); + if (unlikely (!out || !c->serializer->extend_min (out))) return_trace (false); + + out->format = format; + out->extensionLookupType = extensionLookupType; + + const auto& src_offset = + reinterpret_cast<const LOffsetTo<typename T::SubTable> &> (extensionOffset); + auto& dest_offset = + reinterpret_cast<LOffsetTo<typename T::SubTable> &> (out->extensionOffset); + + return_trace (dest_offset.serialize_subset (c, src_offset, this, get_type ())); + } + protected: HBUINT16 format; /* Format identifier. Set to 1. */ HBUINT16 extensionLookupType; /* Lookup type of subtable referenced @@ -3143,6 +3223,18 @@ struct Extension } } + // Specialization of dispatch for subset. dispatch() normally just + // dispatches to the sub table this points too, but for subset + // we need to run subset on this subtable too. + template <typename ...Ts> + typename hb_subset_context_t::return_t dispatch (hb_subset_context_t *c, Ts&&... ds) const + { + switch (u.format) { + case 1: return u.format1.subset (c); + default: return c->default_return_value (); + } + } + template <typename context_t, typename ...Ts> typename context_t::return_t dispatch (context_t *c, Ts&&... 
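The `ChainContextFormat2::subset` rewrite above replaces the pop-based pruning of empty trailing rule sets with a serializer snapshot taken after the last non-empty one, followed by a single revert. A compact sketch of that snapshot/revert idea, using a trivial vector-backed serializer stand-in (not HarfBuzz's `hb_serialize_context_t`):

```
// Sketch of the snapshot/revert pruning above: remember the output position after
// the last non-empty rule set, then roll back to it instead of popping entries.
#include <cstddef>
#include <cstdio>
#include <vector>

struct serializer_t
{
  std::vector<int> out;
  size_t snapshot () const { return out.size (); }
  void revert (size_t s) { out.resize (s); }  // drop everything written after s
  void write (int v) { out.push_back (v); }
};

int main ()
{
  serializer_t s;
  int rule_sets[] = {2, 0, 3, 0, 0};          // 0 = serializes to nothing
  size_t last_non_zero = s.snapshot ();
  int non_zero_index = -1, index = 0;
  for (int r : rule_sets)
  {
    for (int i = 0; i < r; i++) s.write (index);
    if (r) { last_non_zero = s.snapshot (); non_zero_index = index; }
    index++;
  }
  s.revert (last_non_zero);                   // prune the trailing empty rule sets
  std::printf ("kept %d rule sets, %zu items\n", non_zero_index + 1, s.out.size ());
  return 0;
}
```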
ds) const { @@ -3320,20 +3412,34 @@ struct GSUBGPOS return_trace (true); } - void closure_features (const hb_map_t *lookup_indexes, /* IN */ - hb_set_t *feature_indexes /* OUT */) const + void prune_features (const hb_map_t *lookup_indices, /* IN */ + hb_set_t *feature_indices /* IN/OUT */) const { - unsigned int feature_count = hb_min (get_feature_count (), (unsigned) HB_MAX_FEATURES); - for (unsigned i = 0; i < feature_count; i++) +#ifndef HB_NO_VAR + // This is the set of feature indices which have alternate versions defined + // if the FeatureVariation's table and the alternate version(s) intersect the + // set of lookup indices. + hb_set_t alternate_feature_indices; + if (version.to_int () >= 0x00010001u) + (this+featureVars).closure_features (lookup_indices, &alternate_feature_indices); + if (unlikely (alternate_feature_indices.in_error())) { + feature_indices->successful = false; + return; + } +#endif + + for (unsigned i : feature_indices->iter()) { const Feature& f = get_feature (i); - if ((!f.featureParams.is_null ()) || f.intersects_lookup_indexes (lookup_indexes)) - feature_indexes->add (i); - } + + if (f.featureParams.is_null () + && !f.intersects_lookup_indexes (lookup_indices) #ifndef HB_NO_VAR - if (version.to_int () >= 0x00010001u) - (this+featureVars).closure_features (lookup_indexes, feature_indexes); + && !alternate_feature_indices.has (i) #endif + ) + feature_indices->del (i); + } } unsigned int get_size () const diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.cc b/thirdparty/harfbuzz/src/hb-ot-layout.cc index f25f0f9e23..89df949b26 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout.cc +++ b/thirdparty/harfbuzz/src/hb-ot-layout.cc @@ -76,7 +76,7 @@ * Tests whether a face includes any kerning data in the 'kern' table. * Does NOT test for kerning lookups in the GPOS table. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * **/ bool @@ -92,7 +92,7 @@ hb_ot_layout_has_kerning (hb_face_t *face) * Tests whether a face includes any state-machine kerning in the 'kern' table. * Does NOT examine the GPOS table. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * **/ bool @@ -112,7 +112,7 @@ hb_ot_layout_has_machine_kerning (hb_face_t *face) * * Does NOT examine the GPOS table. * - * Return value: %true is data found, false otherwise + * Return value: %true is data found, %false otherwise * **/ bool @@ -268,7 +268,7 @@ _hb_ot_layout_set_glyph_props (hb_font_t *font, * * Tests whether a face has any glyph classes defined in its GDEF table. 
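The `Extension` struct above gains a non-template `dispatch` overload for the subset context, so subsetting handles the extension wrapper itself instead of only forwarding to the wrapped subtable. A minimal sketch of why such an overload is selected, with hypothetical context types standing in for HarfBuzz's:

```
// Sketch of the dispatch-overload pattern added for Extension::dispatch above:
// a generic forwarding dispatch plus an overload for one specific context type.
#include <cstdio>

struct generic_context_t { using return_t = void; };
struct subset_context_t  { using return_t = bool; };

struct Extension
{
  // Overload picked when the caller passes a subset context: handle the wrapper here.
  subset_context_t::return_t dispatch (subset_context_t *) const
  { std::printf ("subsetting the extension wrapper itself\n"); return true; }

  // Generic path: would forward to the wrapped subtable (omitted in this sketch).
  template <typename context_t>
  typename context_t::return_t dispatch (context_t *) const
  { std::printf ("forwarding to wrapped subtable\n"); }
};

int main ()
{
  Extension e;
  subset_context_t sc; generic_context_t gc;
  e.dispatch (&sc);  // non-template overload wins over the template
  e.dispatch (&gc);  // falls back to the generic template
  return 0;
}
```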
* - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * **/ hb_bool_t @@ -322,7 +322,7 @@ hb_ot_layout_get_glyphs_in_class (hb_face_t *face, * @face: The #hb_face_t to work on * @glyph: The #hb_codepoint_t code point to query * @start_offset: offset of the first attachment point to retrieve - * @point_count: (inout) (allow-none): Input = the maximum number of attachment points to return; + * @point_count: (inout) (optional): Input = the maximum number of attachment points to return; * Output = the actual number of attachment points returned (may be zero) * @point_array: (out) (array length=point_count): The array of attachment points found for the query * @@ -350,7 +350,7 @@ hb_ot_layout_get_attach_points (hb_face_t *face, * @direction: The #hb_direction_t text direction to use * @glyph: The #hb_codepoint_t code point to query * @start_offset: offset of the first caret position to retrieve - * @caret_count: (inout) (allow-none): Input = the maximum number of caret positions to return; + * @caret_count: (inout) (optional): Input = the maximum number of caret positions to return; * Output = the actual number of caret positions returned (may be zero) * @caret_array: (out) (array length=caret_count): The array of caret positions found for the query * @@ -410,9 +410,9 @@ get_gsubgpos_table (hb_face_t *face, /** * hb_ot_layout_table_get_script_tags: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @start_offset: offset of the first script tag to retrieve - * @script_count: (inout) (allow-none): Input = the maximum number of script tags to return; + * @script_count: (inout) (optional): Input = the maximum number of script tags to return; * Output = the actual number of script tags returned (may be zero) * @script_tags: (out) (array length=script_count): The array of #hb_tag_t script tags found for the query * @@ -437,14 +437,14 @@ hb_ot_layout_table_get_script_tags (hb_face_t *face, /** * hb_ot_layout_table_find_script: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_tag: #hb_tag_t of the script tag requested * @script_index: (out): The index of the requested script tag * * Fetches the index if a given script tag in the specified face's GSUB table * or GPOS table. 
* - * Return value: %true if the script is found, false otherwise + * Return value: %true if the script is found, %false otherwise * **/ hb_bool_t @@ -481,7 +481,7 @@ hb_ot_layout_table_find_script (hb_face_t *face, /** * hb_ot_layout_table_choose_script: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_tags: Array of #hb_tag_t script tags * @script_index: (out): The index of the requested script tag * @chosen_script: (out): #hb_tag_t of the script tag requested @@ -504,11 +504,22 @@ hb_ot_layout_table_choose_script (hb_face_t *face, /** * hb_ot_layout_table_select_script: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_count: Number of script tags in the array * @script_tags: Array of #hb_tag_t script tags - * @script_index: (out): The index of the requested script - * @chosen_script: (out): #hb_tag_t of the requested script + * @script_index: (out) (optional): The index of the requested script + * @chosen_script: (out) (optional): #hb_tag_t of the requested script + * + * Selects an OpenType script for @table_tag from the @script_tags array. + * + * If the table does not have any of the requested scripts, then `DFLT`, + * `dflt`, and `latn` tags are tried in that order. If the table still does not + * have any of these scripts, @script_index and @chosen_script are set to + * #HB_OT_LAYOUT_NO_SCRIPT_INDEX. + * + * Return value: + * %true if one of the requested scripts is selected, %false if a fallback + * script is selected or if no scripts are selected. * * Since: 2.0.0 **/ @@ -566,9 +577,9 @@ hb_ot_layout_table_select_script (hb_face_t *face, /** * hb_ot_layout_table_get_feature_tags: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @start_offset: offset of the first feature tag to retrieve - * @feature_count: (inout) (allow-none): Input = the maximum number of feature tags to return; + * @feature_count: (inout) (optional): Input = the maximum number of feature tags to return; * Output = the actual number of feature tags returned (may be zero) * @feature_tags: (out) (array length=feature_count): Array of feature tags found in the table * @@ -591,14 +602,14 @@ hb_ot_layout_table_get_feature_tags (hb_face_t *face, /** * hb_ot_layout_table_find_feature: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @feature_tag: The #hb_tag_t og the requested feature tag * @feature_index: (out): The index of the requested feature * * Fetches the index for a given feature tag in the specified face's GSUB table * or GPOS table. 
* - * Return value: %true if the feature is found, false otherwise + * Return value: %true if the feature is found, %false otherwise **/ bool hb_ot_layout_table_find_feature (hb_face_t *face, @@ -626,10 +637,10 @@ hb_ot_layout_table_find_feature (hb_face_t *face, /** * hb_ot_layout_script_get_language_tags: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @start_offset: offset of the first language tag to retrieve - * @language_count: (inout) (allow-none): Input = the maximum number of language tags to return; + * @language_count: (inout) (optional): Input = the maximum number of language tags to return; * Output = the actual number of language tags returned (may be zero) * @language_tags: (out) (array length=language_count): Array of language tags found in the table * @@ -655,7 +666,7 @@ hb_ot_layout_script_get_language_tags (hb_face_t *face, /** * hb_ot_layout_script_find_language: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_tag: The #hb_tag_t of the requested language * @language_index: The index of the requested language @@ -663,7 +674,7 @@ hb_ot_layout_script_get_language_tags (hb_face_t *face, * Fetches the index of a given language tag in the specified face's GSUB table * or GPOS table, underneath the specified script tag. * - * Return value: %true if the language tag is found, false otherwise + * Return value: %true if the language tag is found, %false otherwise * * Since: ?? * Deprecated: ?? @@ -688,7 +699,7 @@ hb_ot_layout_script_find_language (hb_face_t *face, /** * hb_ot_layout_script_select_language: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_count: The number of languages in the specified script * @language_tags: The array of language tags @@ -697,7 +708,7 @@ hb_ot_layout_script_find_language (hb_face_t *face, * Fetches the index of a given language tag in the specified face's GSUB table * or GPOS table, underneath the specified script index. * - * Return value: %true if the language tag is found, false otherwise + * Return value: %true if the language tag is found, %false otherwise * * Since: 2.0.0 **/ @@ -731,7 +742,7 @@ hb_ot_layout_script_select_language (hb_face_t *face, /** * hb_ot_layout_language_get_required_feature_index: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_index: The index of the requested language tag * @feature_index: (out): The index of the requested feature @@ -739,7 +750,7 @@ hb_ot_layout_script_select_language (hb_face_t *face, * Fetches the index of a requested feature in the given face's GSUB or GPOS table, * underneath the specified script and language. 
* - * Return value: %true if the feature is found, false otherwise + * Return value: %true if the feature is found, %false otherwise * **/ hb_bool_t @@ -761,7 +772,7 @@ hb_ot_layout_language_get_required_feature_index (hb_face_t *face, /** * hb_ot_layout_language_get_required_feature: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_index: The index of the requested language tag * @feature_index: (out): The index of the requested feature @@ -770,7 +781,7 @@ hb_ot_layout_language_get_required_feature_index (hb_face_t *face, * Fetches the tag of a requested feature index in the given face's GSUB or GPOS table, * underneath the specified script and language. * - * Return value: %true if the feature is found, false otherwise + * Return value: %true if the feature is found, %false otherwise * * Since: 0.9.30 **/ @@ -796,11 +807,11 @@ hb_ot_layout_language_get_required_feature (hb_face_t *face, /** * hb_ot_layout_language_get_feature_indexes: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_index: The index of the requested language tag * @start_offset: offset of the first feature tag to retrieve - * @feature_count: (inout) (allow-none): Input = the maximum number of feature tags to return; + * @feature_count: (inout) (optional): Input = the maximum number of feature tags to return; * Output: the actual number of feature tags returned (may be zero) * @feature_indexes: (out) (array length=feature_count): The array of feature indexes found for the query * @@ -827,11 +838,11 @@ hb_ot_layout_language_get_feature_indexes (hb_face_t *face, /** * hb_ot_layout_language_get_feature_tags: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_index: The index of the requested language tag * @start_offset: offset of the first feature tag to retrieve - * @feature_count: (inout) (allow-none): Input = the maximum number of feature tags to return; + * @feature_count: (inout) (optional): Input = the maximum number of feature tags to return; * Output = the actual number of feature tags returned (may be zero) * @feature_tags: (out) (array length=feature_count): The array of #hb_tag_t feature tags found for the query * @@ -868,7 +879,7 @@ hb_ot_layout_language_get_feature_tags (hb_face_t *face, /** * hb_ot_layout_language_find_feature: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @script_index: The index of the requested script tag * @language_index: The index of the requested language tag * @feature_tag: #hb_tag_t of the feature tag requested @@ -877,7 +888,7 @@ hb_ot_layout_language_get_feature_tags (hb_face_t *face, * Fetches the index of a given feature tag in the specified face's GSUB table * or GPOS table, underneath the specified script and language. 
* - * Return value: %true if the feature is found, false otherwise + * Return value: %true if the feature is found, %false otherwise * **/ hb_bool_t @@ -910,10 +921,10 @@ hb_ot_layout_language_find_feature (hb_face_t *face, /** * hb_ot_layout_feature_get_lookups: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @feature_index: The index of the requested feature * @start_offset: offset of the first lookup to retrieve - * @lookup_count: (inout) (allow-none): Input = the maximum number of lookups to return; + * @lookup_count: (inout) (optional): Input = the maximum number of lookups to return; * Output = the actual number of lookups returned (may be zero) * @lookup_indexes: (out) (array length=lookup_count): The array of lookup indexes found for the query * @@ -944,7 +955,7 @@ hb_ot_layout_feature_get_lookups (hb_face_t *face, /** * hb_ot_layout_table_get_lookup_count: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * * Fetches the total number of lookups enumerated in the specified * face's GSUB table or GPOS table. @@ -1101,7 +1112,7 @@ script_collect_features (hb_collect_features_context_t *c, /** * hb_ot_layout_collect_features: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @scripts: The array of scripts to collect features for * @languages: The array of languages to collect features for * @features: The array of features to collect @@ -1152,7 +1163,7 @@ hb_ot_layout_collect_features (hb_face_t *face, /** * hb_ot_layout_collect_lookups: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @scripts: The array of scripts to collect lookups for * @languages: The array of languages to collect lookups for * @features: The array of features to collect lookups for @@ -1191,7 +1202,7 @@ hb_ot_layout_collect_lookups (hb_face_t *face, /** * hb_ot_layout_lookup_collect_glyphs: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @lookup_index: The index of the feature lookup to query * @glyphs_before: (out): Array of glyphs preceding the substitution range * @glyphs_input: (out): Array of input glyphs that would be substituted by the lookup @@ -1243,7 +1254,7 @@ hb_ot_layout_lookup_collect_glyphs (hb_face_t *face, /** * hb_ot_layout_table_find_feature_variations: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @coords: The variation coordinates to query * @num_coords: The number of variation coordinates * @variations_index: (out): The array of feature variations found for the query @@ -1268,11 +1279,11 @@ hb_ot_layout_table_find_feature_variations (hb_face_t *face, /** * hb_ot_layout_feature_with_variations_get_lookups: * @face: #hb_face_t to work upon - * @table_tag: HB_OT_TAG_GSUB or HB_OT_TAG_GPOS + * @table_tag: #HB_OT_TAG_GSUB or #HB_OT_TAG_GPOS * @feature_index: The index of the feature to query * @variations_index: The index of the feature variation to query * @start_offset: offset of the first lookup to retrieve - * @lookup_count: (inout) (allow-none): Input = the maximum number of lookups to return; + * @lookup_count: (inout) (optional): Input = the maximum number of lookups to return; * 
Output = the actual number of lookups returned (may be zero) * @lookup_indexes: (out) (array length=lookup_count): The array of lookups found for the query * @@ -1310,7 +1321,7 @@ hb_ot_layout_feature_with_variations_get_lookups (hb_face_t *face, * * Tests whether the specified face includes any GSUB substitutions. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * **/ hb_bool_t @@ -1331,7 +1342,7 @@ hb_ot_layout_has_substitution (hb_face_t *face) * Tests whether a specified lookup in the specified face would * trigger a substitution on the given glyph sequence. * - * Return value: %true if a substitution would be triggered, false otherwise + * Return value: %true if a substitution would be triggered, %false otherwise * * Since: 0.9.7 **/ @@ -1488,7 +1499,9 @@ hb_ot_layout_lookups_substitute_closure (hb_face_t *face, * hb_ot_layout_has_positioning: * @face: #hb_face_t to work upon * - * Return value: %true if the face has GPOS data, false otherwise + * Tests whether the specified face includes any GPOS positioning. + * + * Return value: %true if the face has GPOS data, %false otherwise * **/ hb_bool_t @@ -1561,7 +1574,7 @@ hb_ot_layout_position_finish_offsets (hb_font_t *font, hb_buffer_t *buffer) * For more information on this distinction, see the [`size` feature documentation]( * https://docs.microsoft.com/en-us/typography/opentype/spec/features_pt#tag-size). * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 0.9.10 **/ @@ -1610,22 +1623,22 @@ hb_ot_layout_get_size_params (hb_face_t *face, * @face: #hb_face_t to work upon * @table_tag: table tag to query, "GSUB" or "GPOS". * @feature_index: index of feature to query. - * @label_id: (out) (allow-none): The ‘name’ table name ID that specifies a string + * @label_id: (out) (optional): The ‘name’ table name ID that specifies a string * for a user-interface label for this feature. (May be NULL.) - * @tooltip_id: (out) (allow-none): The ‘name’ table name ID that specifies a string + * @tooltip_id: (out) (optional): The ‘name’ table name ID that specifies a string * that an application can use for tooltip text for this * feature. (May be NULL.) - * @sample_id: (out) (allow-none): The ‘name’ table name ID that specifies sample text + * @sample_id: (out) (optional): The ‘name’ table name ID that specifies sample text * that illustrates the effect of this feature. (May be NULL.) - * @num_named_parameters: (out) (allow-none): Number of named parameters. (May be zero.) - * @first_param_id: (out) (allow-none): The first ‘name’ table name ID used to specify + * @num_named_parameters: (out) (optional): Number of named parameters. (May be zero.) + * @first_param_id: (out) (optional): The first ‘name’ table name ID used to specify * strings for user-interface labels for the feature * parameters. (Must be zero if numParameters is zero.) * * Fetches name indices from feature parameters for "Stylistic Set" ('ssXX') or * "Character Variant" ('cvXX') features. * - * Return value: %true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.0.0 **/ @@ -1685,7 +1698,7 @@ hb_ot_layout_feature_get_name_ids (hb_face_t *face, * @table_tag: table tag to query, "GSUB" or "GPOS". * @feature_index: index of feature to query. 
* @start_offset: offset of the first character to retrieve - * @char_count: (inout) (allow-none): Input = the maximum number of characters to return; + * @char_count: (inout) (optional): Input = the maximum number of characters to return; * Output = the actual number of characters returned (may be zero) * @characters: (out caller-allocates) (array length=char_count): A buffer pointer. * The Unicode codepoints of the characters for which this feature provides @@ -1769,7 +1782,7 @@ apply_forward (OT::hb_ot_apply_context_t *c, if (applied) ret = true; else - buffer->next_glyph (); + (void) buffer->next_glyph (); } return ret; } @@ -1907,7 +1920,7 @@ hb_ot_layout_substitute_lookup (OT::hb_ot_apply_context_t *c, * @baseline_tag: a baseline tag * @direction: text direction. * @script_tag: script tag. - * @language_tag: language tag. + * @language_tag: language tag, currently unused. * @coord: (out): baseline value if found. * * Fetches a baseline value from the face. @@ -1964,7 +1977,7 @@ struct hb_get_glyph_alternates_dispatch_t : * @lookup_index: index of the feature lookup to query. * @glyph: a glyph id. * @start_offset: starting offset. - * @alternate_count: (inout) (allow-none): Input = the maximum number of alternate glyphs to return; + * @alternate_count: (inout) (optional): Input = the maximum number of alternate glyphs to return; * Output = the actual number of alternate glyphs returned (may be zero). * @alternate_glyphs: (out caller-allocates) (array length=alternate_count): A glyphs buffer. * Alternate glyphs associated with the glyph id. diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.h b/thirdparty/harfbuzz/src/hb-ot-layout.h index 545d5f7fc4..d47ba0fc92 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout.h +++ b/thirdparty/harfbuzz/src/hb-ot-layout.h @@ -24,7 +24,7 @@ * Red Hat Author(s): Behdad Esfahbod */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -38,10 +38,35 @@ HB_BEGIN_DECLS +/** + * HB_OT_TAG_BASE: + * + * OpenType [Baseline Table](https://docs.microsoft.com/en-us/typography/opentype/spec/base). + */ #define HB_OT_TAG_BASE HB_TAG('B','A','S','E') +/** + * HB_OT_TAG_GDEF: + * + * OpenType [Glyph Definition Table](https://docs.microsoft.com/en-us/typography/opentype/spec/gdef). + */ #define HB_OT_TAG_GDEF HB_TAG('G','D','E','F') +/** + * HB_OT_TAG_GSUB: + * + * OpenType [Glyph Substitution Table](https://docs.microsoft.com/en-us/typography/opentype/spec/gsub). + */ #define HB_OT_TAG_GSUB HB_TAG('G','S','U','B') +/** + * HB_OT_TAG_GPOS: + * + * OpenType [Glyph Positioning Table](https://docs.microsoft.com/en-us/typography/opentype/spec/gpos). + */ #define HB_OT_TAG_GPOS HB_TAG('G','P','O','S') +/** + * HB_OT_TAG_JSTF: + * + * OpenType [Justification Table](https://docs.microsoft.com/en-us/typography/opentype/spec/jstf). + */ #define HB_OT_TAG_JSTF HB_TAG('J','S','T','F') @@ -49,18 +74,34 @@ HB_BEGIN_DECLS * Script & Language tags. */ +/** + * HB_OT_TAG_DEFAULT_SCRIPT: + * + * OpenType script tag, `DFLT`, for features that are not script-specific. + * + */ #define HB_OT_TAG_DEFAULT_SCRIPT HB_TAG ('D', 'F', 'L', 'T') +/** + * HB_OT_TAG_DEFAULT_LANGUAGE: + * + * OpenType language tag, `dflt`. Not a valid language tag, but some fonts + * mistakenly use it. + */ #define HB_OT_TAG_DEFAULT_LANGUAGE HB_TAG ('d', 'f', 'l', 't') /** * HB_OT_MAX_TAGS_PER_SCRIPT: * + * Maximum number of OpenType tags that can correspond to a give #hb_script_t. 
+ * * Since: 2.0.0 **/ #define HB_OT_MAX_TAGS_PER_SCRIPT 3u /** * HB_OT_MAX_TAGS_PER_LANGUAGE: * + * Maximum number of OpenType tags that can correspond to a give #hb_language_t. + * * Since: 2.0.0 **/ #define HB_OT_MAX_TAGS_PER_LANGUAGE 3u @@ -144,9 +185,29 @@ hb_ot_layout_get_ligature_carets (hb_font_t *font, * GSUB/GPOS feature query and enumeration interface */ +/** + * HB_OT_LAYOUT_NO_SCRIPT_INDEX: + * + * Special value for script index indicating unsupported script. + */ #define HB_OT_LAYOUT_NO_SCRIPT_INDEX 0xFFFFu +/** + * HB_OT_LAYOUT_NO_FEATURE_INDEX: + * + * Special value for feature index indicating unsupported feature. + */ #define HB_OT_LAYOUT_NO_FEATURE_INDEX 0xFFFFu +/** + * HB_OT_LAYOUT_DEFAULT_LANGUAGE_INDEX: + * + * Special value for language index indicating default or unsupported language. + */ #define HB_OT_LAYOUT_DEFAULT_LANGUAGE_INDEX 0xFFFFu +/** + * HB_OT_LAYOUT_NO_VARIATIONS_INDEX: + * + * Special value for variations index indicating unsupported variation. + */ #define HB_OT_LAYOUT_NO_VARIATIONS_INDEX 0xFFFFFFFFu HB_EXTERN unsigned int @@ -433,7 +494,7 @@ hb_ot_layout_feature_get_characters (hb_face_t *face, * @HB_OT_LAYOUT_BASELINE_TAG_MATH: The baseline about which mathematical characters are centered. * In vertical writing mode when mathematical characters rotated 90 degrees clockwise, are centered. * - * Baseline tags from https://docs.microsoft.com/en-us/typography/opentype/spec/baselinetags + * Baseline tags from [Baseline Tags](https://docs.microsoft.com/en-us/typography/opentype/spec/baselinetags) registry. * * Since: 2.6.0 */ @@ -446,6 +507,7 @@ typedef enum { HB_OT_LAYOUT_BASELINE_TAG_IDEO_EMBOX_TOP_OR_RIGHT = HB_TAG ('i','d','t','p'), HB_OT_LAYOUT_BASELINE_TAG_MATH = HB_TAG ('m','a','t','h'), + /*< private >*/ _HB_OT_LAYOUT_BASELINE_TAG_MAX_VALUE = HB_TAG_MAX_SIGNED /*< skip >*/ } hb_ot_layout_baseline_tag_t; diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.hh b/thirdparty/harfbuzz/src/hb-ot-layout.hh index f3bb15581a..ac61bc70de 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout.hh @@ -315,12 +315,13 @@ _hb_glyph_info_get_unicode_space_fallback_type (const hb_glyph_info_t *info) } static inline bool _hb_glyph_info_ligated (const hb_glyph_info_t *info); +static inline bool _hb_glyph_info_substituted (const hb_glyph_info_t *info); static inline bool _hb_glyph_info_is_default_ignorable (const hb_glyph_info_t *info) { return (info->unicode_props() & UPROPS_MASK_IGNORABLE) && - !_hb_glyph_info_ligated (info); + !_hb_glyph_info_substituted (info); } static inline bool _hb_glyph_info_is_default_ignorable_and_not_hidden (const hb_glyph_info_t *info) diff --git a/thirdparty/harfbuzz/src/hb-ot-math.cc b/thirdparty/harfbuzz/src/hb-ot-math.cc index 9d8c6e735a..5781d25f2a 100644 --- a/thirdparty/harfbuzz/src/hb-ot-math.cc +++ b/thirdparty/harfbuzz/src/hb-ot-math.cc @@ -56,7 +56,7 @@ * * Tests whether a face has a `MATH` table. * - * Return value: true if the table is found, false otherwise + * Return value: %true if the table is found, %false otherwise * * Since: 1.3.3 **/ @@ -142,7 +142,7 @@ hb_ot_math_get_glyph_top_accent_attachment (hb_font_t *font, * * Tests whether the given glyph index is an extended shape in the face. 
* - * Return value: true if the glyph is an extended shape, false otherwise + * Return value: %true if the glyph is an extended shape, %false otherwise * * Since: 1.3.3 **/ diff --git a/thirdparty/harfbuzz/src/hb-ot-math.h b/thirdparty/harfbuzz/src/hb-ot-math.h index ad864a762d..d3ffa19d85 100644 --- a/thirdparty/harfbuzz/src/hb-ot-math.h +++ b/thirdparty/harfbuzz/src/hb-ot-math.h @@ -24,7 +24,7 @@ * Igalia Author(s): Frédéric Wang */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -40,18 +40,89 @@ HB_BEGIN_DECLS * MATH */ +/** + * HB_OT_TAG_MATH: + * + * OpenType [Mathematical Typesetting Table](https://docs.microsoft.com/en-us/typography/opentype/spec/math). + * + * Since: 1.3.3 + */ #define HB_OT_TAG_MATH HB_TAG('M','A','T','H') -/* Use with hb_buffer_set_script() for math shaping. */ +/** + * HB_OT_MATH_SCRIPT: + * + * OpenType script tag for math shaping, for use with + * Use with hb_buffer_set_script(). + * + * Since: 1.3.3 + */ #define HB_OT_MATH_SCRIPT HB_TAG('m','a','t','h') /* Types */ /** * hb_ot_math_constant_t: + * @HB_OT_MATH_CONSTANT_SCRIPT_PERCENT_SCALE_DOWN: scriptPercentScaleDown + * @HB_OT_MATH_CONSTANT_SCRIPT_SCRIPT_PERCENT_SCALE_DOWN: scriptScriptPercentScaleDown + * @HB_OT_MATH_CONSTANT_DELIMITED_SUB_FORMULA_MIN_HEIGHT: delimitedSubFormulaMinHeight + * @HB_OT_MATH_CONSTANT_DISPLAY_OPERATOR_MIN_HEIGHT: displayOperatorMinHeight + * @HB_OT_MATH_CONSTANT_MATH_LEADING: mathLeading + * @HB_OT_MATH_CONSTANT_AXIS_HEIGHT: axisHeight + * @HB_OT_MATH_CONSTANT_ACCENT_BASE_HEIGHT: accentBaseHeight + * @HB_OT_MATH_CONSTANT_FLATTENED_ACCENT_BASE_HEIGHT: flattenedAccentBaseHeight + * @HB_OT_MATH_CONSTANT_SUBSCRIPT_SHIFT_DOWN: subscriptShiftDown + * @HB_OT_MATH_CONSTANT_SUBSCRIPT_TOP_MAX: subscriptTopMax + * @HB_OT_MATH_CONSTANT_SUBSCRIPT_BASELINE_DROP_MIN: subscriptBaselineDropMin + * @HB_OT_MATH_CONSTANT_SUPERSCRIPT_SHIFT_UP: superscriptShiftUp + * @HB_OT_MATH_CONSTANT_SUPERSCRIPT_SHIFT_UP_CRAMPED: superscriptShiftUpCramped + * @HB_OT_MATH_CONSTANT_SUPERSCRIPT_BOTTOM_MIN: superscriptBottomMin + * @HB_OT_MATH_CONSTANT_SUPERSCRIPT_BASELINE_DROP_MAX: superscriptBaselineDropMax + * @HB_OT_MATH_CONSTANT_SUB_SUPERSCRIPT_GAP_MIN: subSuperscriptGapMin + * @HB_OT_MATH_CONSTANT_SUPERSCRIPT_BOTTOM_MAX_WITH_SUBSCRIPT: superscriptBottomMaxWithSubscript + * @HB_OT_MATH_CONSTANT_SPACE_AFTER_SCRIPT: spaceAfterScript + * @HB_OT_MATH_CONSTANT_UPPER_LIMIT_GAP_MIN: upperLimitGapMin + * @HB_OT_MATH_CONSTANT_UPPER_LIMIT_BASELINE_RISE_MIN: upperLimitBaselineRiseMin + * @HB_OT_MATH_CONSTANT_LOWER_LIMIT_GAP_MIN: lowerLimitGapMin + * @HB_OT_MATH_CONSTANT_LOWER_LIMIT_BASELINE_DROP_MIN: lowerLimitBaselineDropMin + * @HB_OT_MATH_CONSTANT_STACK_TOP_SHIFT_UP: stackTopShiftUp + * @HB_OT_MATH_CONSTANT_STACK_TOP_DISPLAY_STYLE_SHIFT_UP: stackTopDisplayStyleShiftUp + * @HB_OT_MATH_CONSTANT_STACK_BOTTOM_SHIFT_DOWN: stackBottomShiftDown + * @HB_OT_MATH_CONSTANT_STACK_BOTTOM_DISPLAY_STYLE_SHIFT_DOWN: stackBottomDisplayStyleShiftDown + * @HB_OT_MATH_CONSTANT_STACK_GAP_MIN: stackGapMin + * @HB_OT_MATH_CONSTANT_STACK_DISPLAY_STYLE_GAP_MIN: stackDisplayStyleGapMin + * @HB_OT_MATH_CONSTANT_STRETCH_STACK_TOP_SHIFT_UP: stretchStackTopShiftUp + * @HB_OT_MATH_CONSTANT_STRETCH_STACK_BOTTOM_SHIFT_DOWN: stretchStackBottomShiftDown + * @HB_OT_MATH_CONSTANT_STRETCH_STACK_GAP_ABOVE_MIN: stretchStackGapAboveMin + * @HB_OT_MATH_CONSTANT_STRETCH_STACK_GAP_BELOW_MIN: stretchStackGapBelowMin + * @HB_OT_MATH_CONSTANT_FRACTION_NUMERATOR_SHIFT_UP: 
fractionNumeratorShiftUp + * @HB_OT_MATH_CONSTANT_FRACTION_NUMERATOR_DISPLAY_STYLE_SHIFT_UP: fractionNumeratorDisplayStyleShiftUp + * @HB_OT_MATH_CONSTANT_FRACTION_DENOMINATOR_SHIFT_DOWN: fractionDenominatorShiftDown + * @HB_OT_MATH_CONSTANT_FRACTION_DENOMINATOR_DISPLAY_STYLE_SHIFT_DOWN: fractionDenominatorDisplayStyleShiftDown + * @HB_OT_MATH_CONSTANT_FRACTION_NUMERATOR_GAP_MIN: fractionNumeratorGapMin + * @HB_OT_MATH_CONSTANT_FRACTION_NUM_DISPLAY_STYLE_GAP_MIN: fractionNumDisplayStyleGapMin + * @HB_OT_MATH_CONSTANT_FRACTION_RULE_THICKNESS: fractionRuleThickness + * @HB_OT_MATH_CONSTANT_FRACTION_DENOMINATOR_GAP_MIN: fractionDenominatorGapMin + * @HB_OT_MATH_CONSTANT_FRACTION_DENOM_DISPLAY_STYLE_GAP_MIN: fractionDenomDisplayStyleGapMin + * @HB_OT_MATH_CONSTANT_SKEWED_FRACTION_HORIZONTAL_GAP: skewedFractionHorizontalGap + * @HB_OT_MATH_CONSTANT_SKEWED_FRACTION_VERTICAL_GAP: skewedFractionVerticalGap + * @HB_OT_MATH_CONSTANT_OVERBAR_VERTICAL_GAP: overbarVerticalGap + * @HB_OT_MATH_CONSTANT_OVERBAR_RULE_THICKNESS: overbarRuleThickness + * @HB_OT_MATH_CONSTANT_OVERBAR_EXTRA_ASCENDER: overbarExtraAscender + * @HB_OT_MATH_CONSTANT_UNDERBAR_VERTICAL_GAP: underbarVerticalGap + * @HB_OT_MATH_CONSTANT_UNDERBAR_RULE_THICKNESS: underbarRuleThickness + * @HB_OT_MATH_CONSTANT_UNDERBAR_EXTRA_DESCENDER: underbarExtraDescender + * @HB_OT_MATH_CONSTANT_RADICAL_VERTICAL_GAP: radicalVerticalGap + * @HB_OT_MATH_CONSTANT_RADICAL_DISPLAY_STYLE_VERTICAL_GAP: radicalDisplayStyleVerticalGap + * @HB_OT_MATH_CONSTANT_RADICAL_RULE_THICKNESS: radicalRuleThickness + * @HB_OT_MATH_CONSTANT_RADICAL_EXTRA_ASCENDER: radicalExtraAscender + * @HB_OT_MATH_CONSTANT_RADICAL_KERN_BEFORE_DEGREE: radicalKernBeforeDegree + * @HB_OT_MATH_CONSTANT_RADICAL_KERN_AFTER_DEGREE: radicalKernAfterDegree + * @HB_OT_MATH_CONSTANT_RADICAL_DEGREE_BOTTOM_RAISE_PERCENT: radicalDegreeBottomRaisePercent * - * The 'MATH' table constants specified at - * https://docs.microsoft.com/en-us/typography/opentype/spec/math + * The 'MATH' table constants, refer to + * [OpenType documentation](https://docs.microsoft.com/en-us/typography/opentype/spec/math#mathconstants-table) + * For more explanations. * * Since: 1.3.3 */ @@ -116,6 +187,10 @@ typedef enum { /** * hb_ot_math_kern_t: + * @HB_OT_MATH_KERN_TOP_RIGHT: The top right corner of the glyph. + * @HB_OT_MATH_KERN_TOP_LEFT: The top left corner of the glyph. + * @HB_OT_MATH_KERN_BOTTOM_RIGHT: The bottom right corner of the glyph. + * @HB_OT_MATH_KERN_BOTTOM_LEFT: The bottom left corner of the glyph. * * The math kerning-table types defined for the four corners * of a glyph. @@ -145,6 +220,8 @@ typedef struct hb_ot_math_glyph_variant_t { /** * hb_ot_math_glyph_part_flags_t: + * @HB_OT_MATH_GLYPH_PART_FLAG_EXTENDER: This is an extender glyph part that + * can be repeated to reach the desired length. * * Flags for math glyph parts. * diff --git a/thirdparty/harfbuzz/src/hb-ot-meta.cc b/thirdparty/harfbuzz/src/hb-ot-meta.cc index 54a0e10f9b..35c8eb523f 100644 --- a/thirdparty/harfbuzz/src/hb-ot-meta.cc +++ b/thirdparty/harfbuzz/src/hb-ot-meta.cc @@ -41,9 +41,11 @@ * hb_ot_meta_get_entry_tags: * @face: a face object * @start_offset: iteration's start offset - * @entries_count:(inout) (allow-none): buffer size as input, filled size as output + * @entries_count:(inout) (optional): buffer size as input, filled size as output * @entries: (out caller-allocates) (array length=entries_count): entries tags buffer * + * Fetches all available feature types. + * * Return value: Number of all available feature types. 
* * Since: 2.6.0 diff --git a/thirdparty/harfbuzz/src/hb-ot-meta.h b/thirdparty/harfbuzz/src/hb-ot-meta.h index 0278d84148..7748eb4958 100644 --- a/thirdparty/harfbuzz/src/hb-ot-meta.h +++ b/thirdparty/harfbuzz/src/hb-ot-meta.h @@ -22,7 +22,7 @@ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -54,6 +54,7 @@ typedef enum { HB_OT_META_TAG_DESIGN_LANGUAGES = HB_TAG ('d','l','n','g'), HB_OT_META_TAG_SUPPORTED_LANGUAGES = HB_TAG ('s','l','n','g'), + /*< private >*/ _HB_OT_META_TAG_MAX_VALUE = HB_TAG_MAX_SIGNED /*< skip >*/ } hb_ot_meta_tag_t; diff --git a/thirdparty/harfbuzz/src/hb-ot-metrics.cc b/thirdparty/harfbuzz/src/hb-ot-metrics.cc index 3065ea2adf..72aeff82d6 100644 --- a/thirdparty/harfbuzz/src/hb-ot-metrics.cc +++ b/thirdparty/harfbuzz/src/hb-ot-metrics.cc @@ -119,11 +119,11 @@ _get_gasp (hb_face_t *face, float *result, hb_ot_metrics_tag_t metrics_tag) /** * hb_ot_metrics_get_position: - * @font: a #hb_font_t object. + * @font: an #hb_font_t object. * @metrics_tag: tag of metrics value you like to fetch. * @position: (out) (optional): result of metrics value from the font. * - * It fetches metrics value corresponding to a given tag from a font. + * Fetches metrics value corresponding to @metrics_tag from @font. * * Returns: Whether found the requested metrics in the font. * Since: 2.6.0 @@ -193,10 +193,13 @@ hb_ot_metrics_get_position (hb_font_t *font, #ifndef HB_NO_VAR /** * hb_ot_metrics_get_variation: - * @font: - * @metrics_tag: + * @font: an #hb_font_t object. + * @metrics_tag: tag of metrics value you like to fetch. + * + * Fetches metrics value corresponding to @metrics_tag from @font with the + * current font variation settings applied. * - * Returns: + * Returns: The requested metric value. * * Since: 2.6.0 **/ @@ -208,10 +211,13 @@ hb_ot_metrics_get_variation (hb_font_t *font, hb_ot_metrics_tag_t metrics_tag) /** * hb_ot_metrics_get_x_variation: - * @font: - * @metrics_tag: + * @font: an #hb_font_t object. + * @metrics_tag: tag of metrics value you like to fetch. * - * Returns: + * Fetches horizontal metrics value corresponding to @metrics_tag from @font + * with the current font variation settings applied. + * + * Returns: The requested metric value. * * Since: 2.6.0 **/ @@ -223,10 +229,13 @@ hb_ot_metrics_get_x_variation (hb_font_t *font, hb_ot_metrics_tag_t metrics_tag) /** * hb_ot_metrics_get_y_variation: - * @font: - * @metrics_tag: + * @font: an #hb_font_t object. + * @metrics_tag: tag of metrics value you like to fetch. + * + * Fetches vertical metrics value corresponding to @metrics_tag from @font with + * the current font variation settings applied. * - * Returns: + * Returns: The requested metric value. * * Since: 2.6.0 **/ diff --git a/thirdparty/harfbuzz/src/hb-ot-metrics.h b/thirdparty/harfbuzz/src/hb-ot-metrics.h index 42c7363c03..5841fc8b0f 100644 --- a/thirdparty/harfbuzz/src/hb-ot-metrics.h +++ b/thirdparty/harfbuzz/src/hb-ot-metrics.h @@ -22,7 +22,7 @@ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -66,7 +66,8 @@ HB_BEGIN_DECLS * @HB_OT_METRICS_TAG_UNDERLINE_SIZE: underline size. * @HB_OT_METRICS_TAG_UNDERLINE_OFFSET: underline offset. 
* - * From https://docs.microsoft.com/en-us/typography/opentype/spec/mvar#value-tags + * Metric tags corresponding to [MVAR Value + * Tags](https://docs.microsoft.com/en-us/typography/opentype/spec/mvar#value-tags) * * Since: 2.6.0 **/ @@ -100,6 +101,7 @@ typedef enum { HB_OT_METRICS_TAG_UNDERLINE_SIZE = HB_TAG ('u','n','d','s'), HB_OT_METRICS_TAG_UNDERLINE_OFFSET = HB_TAG ('u','n','d','o'), + /*< private >*/ _HB_OT_METRICS_TAG_MAX_VALUE = HB_TAG_MAX_SIGNED /*< skip >*/ } hb_ot_metrics_tag_t; diff --git a/thirdparty/harfbuzz/src/hb-ot-name.cc b/thirdparty/harfbuzz/src/hb-ot-name.cc index 10122b8c2e..4588226e6e 100644 --- a/thirdparty/harfbuzz/src/hb-ot-name.cc +++ b/thirdparty/harfbuzz/src/hb-ot-name.cc @@ -46,7 +46,7 @@ /** * hb_ot_name_list_names: * @face: font face. - * @num_entries: (out) (allow-none): number of returned entries. + * @num_entries: (out) (optional): number of returned entries. * * Enumerates all available name IDs and language combinations. Returned * array is owned by the @face and should not be modified. It can be @@ -150,7 +150,7 @@ hb_ot_name_get_utf (hb_face_t *face, * @face: font face. * @name_id: OpenType name identifier to fetch. * @language: language to fetch the name for. - * @text_size: (inout) (allow-none): input size of @text buffer, and output size of + * @text_size: (inout) (optional): input size of @text buffer, and output size of * text written to buffer. * @text: (out caller-allocates) (array length=text_size): buffer to write fetched name into. * @@ -177,7 +177,7 @@ hb_ot_name_get_utf8 (hb_face_t *face, * @face: font face. * @name_id: OpenType name identifier to fetch. * @language: language to fetch the name for. - * @text_size: (inout) (allow-none): input size of @text buffer, and output size of + * @text_size: (inout) (optional): input size of @text buffer, and output size of * text written to buffer. * @text: (out caller-allocates) (array length=text_size): buffer to write fetched name into. * @@ -203,7 +203,7 @@ hb_ot_name_get_utf16 (hb_face_t *face, * @face: font face. * @name_id: OpenType name identifier to fetch. * @language: language to fetch the name for. - * @text_size: (inout) (allow-none): input size of @text buffer, and output size of + * @text_size: (inout) (optional): input size of @text buffer, and output size of * text written to buffer. * @text: (out caller-allocates) (array length=text_size): buffer to write fetched name into. * diff --git a/thirdparty/harfbuzz/src/hb-ot-name.h b/thirdparty/harfbuzz/src/hb-ot-name.h index 6f3fcd2427..9359014c8a 100644 --- a/thirdparty/harfbuzz/src/hb-ot-name.h +++ b/thirdparty/harfbuzz/src/hb-ot-name.h @@ -22,7 +22,7 @@ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." 
#endif diff --git a/thirdparty/harfbuzz/src/hb-ot-os2-table.hh b/thirdparty/harfbuzz/src/hb-ot-os2-table.hh index 7d31b712c4..8e98f87f4e 100644 --- a/thirdparty/harfbuzz/src/hb-ot-os2-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-os2-table.hh @@ -177,15 +177,14 @@ struct OS2 if (!c->plan->glyphs_requested->is_empty ()) { hb_map_t unicode_glyphid_map; - + OT::cmap::accelerator_t cmap; cmap.init (c->plan->source); cmap.collect_mapping (&unicodes, &unicode_glyphid_map); cmap.fini (); - - if (c->plan->unicodes->is_empty ()) unicodes.clear (); - else hb_set_set (&unicodes, c->plan->unicodes); - + + hb_set_set (&unicodes, c->plan->unicodes); + + unicode_glyphid_map.iter () | hb_filter (c->plan->glyphs_requested, hb_second) | hb_map (hb_first) diff --git a/thirdparty/harfbuzz/src/hb-ot-post-table.hh b/thirdparty/harfbuzz/src/hb-ot-post-table.hh index 8586331cd4..f22d6e244d 100644 --- a/thirdparty/harfbuzz/src/hb-ot-post-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-post-table.hh @@ -87,7 +87,6 @@ struct post if (unlikely (!post_prime)) return_trace (false); serialize (c->serializer); - if (c->serializer->in_error () || c->serializer->ran_out_of_room) return_trace (false); return_trace (true); } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh index b15e145f2f..41e3dd38ab 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh @@ -142,7 +142,7 @@ OT_UARRAY(Name##Substitute, OT_LIST(ToGlyphs)) \ ) \ OT_COVERAGE1(Name##Coverage, OT_LIST(FromGlyphs)) \ - /* ASSERT_STATIC_EXPR_ZERO (len(FromGlyphs) == len(ToGlyphs)) */ + /* static_assert_expr (len(FromGlyphs) == len(ToGlyphs)) */ #define OT_SUBLOOKUP_LIGATURE_SUBST_FORMAT1(Name, FirstGlyphs, LigatureSetOffsets) \ OT_SUBLOOKUP(Name, 1, \ @@ -151,7 +151,7 @@ OT_UARRAY(Name##LigatureSetOffsetsArray, OT_LIST(LigatureSetOffsets)) \ ) \ OT_COVERAGE1(Name##Coverage, OT_LIST(FirstGlyphs)) \ - /* ASSERT_STATIC_EXPR_ZERO (len(FirstGlyphs) == len(LigatureSetOffsets)) */ + /* static_assert_expr (len(FirstGlyphs) == len(LigatureSetOffsets)) */ #define OT_LIGATURE_SET(Name, LigatureSetOffsets) \ OT_UARRAY(Name, OT_LIST(LigatureSetOffsets)) diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc index 1e93f0efd5..1f244f940c 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc @@ -33,7 +33,7 @@ /* buffer var allocations */ -#define arabic_shaping_action() complex_var_u8_0() /* arabic shaping action */ +#define arabic_shaping_action() complex_var_u8_auxiliary() /* arabic shaping action */ #define HB_BUFFER_SCRATCH_FLAG_ARABIC_HAS_STCH HB_BUFFER_SCRATCH_FLAG_COMPLEX0 diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc index f5915f43ae..dbedd6af0c 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc @@ -119,7 +119,7 @@ data_destroy_hangul (void *data) #define isHangulTone(u) (hb_in_range<hb_codepoint_t> ((u), 0x302Eu, 0x302Fu)) /* buffer var allocations */ -#define hangul_shaping_feature() complex_var_u8_0() /* hangul jamo shaping feature */ +#define hangul_shaping_feature() complex_var_u8_auxiliary() /* hangul jamo shaping feature */ static bool is_zero_width_char (hb_font_t *font, @@ -205,7 +205,7 @@ 
preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, { /* Tone mark follows a valid syllable; move it in front, unless it's zero width. */ buffer->unsafe_to_break_from_outbuffer (start, buffer->idx); - buffer->next_glyph (); + if (unlikely (!buffer->next_glyph ())) break; if (!is_zero_width_char (font, u)) { buffer->merge_out_clusters (start, end + 1); @@ -218,23 +218,25 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, else { /* No valid syllable as base for tone mark; try to insert dotted circle. */ - if (!(buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE) && - font->has_glyph (0x25CCu)) + if (!(buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE) && + font->has_glyph (0x25CCu)) { hb_codepoint_t chars[2]; - if (!is_zero_width_char (font, u)) { + if (!is_zero_width_char (font, u)) + { chars[0] = u; chars[1] = 0x25CCu; - } else { + } else + { chars[0] = 0x25CCu; chars[1] = u; } - buffer->replace_glyphs (1, 2, chars); + (void) buffer->replace_glyphs (1, 2, chars); } else { /* No dotted circle available in the font; just leave tone mark untouched. */ - buffer->next_glyph (); + (void) buffer->next_glyph (); } } start = end = buffer->out_len; @@ -271,9 +273,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_codepoint_t s = SBase + (l - LBase) * NCount + (v - VBase) * TCount + tindex; if (font->has_glyph (s)) { - buffer->replace_glyphs (t ? 3 : 2, 1, &s); - if (unlikely (!buffer->successful)) - return; + (void) buffer->replace_glyphs (t ? 3 : 2, 1, &s); end = start + 1; continue; } @@ -285,17 +285,19 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, * Set jamo features on the individual glyphs, and advance past them. */ buffer->cur().hangul_shaping_feature() = LJMO; - buffer->next_glyph (); + (void) buffer->next_glyph (); buffer->cur().hangul_shaping_feature() = VJMO; - buffer->next_glyph (); + (void) buffer->next_glyph (); if (t) { buffer->cur().hangul_shaping_feature() = TJMO; - buffer->next_glyph (); + (void) buffer->next_glyph (); end = start + 3; } else end = start + 2; + if (unlikely (!buffer->successful)) + break; if (buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES) buffer->merge_out_clusters (start, end); continue; @@ -321,9 +323,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_codepoint_t new_s = s + new_tindex; if (font->has_glyph (new_s)) { - buffer->replace_glyphs (2, 1, &new_s); - if (unlikely (!buffer->successful)) - return; + (void) buffer->replace_glyphs (2, 1, &new_s); end = start + 1; continue; } @@ -347,19 +347,18 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, (!tindex || font->has_glyph (decomposed[2]))) { unsigned int s_len = tindex ? 3 : 2; - buffer->replace_glyphs (1, s_len, decomposed); + (void) buffer->replace_glyphs (1, s_len, decomposed); /* If we decomposed an LV because of a non-combining T following, * we want to include this T in the syllable. */ if (has_glyph && !tindex) { - buffer->next_glyph (); + (void) buffer->next_glyph (); s_len++; } - if (unlikely (!buffer->successful)) - return; + break; /* We decomposed S: apply jamo features to the individual glyphs * that are now in buffer->out_info. @@ -383,17 +382,15 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED, if (has_glyph) { - /* We didn't decompose the S, so just advance past it. */ + /* We didn't decompose the S, so just advance past it and fall through. 
*/ end = start + 1; - buffer->next_glyph (); - continue; } } /* Didn't find a recognizable syllable, so we leave end <= start; * this will prevent tone-mark reordering happening. */ - buffer->next_glyph (); + (void) buffer->next_glyph (); } buffer->swap_buffers (); } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-machine.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-machine.hh index 670b6bf486..74bf3ca0fa 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-machine.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-machine.hh @@ -31,8 +31,37 @@ #include "hb.hh" +enum indic_syllable_type_t { + indic_consonant_syllable, + indic_vowel_syllable, + indic_standalone_cluster, + indic_symbol_cluster, + indic_broken_cluster, + indic_non_indic_cluster, +}; + -#line 36 "hb-ot-shape-complex-indic-machine.hh" +#line 45 "hb-ot-shape-complex-indic-machine.hh" +#define indic_syllable_machine_ex_A 10u +#define indic_syllable_machine_ex_C 1u +#define indic_syllable_machine_ex_CM 17u +#define indic_syllable_machine_ex_CS 19u +#define indic_syllable_machine_ex_DOTTEDCIRCLE 12u +#define indic_syllable_machine_ex_H 4u +#define indic_syllable_machine_ex_M 7u +#define indic_syllable_machine_ex_N 3u +#define indic_syllable_machine_ex_PLACEHOLDER 11u +#define indic_syllable_machine_ex_RS 13u +#define indic_syllable_machine_ex_Ra 16u +#define indic_syllable_machine_ex_Repha 15u +#define indic_syllable_machine_ex_SM 8u +#define indic_syllable_machine_ex_Symbol 18u +#define indic_syllable_machine_ex_V 2u +#define indic_syllable_machine_ex_ZWJ 6u +#define indic_syllable_machine_ex_ZWNJ 5u + + +#line 65 "hb-ot-shape-complex-indic-machine.hh" static const unsigned char _indic_syllable_machine_trans_keys[] = { 8u, 8u, 4u, 8u, 5u, 7u, 5u, 8u, 4u, 8u, 6u, 6u, 16u, 16u, 4u, 8u, 4u, 13u, 4u, 8u, 8u, 8u, 5u, 7u, 5u, 8u, 4u, 8u, 6u, 6u, 16u, 16u, @@ -384,18 +413,18 @@ static const int indic_syllable_machine_error = -1; static const int indic_syllable_machine_en_main = 39; -#line 36 "hb-ot-shape-complex-indic-machine.rl" +#line 46 "hb-ot-shape-complex-indic-machine.rl" -#line 93 "hb-ot-shape-complex-indic-machine.rl" +#line 102 "hb-ot-shape-complex-indic-machine.rl" #define found_syllable(syllable_type) \ HB_STMT_START { \ if (0) fprintf (stderr, "syllable %d..%d %s\n", ts, te, #syllable_type); \ for (unsigned int i = ts; i < te; i++) \ - info[i].syllable() = (syllable_serial << 4) | indic_##syllable_type; \ + info[i].syllable() = (syllable_serial << 4) | syllable_type; \ syllable_serial++; \ if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ } HB_STMT_END @@ -407,7 +436,7 @@ find_syllables_indic (hb_buffer_t *buffer) int cs; hb_glyph_info_t *info = buffer->info; -#line 411 "hb-ot-shape-complex-indic-machine.hh" +#line 440 "hb-ot-shape-complex-indic-machine.hh" { cs = indic_syllable_machine_start; ts = 0; @@ -415,7 +444,7 @@ find_syllables_indic (hb_buffer_t *buffer) act = 0; } -#line 113 "hb-ot-shape-complex-indic-machine.rl" +#line 122 "hb-ot-shape-complex-indic-machine.rl" p = 0; @@ -423,7 +452,7 @@ find_syllables_indic (hb_buffer_t *buffer) unsigned int syllable_serial = 1; -#line 427 "hb-ot-shape-complex-indic-machine.hh" +#line 456 "hb-ot-shape-complex-indic-machine.hh" { int _slen; int _trans; @@ -437,7 +466,7 @@ _resume: #line 1 "NONE" {ts = p;} break; -#line 441 "hb-ot-shape-complex-indic-machine.hh" +#line 470 "hb-ot-shape-complex-indic-machine.hh" } _keys = _indic_syllable_machine_trans_keys + (cs<<1); @@ -460,64 +489,64 @@ _eof_trans: {te = p+1;} break; case 11: 
-#line 89 "hb-ot-shape-complex-indic-machine.rl" - {te = p+1;{ found_syllable (non_indic_cluster); }} +#line 98 "hb-ot-shape-complex-indic-machine.rl" + {te = p+1;{ found_syllable (indic_non_indic_cluster); }} break; case 13: -#line 84 "hb-ot-shape-complex-indic-machine.rl" - {te = p;p--;{ found_syllable (consonant_syllable); }} +#line 93 "hb-ot-shape-complex-indic-machine.rl" + {te = p;p--;{ found_syllable (indic_consonant_syllable); }} break; case 14: -#line 85 "hb-ot-shape-complex-indic-machine.rl" - {te = p;p--;{ found_syllable (vowel_syllable); }} +#line 94 "hb-ot-shape-complex-indic-machine.rl" + {te = p;p--;{ found_syllable (indic_vowel_syllable); }} break; case 17: -#line 86 "hb-ot-shape-complex-indic-machine.rl" - {te = p;p--;{ found_syllable (standalone_cluster); }} +#line 95 "hb-ot-shape-complex-indic-machine.rl" + {te = p;p--;{ found_syllable (indic_standalone_cluster); }} break; case 19: -#line 87 "hb-ot-shape-complex-indic-machine.rl" - {te = p;p--;{ found_syllable (symbol_cluster); }} +#line 96 "hb-ot-shape-complex-indic-machine.rl" + {te = p;p--;{ found_syllable (indic_symbol_cluster); }} break; case 15: -#line 88 "hb-ot-shape-complex-indic-machine.rl" - {te = p;p--;{ found_syllable (broken_cluster); }} +#line 97 "hb-ot-shape-complex-indic-machine.rl" + {te = p;p--;{ found_syllable (indic_broken_cluster); }} break; case 16: -#line 89 "hb-ot-shape-complex-indic-machine.rl" - {te = p;p--;{ found_syllable (non_indic_cluster); }} +#line 98 "hb-ot-shape-complex-indic-machine.rl" + {te = p;p--;{ found_syllable (indic_non_indic_cluster); }} break; case 1: -#line 84 "hb-ot-shape-complex-indic-machine.rl" - {{p = ((te))-1;}{ found_syllable (consonant_syllable); }} +#line 93 "hb-ot-shape-complex-indic-machine.rl" + {{p = ((te))-1;}{ found_syllable (indic_consonant_syllable); }} break; case 3: -#line 85 "hb-ot-shape-complex-indic-machine.rl" - {{p = ((te))-1;}{ found_syllable (vowel_syllable); }} +#line 94 "hb-ot-shape-complex-indic-machine.rl" + {{p = ((te))-1;}{ found_syllable (indic_vowel_syllable); }} break; case 7: -#line 86 "hb-ot-shape-complex-indic-machine.rl" - {{p = ((te))-1;}{ found_syllable (standalone_cluster); }} +#line 95 "hb-ot-shape-complex-indic-machine.rl" + {{p = ((te))-1;}{ found_syllable (indic_standalone_cluster); }} break; case 8: -#line 87 "hb-ot-shape-complex-indic-machine.rl" - {{p = ((te))-1;}{ found_syllable (symbol_cluster); }} +#line 96 "hb-ot-shape-complex-indic-machine.rl" + {{p = ((te))-1;}{ found_syllable (indic_symbol_cluster); }} break; case 4: -#line 88 "hb-ot-shape-complex-indic-machine.rl" - {{p = ((te))-1;}{ found_syllable (broken_cluster); }} +#line 97 "hb-ot-shape-complex-indic-machine.rl" + {{p = ((te))-1;}{ found_syllable (indic_broken_cluster); }} break; case 6: #line 1 "NONE" { switch( act ) { case 1: - {{p = ((te))-1;} found_syllable (consonant_syllable); } + {{p = ((te))-1;} found_syllable (indic_consonant_syllable); } break; case 5: - {{p = ((te))-1;} found_syllable (broken_cluster); } + {{p = ((te))-1;} found_syllable (indic_broken_cluster); } break; case 6: - {{p = ((te))-1;} found_syllable (non_indic_cluster); } + {{p = ((te))-1;} found_syllable (indic_non_indic_cluster); } break; } } @@ -525,22 +554,22 @@ _eof_trans: case 18: #line 1 "NONE" {te = p+1;} -#line 84 "hb-ot-shape-complex-indic-machine.rl" +#line 93 "hb-ot-shape-complex-indic-machine.rl" {act = 1;} break; case 5: #line 1 "NONE" {te = p+1;} -#line 88 "hb-ot-shape-complex-indic-machine.rl" +#line 97 "hb-ot-shape-complex-indic-machine.rl" {act = 5;} break; case 12: #line 1 
"NONE" {te = p+1;} -#line 89 "hb-ot-shape-complex-indic-machine.rl" +#line 98 "hb-ot-shape-complex-indic-machine.rl" {act = 6;} break; -#line 544 "hb-ot-shape-complex-indic-machine.hh" +#line 573 "hb-ot-shape-complex-indic-machine.hh" } _again: @@ -549,7 +578,7 @@ _again: #line 1 "NONE" {ts = 0;} break; -#line 553 "hb-ot-shape-complex-indic-machine.hh" +#line 582 "hb-ot-shape-complex-indic-machine.hh" } if ( ++p != pe ) @@ -565,7 +594,7 @@ _again: } -#line 121 "hb-ot-shape-complex-indic-machine.rl" +#line 130 "hb-ot-shape-complex-indic-machine.rl" } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-table.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-table.cc index a150fd2486..dd204b23c1 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-table.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic-table.cc @@ -82,7 +82,7 @@ #define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M) -static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = { +static const uint16_t indic_table[] = { #define indic_offset_0x0028u 0 @@ -404,7 +404,7 @@ static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = { }; /* Table items: 1792; occupancy: 70% */ -INDIC_TABLE_ELEMENT_TYPE +uint16_t hb_indic_get_categories (hb_codepoint_t u) { switch (u >> 12) diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.cc index 652ef47040..a4f2d9a847 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.cc @@ -29,6 +29,7 @@ #ifndef HB_NO_OT_SHAPE #include "hb-ot-shape-complex-indic.hh" +#include "hb-ot-shape-complex-indic-machine.hh" #include "hb-ot-shape-complex-vowel-constraints.hh" #include "hb-ot-layout.hh" @@ -337,19 +338,6 @@ consonant_position_from_face (const indic_shape_plan_t *indic_plan, return POS_BASE_C; } - -enum indic_syllable_type_t { - indic_consonant_syllable, - indic_vowel_syllable, - indic_standalone_cluster, - indic_symbol_cluster, - indic_broken_cluster, - indic_non_indic_cluster, -}; - -#include "hb-ot-shape-complex-indic-machine.hh" - - static void setup_masks_indic (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_buffer_t *buffer, @@ -764,7 +752,28 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, * We could use buffer->sort() for this, if there was no special * reordering of pre-base stuff happening later... * We don't want to merge_clusters all of that, which buffer->sort() - * would. + * would. Here's a concrete example: + * + * Assume there's a pre-base consonant and explicit Halant before base, + * followed by a prebase-reordering (left) Matra: + * + * C,H,ZWNJ,B,M + * + * At this point in reordering we would have: + * + * M,C,H,ZWNJ,B + * + * whereas in final reordering we will bring the Matra closer to Base: + * + * C,H,ZWNJ,M,B + * + * That's why we don't want to merge-clusters anything before the Base + * at this point. But if something moved from after Base to before it, + * we should merge clusters from base to them. In final-reordering, we + * only move things around before base, and merge-clusters up to base. + * These two merge-clusters from the two sides of base will interlock + * to merge things correctly. 
See: + * https://github.com/harfbuzz/harfbuzz/issues/2272 */ if (indic_plan->is_old_spec || end - start > 127) buffer->merge_clusters (base, end); @@ -774,17 +783,18 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, for (unsigned int i = base; i < end; i++) if (info[i].syllable() != 255) { + unsigned int min = i; unsigned int max = i; unsigned int j = start + info[i].syllable(); while (j != i) { + min = hb_min (min, j); max = hb_max (max, j); unsigned int next = start + info[j].syllable(); info[j].syllable() = 255; /* So we don't process j later again. */ j = next; } - if (i != max) - buffer->merge_clusters (i, max + 1); + buffer->merge_clusters (hb_max (base, min), max + 1); } } @@ -938,69 +948,6 @@ initial_reordering_syllable_indic (const hb_ot_shape_plan_t *plan, } } -static inline void -insert_dotted_circles_indic (const hb_ot_shape_plan_t *plan HB_UNUSED, - hb_font_t *font, - hb_buffer_t *buffer) -{ - if (unlikely (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)) - return; - - /* Note: This loop is extra overhead, but should not be measurable. - * TODO Use a buffer scratch flag to remove the loop. */ - bool has_broken_syllables = false; - unsigned int count = buffer->len; - hb_glyph_info_t *info = buffer->info; - for (unsigned int i = 0; i < count; i++) - if ((info[i].syllable() & 0x0F) == indic_broken_cluster) - { - has_broken_syllables = true; - break; - } - if (likely (!has_broken_syllables)) - return; - - - hb_codepoint_t dottedcircle_glyph; - if (!font->get_nominal_glyph (0x25CCu, &dottedcircle_glyph)) - return; - - hb_glyph_info_t dottedcircle = {0}; - dottedcircle.codepoint = 0x25CCu; - set_indic_properties (dottedcircle); - dottedcircle.codepoint = dottedcircle_glyph; - - buffer->clear_output (); - - buffer->idx = 0; - unsigned int last_syllable = 0; - while (buffer->idx < buffer->len && buffer->successful) - { - unsigned int syllable = buffer->cur().syllable(); - indic_syllable_type_t syllable_type = (indic_syllable_type_t) (syllable & 0x0F); - if (unlikely (last_syllable != syllable && syllable_type == indic_broken_cluster)) - { - last_syllable = syllable; - - hb_glyph_info_t ginfo = dottedcircle; - ginfo.cluster = buffer->cur().cluster; - ginfo.mask = buffer->cur().mask; - ginfo.syllable() = buffer->cur().syllable(); - - /* Insert dottedcircle after possible Repha. 
*/ - while (buffer->idx < buffer->len && buffer->successful && - last_syllable == buffer->cur().syllable() && - buffer->cur().indic_category() == OT_Repha) - buffer->next_glyph (); - - buffer->output_info (ginfo); - } - else - buffer->next_glyph (); - } - buffer->swap_buffers (); -} - static void initial_reordering_indic (const hb_ot_shape_plan_t *plan, hb_font_t *font, @@ -1008,11 +955,16 @@ initial_reordering_indic (const hb_ot_shape_plan_t *plan, { if (!buffer->message (font, "start reordering indic initial")) return; + update_consonant_positions_indic (plan, font, buffer); - insert_dotted_circles_indic (plan, font, buffer); + hb_syllabic_insert_dotted_circles (font, buffer, + indic_broken_cluster, + OT_DOTTEDCIRCLE, + OT_Repha); foreach_syllable (buffer, start, end) initial_reordering_syllable_indic (plan, font->face, buffer, start, end); + (void) buffer->message (font, "end reordering indic initial"); } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.hh index 41bd8bd6cc..dcb28a4e84 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-indic.hh @@ -29,16 +29,14 @@ #include "hb.hh" -#include "hb-ot-shape-complex.hh" +#include "hb-ot-shape-complex-syllabic.hh" /* buffer var allocations */ -#define indic_category() complex_var_u8_0() /* indic_category_t */ -#define indic_position() complex_var_u8_1() /* indic_position_t */ +#define indic_category() complex_var_u8_category() /* indic_category_t */ +#define indic_position() complex_var_u8_auxiliary() /* indic_position_t */ -#define INDIC_TABLE_ELEMENT_TYPE uint16_t - /* Cateories used in the OpenType spec: * https://docs.microsoft.com/en-us/typography/script-development/devanagari */ @@ -177,7 +175,7 @@ enum indic_matra_category_t { #define INDIC_COMBINE_CATEGORIES(S,M) \ ( \ - ASSERT_STATIC_EXPR_ZERO (S < 255 && M < 255) + \ + static_assert_expr (S < 255 && M < 255) + \ ( S | \ ( \ ( \ @@ -194,7 +192,7 @@ enum indic_matra_category_t { ) \ ) -HB_INTERNAL INDIC_TABLE_ELEMENT_TYPE +HB_INTERNAL uint16_t hb_indic_get_categories (hb_codepoint_t u); @@ -307,17 +305,12 @@ static const hb_codepoint_t ra_chars[] = { 0x0D30u, /* Malayalam */ /* No Reph, Logical Repha */ 0x0DBBu, /* Sinhala */ /* Reph formed only with ZWJ */ - - 0x179Au, /* Khmer */ }; static inline bool is_ra (hb_codepoint_t u) { - for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++) - if (u == ra_chars[i]) - return true; - return false; + return hb_array (ra_chars).lfind (u); } static inline void @@ -325,7 +318,7 @@ set_indic_properties (hb_glyph_info_t &info) { hb_codepoint_t u = info.codepoint; unsigned int type = hb_indic_get_categories (u); - indic_category_t cat = (indic_category_t) (type & 0x7Fu); + indic_category_t cat = (indic_category_t) (type & 0xFFu); indic_position_t pos = (indic_position_t) (type >> 8); @@ -370,6 +363,7 @@ set_indic_properties (hb_glyph_info_t &info) else if (unlikely (u == 0x1133Bu || u == 0x1133Cu)) cat = OT_N; else if (unlikely (u == 0x0AFBu)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/552 */ + else if (unlikely (u == 0x0B55u)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/2849 */ else if (unlikely (u == 0x0980u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/issues/538 */ else if (unlikely (u == 0x09FCu)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/1613 */ diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.hh 
b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.hh index a040318d34..82ab186a41 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.hh @@ -1,211 +1,179 @@ - #line 1 "hb-ot-shape-complex-khmer-machine.rl" /* - * Copyright © 2011,2012 Google, Inc. - * - * This is part of HarfBuzz, a text shaping library. - * - * Permission is hereby granted, without written agreement and without - * license or royalty fees, to use, copy, modify, and distribute this - * software and its documentation for any purpose, provided that the - * above copyright notice and the following two paragraphs appear in - * all copies of this software. - * - * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR - * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES - * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN - * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS - * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO - * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - * - * Google Author(s): Behdad Esfahbod - */ +* Copyright © 2011,2012 Google, Inc. +* +* This is part of HarfBuzz, a text shaping library. +* +* Permission is hereby granted, without written agreement and without +* license or royalty fees, to use, copy, modify, and distribute this +* software and its documentation for any purpose, provided that the +* above copyright notice and the following two paragraphs appear in +* all copies of this software. +* +* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR +* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES +* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN +* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +* DAMAGE. +* +* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, +* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS +* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO +* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+* +* Google Author(s): Behdad Esfahbod +*/ #ifndef HB_OT_SHAPE_COMPLEX_KHMER_MACHINE_HH #define HB_OT_SHAPE_COMPLEX_KHMER_MACHINE_HH #include "hb.hh" +enum khmer_syllable_type_t { + khmer_consonant_syllable, + khmer_broken_cluster, + khmer_non_khmer_cluster, +}; + -#line 36 "hb-ot-shape-complex-khmer-machine.hh" +#line 41 "hb-ot-shape-complex-khmer-machine.hh" +#define khmer_syllable_machine_ex_C 1u +#define khmer_syllable_machine_ex_Coeng 14u +#define khmer_syllable_machine_ex_DOTTEDCIRCLE 12u +#define khmer_syllable_machine_ex_PLACEHOLDER 11u +#define khmer_syllable_machine_ex_Ra 16u +#define khmer_syllable_machine_ex_Robatic 20u +#define khmer_syllable_machine_ex_V 2u +#define khmer_syllable_machine_ex_VAbv 26u +#define khmer_syllable_machine_ex_VBlw 27u +#define khmer_syllable_machine_ex_VPre 28u +#define khmer_syllable_machine_ex_VPst 29u +#define khmer_syllable_machine_ex_Xgroup 21u +#define khmer_syllable_machine_ex_Ygroup 22u +#define khmer_syllable_machine_ex_ZWJ 6u +#define khmer_syllable_machine_ex_ZWNJ 5u + + +#line 59 "hb-ot-shape-complex-khmer-machine.hh" static const unsigned char _khmer_syllable_machine_trans_keys[] = { - 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u, - 5u, 26u, 5u, 21u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, - 5u, 21u, 5u, 26u, 5u, 21u, 5u, 26u, 1u, 29u, 5u, 29u, 5u, 29u, 5u, 29u, - 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 5u, 29u, 1u, 16u, 5u, 26u, 5u, 29u, - 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 1u, 16u, 5u, 29u, 5u, 29u, - 0 + 2u, 8u, 2u, 6u, 2u, 8u, 2u, 6u, + 0u, 0u, 2u, 6u, 2u, 8u, 2u, 6u, + 2u, 8u, 2u, 6u, 2u, 6u, 2u, 8u, + 2u, 6u, 0u, 0u, 2u, 6u, 2u, 8u, + 2u, 6u, 2u, 8u, 2u, 6u, 2u, 8u, + 0u, 11u, 2u, 11u, 2u, 11u, 2u, 11u, + 7u, 7u, 2u, 7u, 2u, 11u, 2u, 11u, + 2u, 11u, 0u, 0u, 2u, 8u, 2u, 11u, + 2u, 11u, 7u, 7u, 2u, 7u, 2u, 11u, + 2u, 11u, 0u, 0u, 2u, 11u, 2u, 11u, + 0u }; -static const char _khmer_syllable_machine_key_spans[] = { - 22, 17, 22, 17, 16, 17, 22, 17, - 22, 17, 17, 22, 17, 16, 17, 22, - 17, 22, 17, 22, 29, 25, 25, 25, - 1, 18, 25, 25, 25, 16, 22, 25, - 25, 1, 18, 25, 25, 16, 25, 25 +static const signed char _khmer_syllable_machine_char_class[] = { + 0, 0, 1, 1, 2, 2, 1, 1, + 1, 1, 3, 3, 1, 4, 1, 0, + 1, 1, 1, 5, 6, 7, 1, 1, + 1, 8, 9, 10, 11, 0 }; static const short _khmer_syllable_machine_index_offsets[] = { - 0, 23, 41, 64, 82, 99, 117, 140, - 158, 181, 199, 217, 240, 258, 275, 293, - 316, 334, 357, 375, 398, 428, 454, 480, - 506, 508, 527, 553, 579, 605, 622, 645, - 671, 697, 699, 718, 744, 770, 787, 813 + 0, 7, 12, 19, 24, 25, 30, 37, + 42, 49, 54, 59, 66, 71, 72, 77, + 84, 89, 96, 101, 108, 120, 130, 140, + 150, 151, 157, 167, 177, 187, 188, 195, + 205, 215, 216, 222, 232, 242, 243, 253, + 0 }; -static const char _khmer_syllable_machine_indicies[] = { - 1, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 2, - 3, 0, 0, 0, 0, 4, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 3, - 0, 1, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 3, 0, 0, 0, 0, 4, 0, - 5, 5, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 4, 0, 6, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 6, 0, 7, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 8, 0, 9, 9, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 10, 0, 0, - 0, 0, 4, 0, 9, 9, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 10, 0, 11, 11, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 12, 0, - 0, 0, 0, 4, 0, 11, 11, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 12, 0, 14, - 14, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 15, - 13, 14, 14, 16, 16, 
16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 15, 16, 16, 16, 16, 17, 16, - 18, 18, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 17, 16, 19, 19, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 19, 16, 20, 20, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 21, 16, 22, 22, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 23, 16, 16, - 16, 16, 17, 16, 22, 22, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 23, 16, 24, 24, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 25, 16, - 16, 16, 16, 17, 16, 24, 24, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 25, 16, 14, - 14, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 26, 15, - 16, 16, 16, 16, 17, 16, 28, 28, - 27, 27, 29, 29, 27, 27, 27, 27, - 2, 2, 27, 30, 27, 28, 27, 27, - 27, 27, 15, 19, 27, 27, 27, 17, - 23, 25, 21, 27, 32, 32, 31, 31, - 31, 31, 31, 31, 31, 33, 31, 31, - 31, 31, 31, 2, 3, 6, 31, 31, - 31, 4, 10, 12, 8, 31, 34, 34, - 31, 31, 31, 31, 31, 31, 31, 35, - 31, 31, 31, 31, 31, 31, 3, 6, - 31, 31, 31, 4, 10, 12, 8, 31, - 5, 5, 31, 31, 31, 31, 31, 31, - 31, 35, 31, 31, 31, 31, 31, 31, - 4, 6, 31, 31, 31, 31, 31, 31, - 8, 31, 6, 31, 7, 7, 31, 31, - 31, 31, 31, 31, 31, 35, 31, 31, - 31, 31, 31, 31, 8, 6, 31, 36, - 36, 31, 31, 31, 31, 31, 31, 31, - 35, 31, 31, 31, 31, 31, 31, 10, - 6, 31, 31, 31, 4, 31, 31, 8, - 31, 37, 37, 31, 31, 31, 31, 31, - 31, 31, 35, 31, 31, 31, 31, 31, - 31, 12, 6, 31, 31, 31, 4, 10, - 31, 8, 31, 34, 34, 31, 31, 31, - 31, 31, 31, 31, 33, 31, 31, 31, - 31, 31, 31, 3, 6, 31, 31, 31, - 4, 10, 12, 8, 31, 28, 28, 31, - 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 28, 31, 14, 14, - 38, 38, 38, 38, 38, 38, 38, 38, - 38, 38, 38, 38, 38, 38, 15, 38, - 38, 38, 38, 17, 38, 40, 40, 39, - 39, 39, 39, 39, 39, 39, 41, 39, - 39, 39, 39, 39, 39, 15, 19, 39, - 39, 39, 17, 23, 25, 21, 39, 18, - 18, 39, 39, 39, 39, 39, 39, 39, - 41, 39, 39, 39, 39, 39, 39, 17, - 19, 39, 39, 39, 39, 39, 39, 21, - 39, 19, 39, 20, 20, 39, 39, 39, - 39, 39, 39, 39, 41, 39, 39, 39, - 39, 39, 39, 21, 19, 39, 42, 42, - 39, 39, 39, 39, 39, 39, 39, 41, - 39, 39, 39, 39, 39, 39, 23, 19, - 39, 39, 39, 17, 39, 39, 21, 39, - 43, 43, 39, 39, 39, 39, 39, 39, - 39, 41, 39, 39, 39, 39, 39, 39, - 25, 19, 39, 39, 39, 17, 23, 39, - 21, 39, 44, 44, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, - 39, 44, 39, 45, 45, 39, 39, 39, - 39, 39, 39, 39, 30, 39, 39, 39, - 39, 39, 26, 15, 19, 39, 39, 39, - 17, 23, 25, 21, 39, 40, 40, 39, - 39, 39, 39, 39, 39, 39, 30, 39, - 39, 39, 39, 39, 39, 15, 19, 39, - 39, 39, 17, 23, 25, 21, 39, 0 +static const signed char _khmer_syllable_machine_indicies[] = { + 1, 0, 0, 2, 3, 0, 4, 1, + 0, 0, 0, 3, 1, 0, 0, 0, + 3, 0, 4, 5, 0, 0, 0, 4, + 6, 7, 0, 0, 0, 8, 9, 0, + 0, 0, 10, 0, 4, 9, 0, 0, + 0, 10, 11, 0, 0, 0, 12, 0, + 4, 11, 0, 0, 0, 12, 14, 13, + 13, 13, 15, 14, 16, 16, 16, 15, + 16, 17, 18, 16, 16, 16, 17, 19, + 20, 16, 16, 16, 21, 22, 16, 16, + 16, 23, 16, 17, 22, 16, 16, 16, + 23, 24, 16, 16, 16, 25, 16, 17, + 24, 16, 16, 16, 25, 14, 16, 16, + 26, 15, 16, 17, 29, 28, 30, 2, + 31, 28, 15, 19, 17, 23, 25, 21, + 33, 32, 34, 2, 3, 6, 4, 10, + 12, 8, 35, 32, 36, 32, 3, 6, + 4, 10, 12, 8, 5, 32, 36, 32, + 4, 6, 32, 32, 32, 8, 6, 7, + 32, 36, 32, 8, 6, 37, 32, 36, + 32, 10, 6, 4, 32, 32, 8, 38, + 32, 36, 32, 12, 6, 4, 10, 32, + 8, 35, 32, 34, 32, 3, 6, 4, + 10, 12, 8, 29, 14, 39, 39, 39, + 15, 39, 17, 41, 40, 42, 40, 15, + 19, 17, 23, 25, 21, 18, 40, 42, + 40, 17, 19, 40, 40, 40, 21, 19, + 20, 40, 42, 40, 21, 19, 43, 40, + 42, 40, 23, 
19, 17, 40, 40, 21, + 44, 40, 42, 40, 25, 19, 17, 23, + 40, 21, 45, 46, 40, 31, 26, 15, + 19, 17, 23, 25, 21, 41, 40, 31, + 40, 15, 19, 17, 23, 25, 21, 0 }; -static const char _khmer_syllable_machine_trans_targs[] = { - 20, 1, 28, 22, 23, 3, 24, 5, - 25, 7, 26, 9, 27, 20, 10, 31, - 20, 32, 12, 33, 14, 34, 16, 35, - 18, 36, 39, 20, 21, 30, 37, 20, - 0, 29, 2, 4, 6, 8, 20, 20, - 11, 13, 15, 17, 38, 19 +static const signed char _khmer_syllable_machine_index_defaults[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 13, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 28, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 39, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 0 }; -static const char _khmer_syllable_machine_trans_actions[] = { - 1, 0, 2, 2, 2, 0, 0, 0, - 2, 0, 2, 0, 2, 3, 0, 4, - 5, 2, 0, 0, 0, 2, 0, 2, - 0, 2, 4, 8, 2, 9, 0, 10, - 0, 0, 0, 0, 0, 0, 11, 12, - 0, 0, 0, 0, 4, 0 +static const signed char _khmer_syllable_machine_cond_targs[] = { + 20, 1, 28, 22, 23, 3, 24, 5, + 25, 7, 26, 9, 27, 20, 10, 31, + 20, 32, 12, 33, 14, 34, 16, 35, + 18, 36, 39, 20, 20, 21, 30, 37, + 20, 0, 29, 2, 4, 6, 8, 20, + 20, 11, 13, 15, 17, 38, 19, 0 }; -static const char _khmer_syllable_machine_to_state_actions[] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 6, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 +static const signed char _khmer_syllable_machine_cond_actions[] = { + 1, 0, 2, 2, 2, 0, 0, 0, + 2, 0, 2, 0, 2, 3, 0, 4, + 5, 2, 0, 0, 0, 2, 0, 2, + 0, 2, 4, 0, 8, 2, 9, 0, + 10, 0, 0, 0, 0, 0, 0, 11, + 12, 0, 0, 0, 0, 4, 0, 0 }; -static const char _khmer_syllable_machine_from_state_actions[] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 +static const signed char _khmer_syllable_machine_to_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0 }; -static const unsigned char _khmer_syllable_machine_eof_trans[] = { - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 14, 17, 17, 17, 17, 17, - 17, 17, 17, 17, 0, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 39, 40, - 40, 40, 40, 40, 40, 40, 40, 40 +static const signed char _khmer_syllable_machine_from_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0 +}; + +static const signed char _khmer_syllable_machine_eof_trans[] = { + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 14, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 28, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 40, 41, + 41, 41, 41, 41, 41, 41, 41, 41, + 0 }; static const int khmer_syllable_machine_start = 20; @@ -215,156 +183,271 @@ static const int khmer_syllable_machine_error = -1; static const int khmer_syllable_machine_en_main = 20; -#line 36 "hb-ot-shape-complex-khmer-machine.rl" +#line 43 "hb-ot-shape-complex-khmer-machine.rl" -#line 80 "hb-ot-shape-complex-khmer-machine.rl" +#line 86 "hb-ot-shape-complex-khmer-machine.rl" #define found_syllable(syllable_type) \ - HB_STMT_START { \ - if (0) fprintf (stderr, "syllable %d..%d %s\n", ts, te, #syllable_type); \ - for (unsigned int i = ts; i < te; i++) \ - info[i].syllable() = (syllable_serial << 4) | khmer_##syllable_type; \ - syllable_serial++; \ - if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ - } HB_STMT_END +HB_STMT_START { \ + if (0) fprintf (stderr, "syllable %d..%d %s\n", ts, te, #syllable_type); \ + for (unsigned int i = ts; i < te; i++) \ + info[i].syllable() = (syllable_serial << 4) | 
syllable_type; \ + syllable_serial++; \ + if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ + } HB_STMT_END static void find_syllables_khmer (hb_buffer_t *buffer) { - unsigned int p, pe, eof, ts, te, act HB_UNUSED; - int cs; - hb_glyph_info_t *info = buffer->info; - -#line 242 "hb-ot-shape-complex-khmer-machine.hh" + unsigned int p, pe, eof, ts, te, act HB_UNUSED; + int cs; + hb_glyph_info_t *info = buffer->info; + +#line 210 "hb-ot-shape-complex-khmer-machine.hh" { - cs = khmer_syllable_machine_start; - ts = 0; - te = 0; - act = 0; + cs = (int)khmer_syllable_machine_start; + ts = 0; + te = 0; + act = 0; } - -#line 100 "hb-ot-shape-complex-khmer-machine.rl" - - - p = 0; - pe = eof = buffer->len; - - unsigned int syllable_serial = 1; - -#line 258 "hb-ot-shape-complex-khmer-machine.hh" + +#line 106 "hb-ot-shape-complex-khmer-machine.rl" + + + p = 0; + pe = eof = buffer->len; + + unsigned int syllable_serial = 1; + +#line 226 "hb-ot-shape-complex-khmer-machine.hh" { - int _slen; - int _trans; - const unsigned char *_keys; - const char *_inds; - if ( p == pe ) - goto _test_eof; -_resume: - switch ( _khmer_syllable_machine_from_state_actions[cs] ) { - case 7: + unsigned int _trans = 0; + const unsigned char * _keys; + const signed char * _inds; + int _ic; + _resume: {} + if ( p == pe && p != eof ) + goto _out; + switch ( _khmer_syllable_machine_from_state_actions[cs] ) { + case 7: { + { #line 1 "NONE" - {ts = p;} - break; -#line 272 "hb-ot-shape-complex-khmer-machine.hh" - } - - _keys = _khmer_syllable_machine_trans_keys + (cs<<1); - _inds = _khmer_syllable_machine_indicies + _khmer_syllable_machine_index_offsets[cs]; - - _slen = _khmer_syllable_machine_key_spans[cs]; - _trans = _inds[ _slen > 0 && _keys[0] <=( info[p].khmer_category()) && - ( info[p].khmer_category()) <= _keys[1] ? 
- ( info[p].khmer_category()) - _keys[0] : _slen ]; - -_eof_trans: - cs = _khmer_syllable_machine_trans_targs[_trans]; - - if ( _khmer_syllable_machine_trans_actions[_trans] == 0 ) - goto _again; - - switch ( _khmer_syllable_machine_trans_actions[_trans] ) { - case 2: + {ts = p;}} + +#line 241 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + } + + if ( p == eof ) { + if ( _khmer_syllable_machine_eof_trans[cs] > 0 ) { + _trans = (unsigned int)_khmer_syllable_machine_eof_trans[cs] - 1; + } + } + else { + _keys = ( _khmer_syllable_machine_trans_keys + ((cs<<1))); + _inds = ( _khmer_syllable_machine_indicies + (_khmer_syllable_machine_index_offsets[cs])); + + if ( (info[p].khmer_category()) <= 29 && (info[p].khmer_category()) >= 1 ) { + _ic = (int)_khmer_syllable_machine_char_class[(int)(info[p].khmer_category()) - 1]; + if ( _ic <= (int)(*( _keys+1)) && _ic >= (int)(*( _keys)) ) + _trans = (unsigned int)(*( _inds + (int)( _ic - (int)(*( _keys)) ) )); + else + _trans = (unsigned int)_khmer_syllable_machine_index_defaults[cs]; + } + else { + _trans = (unsigned int)_khmer_syllable_machine_index_defaults[cs]; + } + + } + cs = (int)_khmer_syllable_machine_cond_targs[_trans]; + + if ( _khmer_syllable_machine_cond_actions[_trans] != 0 ) { + + switch ( _khmer_syllable_machine_cond_actions[_trans] ) { + case 2: { + { #line 1 "NONE" - {te = p+1;} - break; - case 8: -#line 76 "hb-ot-shape-complex-khmer-machine.rl" - {te = p+1;{ found_syllable (non_khmer_cluster); }} - break; - case 10: -#line 74 "hb-ot-shape-complex-khmer-machine.rl" - {te = p;p--;{ found_syllable (consonant_syllable); }} - break; - case 12: -#line 75 "hb-ot-shape-complex-khmer-machine.rl" - {te = p;p--;{ found_syllable (broken_cluster); }} - break; - case 11: -#line 76 "hb-ot-shape-complex-khmer-machine.rl" - {te = p;p--;{ found_syllable (non_khmer_cluster); }} - break; - case 1: -#line 74 "hb-ot-shape-complex-khmer-machine.rl" - {{p = ((te))-1;}{ found_syllable (consonant_syllable); }} - break; - case 5: -#line 75 "hb-ot-shape-complex-khmer-machine.rl" - {{p = ((te))-1;}{ found_syllable (broken_cluster); }} - break; - case 3: + {te = p+1;}} + +#line 279 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 8: { + { +#line 82 "hb-ot-shape-complex-khmer-machine.rl" + {te = p+1;{ +#line 82 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_non_khmer_cluster); } + }} + +#line 292 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 10: { + { +#line 80 "hb-ot-shape-complex-khmer-machine.rl" + {te = p;p = p - 1;{ +#line 80 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_consonant_syllable); } + }} + +#line 305 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 12: { + { +#line 81 "hb-ot-shape-complex-khmer-machine.rl" + {te = p;p = p - 1;{ +#line 81 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_broken_cluster); } + }} + +#line 318 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 11: { + { +#line 82 "hb-ot-shape-complex-khmer-machine.rl" + {te = p;p = p - 1;{ +#line 82 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_non_khmer_cluster); } + }} + +#line 331 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 1: { + { +#line 80 "hb-ot-shape-complex-khmer-machine.rl" + {p = ((te))-1; + { +#line 80 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_consonant_syllable); } + }} + +#line 345 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 5: { + { +#line 81 "hb-ot-shape-complex-khmer-machine.rl" + {p 
= ((te))-1; + { +#line 81 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_broken_cluster); } + }} + +#line 359 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 3: { + { #line 1 "NONE" - { switch( act ) { - case 2: - {{p = ((te))-1;} found_syllable (broken_cluster); } - break; - case 3: - {{p = ((te))-1;} found_syllable (non_khmer_cluster); } - break; - } - } - break; - case 4: + {switch( act ) { + case 2: { + p = ((te))-1; + { +#line 81 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_broken_cluster); } + break; + } + case 3: { + p = ((te))-1; + { +#line 82 "hb-ot-shape-complex-khmer-machine.rl" + found_syllable (khmer_non_khmer_cluster); } + break; + } + }} + } + +#line 385 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 4: { + { #line 1 "NONE" - {te = p+1;} -#line 75 "hb-ot-shape-complex-khmer-machine.rl" - {act = 2;} - break; - case 9: + {te = p+1;}} + +#line 395 "hb-ot-shape-complex-khmer-machine.hh" + + { +#line 81 "hb-ot-shape-complex-khmer-machine.rl" + {act = 2;}} + +#line 401 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + case 9: { + { #line 1 "NONE" - {te = p+1;} -#line 76 "hb-ot-shape-complex-khmer-machine.rl" - {act = 3;} - break; -#line 342 "hb-ot-shape-complex-khmer-machine.hh" - } - -_again: - switch ( _khmer_syllable_machine_to_state_actions[cs] ) { - case 6: + {te = p+1;}} + +#line 411 "hb-ot-shape-complex-khmer-machine.hh" + + { +#line 82 "hb-ot-shape-complex-khmer-machine.rl" + {act = 3;}} + +#line 417 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + } + + } + + if ( p == eof ) { + if ( cs >= 20 ) + goto _out; + } + else { + switch ( _khmer_syllable_machine_to_state_actions[cs] ) { + case 6: { + { #line 1 "NONE" - {ts = 0;} - break; -#line 351 "hb-ot-shape-complex-khmer-machine.hh" - } - - if ( ++p != pe ) - goto _resume; - _test_eof: {} - if ( p == eof ) - { - if ( _khmer_syllable_machine_eof_trans[cs] > 0 ) { - _trans = _khmer_syllable_machine_eof_trans[cs] - 1; - goto _eof_trans; - } + {ts = 0;}} + +#line 437 "hb-ot-shape-complex-khmer-machine.hh" + + + break; + } + } + + p += 1; + goto _resume; + } + _out: {} } - - } - -#line 108 "hb-ot-shape-complex-khmer-machine.rl" - + +#line 114 "hb-ot-shape-complex-khmer-machine.rl" + } #undef found_syllable diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.rl b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.rl deleted file mode 100644 index e7f14533dd..0000000000 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer-machine.rl +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright © 2011,2012 Google, Inc. - * - * This is part of HarfBuzz, a text shaping library. - * - * Permission is hereby granted, without written agreement and without - * license or royalty fees, to use, copy, modify, and distribute this - * software and its documentation for any purpose, provided that the - * above copyright notice and the following two paragraphs appear in - * all copies of this software. - * - * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR - * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES - * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN - * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED HEREUNDER IS - * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO - * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - * - * Google Author(s): Behdad Esfahbod - */ - -#ifndef HB_OT_SHAPE_COMPLEX_KHMER_MACHINE_HH -#define HB_OT_SHAPE_COMPLEX_KHMER_MACHINE_HH - -#include "hb.hh" - -%%{ - machine khmer_syllable_machine; - alphtype unsigned char; - write data; -}%% - -%%{ - -# Same order as enum khmer_category_t. Not sure how to avoid duplication. -C = 1; -V = 2; -ZWNJ = 5; -ZWJ = 6; -PLACEHOLDER = 11; -DOTTEDCIRCLE = 12; -Coeng= 14; -Ra = 16; -Robatic = 20; -Xgroup = 21; -Ygroup = 22; -VAbv = 26; -VBlw = 27; -VPre = 28; -VPst = 29; - -c = (C | Ra | V); -cn = c.((ZWJ|ZWNJ)?.Robatic)?; -joiner = (ZWJ | ZWNJ); -xgroup = (joiner*.Xgroup)*; -ygroup = Ygroup*; - -# This grammar was experimentally extracted from what Uniscribe allows. - -matra_group = VPre? xgroup VBlw? xgroup (joiner?.VAbv)? xgroup VPst?; -syllable_tail = xgroup matra_group xgroup (Coeng.c)? ygroup; - - -broken_cluster = (Coeng.cn)* (Coeng | syllable_tail); -consonant_syllable = (cn|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster; -other = any; - -main := |* - consonant_syllable => { found_syllable (consonant_syllable); }; - broken_cluster => { found_syllable (broken_cluster); }; - other => { found_syllable (non_khmer_cluster); }; -*|; - - -}%% - -#define found_syllable(syllable_type) \ - HB_STMT_START { \ - if (0) fprintf (stderr, "syllable %d..%d %s\n", ts, te, #syllable_type); \ - for (unsigned int i = ts; i < te; i++) \ - info[i].syllable() = (syllable_serial << 4) | khmer_##syllable_type; \ - syllable_serial++; \ - if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ - } HB_STMT_END - -static void -find_syllables_khmer (hb_buffer_t *buffer) -{ - unsigned int p, pe, eof, ts, te, act HB_UNUSED; - int cs; - hb_glyph_info_t *info = buffer->info; - %%{ - write init; - getkey info[p].khmer_category(); - }%% - - p = 0; - pe = eof = buffer->len; - - unsigned int syllable_serial = 1; - %%{ - write exec; - }%% -} - -#undef found_syllable - -#endif /* HB_OT_SHAPE_COMPLEX_KHMER_MACHINE_HH */ diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.cc index d6fcd7c814..dddba142a3 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.cc @@ -29,6 +29,7 @@ #ifndef HB_NO_OT_SHAPE #include "hb-ot-shape-complex-khmer.hh" +#include "hb-ot-shape-complex-khmer-machine.hh" #include "hb-ot-layout.hh" @@ -140,27 +141,6 @@ override_features_khmer (hb_ot_shape_planner_t *plan) struct khmer_shape_plan_t { - bool get_virama_glyph (hb_font_t *font, hb_codepoint_t *pglyph) const - { - hb_codepoint_t glyph = virama_glyph; - if (unlikely (virama_glyph == (hb_codepoint_t) -1)) - { - if (!font->get_nominal_glyph (0x17D2u, &glyph)) - glyph = 0; - /* Technically speaking, the spec says we should apply 'locl' to virama too. - * Maybe one day... */ - - /* Our get_nominal_glyph() function needs a font, so we can't get the virama glyph - * during shape planning... Instead, overwrite it here. It's safe. Don't worry! 
*/ - virama_glyph = glyph; - } - - *pglyph = glyph; - return glyph != 0; - } - - mutable hb_codepoint_t virama_glyph; - hb_mask_t mask_array[KHMER_NUM_FEATURES]; }; @@ -171,8 +151,6 @@ data_create_khmer (const hb_ot_shape_plan_t *plan) if (unlikely (!khmer_plan)) return nullptr; - khmer_plan->virama_glyph = (hb_codepoint_t) -1; - for (unsigned int i = 0; i < ARRAY_LENGTH (khmer_plan->mask_array); i++) khmer_plan->mask_array[i] = (khmer_features[i].flags & F_GLOBAL) ? 0 : plan->map.get_1_mask (khmer_features[i].tag); @@ -186,15 +164,6 @@ data_destroy_khmer (void *data) free (data); } - -enum khmer_syllable_type_t { - khmer_consonant_syllable, - khmer_broken_cluster, - khmer_non_khmer_cluster, -}; - -#include "hb-ot-shape-complex-khmer-machine.hh" - static void setup_masks_khmer (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_buffer_t *buffer, @@ -321,76 +290,17 @@ reorder_syllable_khmer (const hb_ot_shape_plan_t *plan, } } -static inline void -insert_dotted_circles_khmer (const hb_ot_shape_plan_t *plan HB_UNUSED, - hb_font_t *font, - hb_buffer_t *buffer) -{ - if (unlikely (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)) - return; - - /* Note: This loop is extra overhead, but should not be measurable. - * TODO Use a buffer scratch flag to remove the loop. */ - bool has_broken_syllables = false; - unsigned int count = buffer->len; - hb_glyph_info_t *info = buffer->info; - for (unsigned int i = 0; i < count; i++) - if ((info[i].syllable() & 0x0F) == khmer_broken_cluster) - { - has_broken_syllables = true; - break; - } - if (likely (!has_broken_syllables)) - return; - - - hb_codepoint_t dottedcircle_glyph; - if (!font->get_nominal_glyph (0x25CCu, &dottedcircle_glyph)) - return; - - hb_glyph_info_t dottedcircle = {0}; - dottedcircle.codepoint = 0x25CCu; - set_khmer_properties (dottedcircle); - dottedcircle.codepoint = dottedcircle_glyph; - - buffer->clear_output (); - - buffer->idx = 0; - unsigned int last_syllable = 0; - while (buffer->idx < buffer->len && buffer->successful) - { - unsigned int syllable = buffer->cur().syllable(); - khmer_syllable_type_t syllable_type = (khmer_syllable_type_t) (syllable & 0x0F); - if (unlikely (last_syllable != syllable && syllable_type == khmer_broken_cluster)) - { - last_syllable = syllable; - - hb_glyph_info_t ginfo = dottedcircle; - ginfo.cluster = buffer->cur().cluster; - ginfo.mask = buffer->cur().mask; - ginfo.syllable() = buffer->cur().syllable(); - - /* Insert dottedcircle after possible Repha. 
*/ - while (buffer->idx < buffer->len && buffer->successful && - last_syllable == buffer->cur().syllable() && - buffer->cur().khmer_category() == OT_Repha) - buffer->next_glyph (); - - buffer->output_info (ginfo); - } - else - buffer->next_glyph (); - } - buffer->swap_buffers (); -} - static void reorder_khmer (const hb_ot_shape_plan_t *plan, hb_font_t *font, hb_buffer_t *buffer) { - if (buffer->message (font, "start reordering khmer")) { - insert_dotted_circles_khmer (plan, font, buffer); + if (buffer->message (font, "start reordering khmer")) + { + hb_syllabic_insert_dotted_circles (font, buffer, + khmer_broken_cluster, + OT_DOTTEDCIRCLE, + OT_Repha); foreach_syllable (buffer, start, end) reorder_syllable_khmer (plan, font->face, buffer, start, end); diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.hh index 11a77bfd4b..e24d68a8b5 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-khmer.hh @@ -54,7 +54,7 @@ set_khmer_properties (hb_glyph_info_t &info) { hb_codepoint_t u = info.codepoint; unsigned int type = hb_indic_get_categories (u); - khmer_category_t cat = (khmer_category_t) (type & 0x7Fu); + khmer_category_t cat = (khmer_category_t) (type & 0xFFu); indic_position_t pos = (indic_position_t) (type >> 8); diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-machine-index.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-machine-index.hh deleted file mode 100644 index 9ec1f3eb7c..0000000000 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-machine-index.hh +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright © 2019,2020 David Corbett - * - * This is part of HarfBuzz, a text shaping library. - * - * Permission is hereby granted, without written agreement and without - * license or royalty fees, to use, copy, modify, and distribute this - * software and its documentation for any purpose, provided that the - * above copyright notice and the following two paragraphs appear in - * all copies of this software. - * - * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR - * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES - * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN - * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS - * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO - * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- */ - -#ifndef HB_OT_SHAPE_COMPLEX_MACHINE_INDEX_HH -#define HB_OT_SHAPE_COMPLEX_MACHINE_INDEX_HH - -#include "hb.hh" - - -template <typename Iter> -struct machine_index_t : - hb_iter_with_fallback_t<machine_index_t<Iter>, - typename Iter::item_t> -{ - machine_index_t (const Iter& it) : it (it) {} - machine_index_t (const machine_index_t& o) : it (o.it) {} - - static constexpr bool is_random_access_iterator = Iter::is_random_access_iterator; - static constexpr bool is_sorted_iterator = Iter::is_sorted_iterator; - - typename Iter::item_t __item__ () const { return *it; } - typename Iter::item_t __item_at__ (unsigned i) const { return it[i]; } - unsigned __len__ () const { return it.len (); } - void __next__ () { ++it; } - void __forward__ (unsigned n) { it += n; } - void __prev__ () { --it; } - void __rewind__ (unsigned n) { it -= n; } - void operator = (unsigned n) - { unsigned index = (*it).first; if (index < n) it += n - index; else if (index > n) it -= index - n; } - void operator = (const machine_index_t& o) { *this = (*o.it).first; } - bool operator == (const machine_index_t& o) const { return (*it).first == (*o.it).first; } - bool operator != (const machine_index_t& o) const { return !(*this == o); } - - private: - Iter it; -}; -struct -{ - template <typename Iter, - hb_requires (hb_is_iterable (Iter))> - machine_index_t<hb_iter_type<Iter>> - operator () (Iter&& it) const - { return machine_index_t<hb_iter_type<Iter>> (hb_iter (it)); } -} -HB_FUNCOBJ (machine_index); - - -#endif /* HB_OT_SHAPE_COMPLEX_MACHINE_INDEX_HH */ diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar-machine.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar-machine.hh index c2f4c0045c..c09497896d 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar-machine.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar-machine.hh @@ -31,8 +31,43 @@ #include "hb.hh" +enum myanmar_syllable_type_t { + myanmar_consonant_syllable, + myanmar_punctuation_cluster, + myanmar_broken_cluster, + myanmar_non_myanmar_cluster, +}; + -#line 36 "hb-ot-shape-complex-myanmar-machine.hh" +#line 43 "hb-ot-shape-complex-myanmar-machine.hh" +#define myanmar_syllable_machine_ex_A 10u +#define myanmar_syllable_machine_ex_As 18u +#define myanmar_syllable_machine_ex_C 1u +#define myanmar_syllable_machine_ex_CS 19u +#define myanmar_syllable_machine_ex_D 32u +#define myanmar_syllable_machine_ex_D0 20u +#define myanmar_syllable_machine_ex_DB 3u +#define myanmar_syllable_machine_ex_GB 11u +#define myanmar_syllable_machine_ex_H 4u +#define myanmar_syllable_machine_ex_IV 2u +#define myanmar_syllable_machine_ex_MH 21u +#define myanmar_syllable_machine_ex_MR 22u +#define myanmar_syllable_machine_ex_MW 23u +#define myanmar_syllable_machine_ex_MY 24u +#define myanmar_syllable_machine_ex_P 31u +#define myanmar_syllable_machine_ex_PT 25u +#define myanmar_syllable_machine_ex_Ra 16u +#define myanmar_syllable_machine_ex_V 8u +#define myanmar_syllable_machine_ex_VAbv 26u +#define myanmar_syllable_machine_ex_VBlw 27u +#define myanmar_syllable_machine_ex_VPre 28u +#define myanmar_syllable_machine_ex_VPst 29u +#define myanmar_syllable_machine_ex_VS 30u +#define myanmar_syllable_machine_ex_ZWJ 6u +#define myanmar_syllable_machine_ex_ZWNJ 5u + + +#line 71 "hb-ot-shape-complex-myanmar-machine.hh" static const unsigned char _myanmar_syllable_machine_trans_keys[] = { 1u, 32u, 3u, 30u, 5u, 29u, 5u, 8u, 5u, 29u, 3u, 25u, 5u, 25u, 5u, 25u, 3u, 29u, 3u, 29u, 3u, 29u, 3u, 29u, 1u, 16u, 3u, 29u, 3u, 29u, 3u, 29u, @@ -293,18 +328,18 @@ 
static const int myanmar_syllable_machine_error = -1; static const int myanmar_syllable_machine_en_main = 0; -#line 36 "hb-ot-shape-complex-myanmar-machine.rl" +#line 44 "hb-ot-shape-complex-myanmar-machine.rl" -#line 94 "hb-ot-shape-complex-myanmar-machine.rl" +#line 101 "hb-ot-shape-complex-myanmar-machine.rl" #define found_syllable(syllable_type) \ HB_STMT_START { \ if (0) fprintf (stderr, "syllable %d..%d %s\n", ts, te, #syllable_type); \ for (unsigned int i = ts; i < te; i++) \ - info[i].syllable() = (syllable_serial << 4) | myanmar_##syllable_type; \ + info[i].syllable() = (syllable_serial << 4) | syllable_type; \ syllable_serial++; \ if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ } HB_STMT_END @@ -316,7 +351,7 @@ find_syllables_myanmar (hb_buffer_t *buffer) int cs; hb_glyph_info_t *info = buffer->info; -#line 320 "hb-ot-shape-complex-myanmar-machine.hh" +#line 355 "hb-ot-shape-complex-myanmar-machine.hh" { cs = myanmar_syllable_machine_start; ts = 0; @@ -324,7 +359,7 @@ find_syllables_myanmar (hb_buffer_t *buffer) act = 0; } -#line 114 "hb-ot-shape-complex-myanmar-machine.rl" +#line 121 "hb-ot-shape-complex-myanmar-machine.rl" p = 0; @@ -332,7 +367,7 @@ find_syllables_myanmar (hb_buffer_t *buffer) unsigned int syllable_serial = 1; -#line 336 "hb-ot-shape-complex-myanmar-machine.hh" +#line 371 "hb-ot-shape-complex-myanmar-machine.hh" { int _slen; int _trans; @@ -346,7 +381,7 @@ _resume: #line 1 "NONE" {ts = p;} break; -#line 350 "hb-ot-shape-complex-myanmar-machine.hh" +#line 385 "hb-ot-shape-complex-myanmar-machine.hh" } _keys = _myanmar_syllable_machine_trans_keys + (cs<<1); @@ -365,38 +400,38 @@ _eof_trans: switch ( _myanmar_syllable_machine_trans_actions[_trans] ) { case 6: -#line 86 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p+1;{ found_syllable (consonant_syllable); }} +#line 93 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p+1;{ found_syllable (myanmar_consonant_syllable); }} break; case 4: -#line 87 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p+1;{ found_syllable (non_myanmar_cluster); }} +#line 94 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p+1;{ found_syllable (myanmar_non_myanmar_cluster); }} break; case 10: -#line 88 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p+1;{ found_syllable (punctuation_cluster); }} +#line 95 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p+1;{ found_syllable (myanmar_punctuation_cluster); }} break; case 8: -#line 89 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p+1;{ found_syllable (broken_cluster); }} +#line 96 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p+1;{ found_syllable (myanmar_broken_cluster); }} break; case 3: -#line 90 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p+1;{ found_syllable (non_myanmar_cluster); }} +#line 97 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p+1;{ found_syllable (myanmar_non_myanmar_cluster); }} break; case 5: -#line 86 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p;p--;{ found_syllable (consonant_syllable); }} +#line 93 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p;p--;{ found_syllable (myanmar_consonant_syllable); }} break; case 7: -#line 89 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p;p--;{ found_syllable (broken_cluster); }} +#line 96 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p;p--;{ found_syllable (myanmar_broken_cluster); }} break; case 9: -#line 90 "hb-ot-shape-complex-myanmar-machine.rl" - {te = p;p--;{ found_syllable (non_myanmar_cluster); }} +#line 97 "hb-ot-shape-complex-myanmar-machine.rl" + {te = p;p--;{ found_syllable 
(myanmar_non_myanmar_cluster); }} break; -#line 400 "hb-ot-shape-complex-myanmar-machine.hh" +#line 435 "hb-ot-shape-complex-myanmar-machine.hh" } _again: @@ -405,7 +440,7 @@ _again: #line 1 "NONE" {ts = 0;} break; -#line 409 "hb-ot-shape-complex-myanmar-machine.hh" +#line 444 "hb-ot-shape-complex-myanmar-machine.hh" } if ( ++p != pe ) @@ -421,7 +456,7 @@ _again: } -#line 122 "hb-ot-shape-complex-myanmar-machine.rl" +#line 129 "hb-ot-shape-complex-myanmar-machine.rl" } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.cc index fe096ef28a..bc5dcb904c 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.cc @@ -29,6 +29,7 @@ #ifndef HB_NO_OT_SHAPE #include "hb-ot-shape-complex-myanmar.hh" +#include "hb-ot-shape-complex-myanmar-machine.hh" /* @@ -97,17 +98,6 @@ collect_features_myanmar (hb_ot_shape_planner_t *plan) map->enable_feature (myanmar_other_features[i], F_MANUAL_ZWJ); } - -enum myanmar_syllable_type_t { - myanmar_consonant_syllable, - myanmar_punctuation_cluster, - myanmar_broken_cluster, - myanmar_non_myanmar_cluster, -}; - -#include "hb-ot-shape-complex-myanmar-machine.hh" - - static void setup_masks_myanmar (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_buffer_t *buffer, @@ -265,70 +255,16 @@ reorder_syllable_myanmar (const hb_ot_shape_plan_t *plan HB_UNUSED, } } -static inline void -insert_dotted_circles_myanmar (const hb_ot_shape_plan_t *plan HB_UNUSED, - hb_font_t *font, - hb_buffer_t *buffer) -{ - if (unlikely (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)) - return; - - /* Note: This loop is extra overhead, but should not be measurable. - * TODO Use a buffer scratch flag to remove the loop. 
*/ - bool has_broken_syllables = false; - unsigned int count = buffer->len; - hb_glyph_info_t *info = buffer->info; - for (unsigned int i = 0; i < count; i++) - if ((info[i].syllable() & 0x0F) == myanmar_broken_cluster) - { - has_broken_syllables = true; - break; - } - if (likely (!has_broken_syllables)) - return; - - - hb_codepoint_t dottedcircle_glyph; - if (!font->get_nominal_glyph (0x25CCu, &dottedcircle_glyph)) - return; - - hb_glyph_info_t dottedcircle = {0}; - dottedcircle.codepoint = 0x25CCu; - set_myanmar_properties (dottedcircle); - dottedcircle.codepoint = dottedcircle_glyph; - - buffer->clear_output (); - - buffer->idx = 0; - unsigned int last_syllable = 0; - while (buffer->idx < buffer->len && buffer->successful) - { - unsigned int syllable = buffer->cur().syllable(); - myanmar_syllable_type_t syllable_type = (myanmar_syllable_type_t) (syllable & 0x0F); - if (unlikely (last_syllable != syllable && syllable_type == myanmar_broken_cluster)) - { - last_syllable = syllable; - - hb_glyph_info_t ginfo = dottedcircle; - ginfo.cluster = buffer->cur().cluster; - ginfo.mask = buffer->cur().mask; - ginfo.syllable() = buffer->cur().syllable(); - - buffer->output_info (ginfo); - } - else - buffer->next_glyph (); - } - buffer->swap_buffers (); -} - static void reorder_myanmar (const hb_ot_shape_plan_t *plan, hb_font_t *font, hb_buffer_t *buffer) { - if (buffer->message (font, "start reordering myanmar")) { - insert_dotted_circles_myanmar (plan, font, buffer); + if (buffer->message (font, "start reordering myanmar")) + { + hb_syllabic_insert_dotted_circles (font, buffer, + myanmar_broken_cluster, + OT_GB); foreach_syllable (buffer, start, end) reorder_syllable_myanmar (plan, font->face, buffer, start, end); diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.hh index 7b9821e6ba..a6d68aae57 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-myanmar.hh @@ -64,7 +64,7 @@ set_myanmar_properties (hb_glyph_info_t &info) { hb_codepoint_t u = info.codepoint; unsigned int type = hb_indic_get_categories (u); - unsigned int cat = type & 0x7Fu; + unsigned int cat = type & 0xFFu; indic_position_t pos = (indic_position_t) (type >> 8); /* Myanmar diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc new file mode 100644 index 0000000000..46509abee2 --- /dev/null +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc @@ -0,0 +1,100 @@ +/* + * Copyright © 2021 Behdad Esfahbod. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. + * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + */ + +#include "hb.hh" + +#ifndef HB_NO_OT_SHAPE + +#include "hb-ot-shape-complex-syllabic.hh" + + +void +hb_syllabic_insert_dotted_circles (hb_font_t *font, + hb_buffer_t *buffer, + unsigned int broken_syllable_type, + unsigned int dottedcircle_category, + int repha_category) +{ + if (unlikely (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)) + return; + + /* Note: This loop is extra overhead, but should not be measurable. + * TODO Use a buffer scratch flag to remove the loop. */ + bool has_broken_syllables = false; + unsigned int count = buffer->len; + hb_glyph_info_t *info = buffer->info; + for (unsigned int i = 0; i < count; i++) + if ((info[i].syllable() & 0x0F) == broken_syllable_type) + { + has_broken_syllables = true; + break; + } + if (likely (!has_broken_syllables)) + return; + + + hb_codepoint_t dottedcircle_glyph; + if (!font->get_nominal_glyph (0x25CCu, &dottedcircle_glyph)) + return; + + hb_glyph_info_t dottedcircle = {0}; + dottedcircle.codepoint = 0x25CCu; + dottedcircle.complex_var_u8_category() = dottedcircle_category; + dottedcircle.codepoint = dottedcircle_glyph; + + buffer->clear_output (); + + buffer->idx = 0; + unsigned int last_syllable = 0; + while (buffer->idx < buffer->len && buffer->successful) + { + unsigned int syllable = buffer->cur().syllable(); + if (unlikely (last_syllable != syllable && (syllable & 0x0F) == broken_syllable_type)) + { + last_syllable = syllable; + + hb_glyph_info_t ginfo = dottedcircle; + ginfo.cluster = buffer->cur().cluster; + ginfo.mask = buffer->cur().mask; + ginfo.syllable() = buffer->cur().syllable(); + + /* Insert dottedcircle after possible Repha. */ + if (repha_category != -1) + { + while (buffer->idx < buffer->len && buffer->successful && + last_syllable == buffer->cur().syllable() && + buffer->cur().complex_var_u8_category() == (unsigned) repha_category) + (void) buffer->next_glyph (); + } + + (void) buffer->output_info (ginfo); + } + else + (void) buffer->next_glyph (); + } + buffer->swap_buffers (); +} + + +#endif diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.hh new file mode 100644 index 0000000000..c80b8fee1d --- /dev/null +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.hh @@ -0,0 +1,41 @@ +/* + * Copyright © 2021 Behdad Esfahbod. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. + * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + */ + +#ifndef HB_OT_SHAPE_COMPLEX_SYLLABIC_HH +#define HB_OT_SHAPE_COMPLEX_SYLLABIC_HH + +#include "hb.hh" + +#include "hb-ot-shape-complex.hh" + + +HB_INTERNAL void +hb_syllabic_insert_dotted_circles (hb_font_t *font, + hb_buffer_t *buffer, + unsigned int broken_syllable_type, + unsigned int dottedcircle_category, + int repha_category = -1); + + +#endif /* HB_OT_SHAPE_COMPLEX_SYLLABIC_HH */ diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc index 347ea2e7ac..4c3068173b 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc @@ -323,20 +323,19 @@ preprocess_text_thai (const hb_ot_shape_plan_t *plan, buffer->clear_output (); unsigned int count = buffer->len; - for (buffer->idx = 0; buffer->idx < count && buffer->successful;) + for (buffer->idx = 0; buffer->idx < count /* No need for: && buffer->successful */;) { hb_codepoint_t u = buffer->cur().codepoint; - if (likely (!IS_SARA_AM (u))) { - buffer->next_glyph (); + if (likely (!IS_SARA_AM (u))) + { + if (unlikely (!buffer->next_glyph ())) break; continue; } /* Is SARA AM. Decompose and reorder. */ - hb_glyph_info_t &nikhahit = buffer->output_glyph (NIKHAHIT_FROM_SARA_AM (u)); - _hb_glyph_info_set_continuation (&nikhahit); - buffer->replace_glyph (SARA_AA_FROM_SARA_AM (u)); - if (unlikely (!buffer->successful)) - return; + (void) buffer->output_glyph (NIKHAHIT_FROM_SARA_AM (u)); + _hb_glyph_info_set_continuation (&buffer->prev()); + if (unlikely (!buffer->replace_glyph (SARA_AA_FROM_SARA_AM (u)))) break; /* Make Nikhahit be recognized as a ccc=0 mark when zeroing widths. */ unsigned int end = buffer->out_len; diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-machine.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-machine.hh index 144e7d3a40..b4b2b75100 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-machine.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-machine.hh @@ -1,300 +1,348 @@ - #line 1 "hb-ot-shape-complex-use-machine.rl" /* - * Copyright © 2015 Mozilla Foundation. - * Copyright © 2015 Google, Inc. - * - * This is part of HarfBuzz, a text shaping library. - * - * Permission is hereby granted, without written agreement and without - * license or royalty fees, to use, copy, modify, and distribute this - * software and its documentation for any purpose, provided that the - * above copyright notice and the following two paragraphs appear in - * all copies of this software. - * - * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR - * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES - * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN - * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS - * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO - * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - * - * Mozilla Author(s): Jonathan Kew - * Google Author(s): Behdad Esfahbod - */ +* Copyright © 2015 Mozilla Foundation. +* Copyright © 2015 Google, Inc. 
+* +* This is part of HarfBuzz, a text shaping library. +* +* Permission is hereby granted, without written agreement and without +* license or royalty fees, to use, copy, modify, and distribute this +* software and its documentation for any purpose, provided that the +* above copyright notice and the following two paragraphs appear in +* all copies of this software. +* +* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR +* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES +* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN +* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +* DAMAGE. +* +* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, +* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS +* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO +* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +* +* Mozilla Author(s): Jonathan Kew +* Google Author(s): Behdad Esfahbod +*/ #ifndef HB_OT_SHAPE_COMPLEX_USE_MACHINE_HH #define HB_OT_SHAPE_COMPLEX_USE_MACHINE_HH #include "hb.hh" -#include "hb-ot-shape-complex-machine-index.hh" +#include "hb-ot-shape-complex-syllabic.hh" + +/* buffer var allocations */ +#define use_category() complex_var_u8_category() + +#define USE(Cat) use_syllable_machine_ex_##Cat + +enum use_syllable_type_t { + use_independent_cluster, + use_virama_terminated_cluster, + use_sakot_terminated_cluster, + use_standard_cluster, + use_number_joiner_terminated_cluster, + use_numeral_cluster, + use_symbol_cluster, + use_hieroglyph_cluster, + use_broken_cluster, + use_non_cluster, +}; -#line 39 "hb-ot-shape-complex-use-machine.hh" + +#line 57 "hb-ot-shape-complex-use-machine.hh" +#define use_syllable_machine_ex_B 1u +#define use_syllable_machine_ex_CMAbv 31u +#define use_syllable_machine_ex_CMBlw 32u +#define use_syllable_machine_ex_CS 43u +#define use_syllable_machine_ex_FAbv 24u +#define use_syllable_machine_ex_FBlw 25u +#define use_syllable_machine_ex_FMAbv 45u +#define use_syllable_machine_ex_FMBlw 46u +#define use_syllable_machine_ex_FMPst 47u +#define use_syllable_machine_ex_FPst 26u +#define use_syllable_machine_ex_G 49u +#define use_syllable_machine_ex_GB 5u +#define use_syllable_machine_ex_H 12u +#define use_syllable_machine_ex_HN 13u +#define use_syllable_machine_ex_HVM 44u +#define use_syllable_machine_ex_J 50u +#define use_syllable_machine_ex_MAbv 27u +#define use_syllable_machine_ex_MBlw 28u +#define use_syllable_machine_ex_MPre 30u +#define use_syllable_machine_ex_MPst 29u +#define use_syllable_machine_ex_N 4u +#define use_syllable_machine_ex_O 0u +#define use_syllable_machine_ex_R 18u +#define use_syllable_machine_ex_S 19u +#define use_syllable_machine_ex_SB 51u +#define use_syllable_machine_ex_SE 52u +#define use_syllable_machine_ex_SMAbv 41u +#define use_syllable_machine_ex_SMBlw 42u +#define use_syllable_machine_ex_SUB 11u +#define use_syllable_machine_ex_Sk 48u +#define use_syllable_machine_ex_VAbv 33u +#define use_syllable_machine_ex_VBlw 34u +#define use_syllable_machine_ex_VMAbv 37u +#define use_syllable_machine_ex_VMBlw 38u +#define use_syllable_machine_ex_VMPre 23u +#define use_syllable_machine_ex_VMPst 39u +#define use_syllable_machine_ex_VPre 22u +#define use_syllable_machine_ex_VPst 35u +#define use_syllable_machine_ex_ZWNJ 14u + + +#line 99 "hb-ot-shape-complex-use-machine.hh" static const unsigned char _use_syllable_machine_trans_keys[] = { - 
1u, 1u, 1u, 1u, 0u, 51u, 11u, 48u, 11u, 48u, 1u, 1u, 22u, 48u, 23u, 48u, - 24u, 47u, 25u, 47u, 26u, 47u, 45u, 46u, 46u, 46u, 24u, 48u, 24u, 48u, 24u, 48u, - 1u, 1u, 24u, 48u, 23u, 48u, 23u, 48u, 23u, 48u, 22u, 48u, 22u, 48u, 22u, 48u, - 11u, 48u, 1u, 48u, 13u, 13u, 4u, 4u, 11u, 48u, 41u, 42u, 42u, 42u, 11u, 48u, - 22u, 48u, 23u, 48u, 24u, 47u, 25u, 47u, 26u, 47u, 45u, 46u, 46u, 46u, 24u, 48u, - 24u, 48u, 24u, 48u, 24u, 48u, 23u, 48u, 23u, 48u, 23u, 48u, 22u, 48u, 22u, 48u, - 22u, 48u, 11u, 48u, 1u, 48u, 1u, 1u, 4u, 4u, 13u, 13u, 1u, 48u, 11u, 48u, - 41u, 42u, 42u, 42u, 1u, 5u, 50u, 52u, 49u, 52u, 49u, 51u, 0 + 1u, 1u, 1u, 1u, 0u, 37u, 5u, 34u, + 5u, 34u, 1u, 1u, 10u, 34u, 11u, 34u, + 12u, 33u, 13u, 33u, 14u, 33u, 31u, 32u, + 32u, 32u, 12u, 34u, 12u, 34u, 12u, 34u, + 1u, 1u, 12u, 34u, 11u, 34u, 11u, 34u, + 11u, 34u, 10u, 34u, 10u, 34u, 10u, 34u, + 5u, 34u, 1u, 34u, 7u, 7u, 3u, 3u, + 5u, 34u, 27u, 28u, 28u, 28u, 5u, 34u, + 10u, 34u, 11u, 34u, 12u, 33u, 13u, 33u, + 14u, 33u, 31u, 32u, 32u, 32u, 12u, 34u, + 12u, 34u, 12u, 34u, 12u, 34u, 11u, 34u, + 11u, 34u, 11u, 34u, 10u, 34u, 10u, 34u, + 10u, 34u, 5u, 34u, 1u, 34u, 1u, 1u, + 3u, 3u, 7u, 7u, 1u, 34u, 5u, 34u, + 27u, 28u, 28u, 28u, 1u, 4u, 36u, 38u, + 35u, 38u, 35u, 37u, 0u }; -static const char _use_syllable_machine_key_spans[] = { - 1, 1, 52, 38, 38, 1, 27, 26, - 24, 23, 22, 2, 1, 25, 25, 25, - 1, 25, 26, 26, 26, 27, 27, 27, - 38, 48, 1, 1, 38, 2, 1, 38, - 27, 26, 24, 23, 22, 2, 1, 25, - 25, 25, 25, 26, 26, 26, 27, 27, - 27, 38, 48, 1, 1, 1, 48, 38, - 2, 1, 5, 3, 4, 3 +static const signed char _use_syllable_machine_char_class[] = { + 0, 1, 2, 2, 3, 4, 2, 2, + 2, 2, 2, 5, 6, 7, 2, 2, + 2, 2, 8, 9, 2, 2, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 2, 24, 25, 26, + 2, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 0 }; static const short _use_syllable_machine_index_offsets[] = { - 0, 2, 4, 57, 96, 135, 137, 165, - 192, 217, 241, 264, 267, 269, 295, 321, - 347, 349, 375, 402, 429, 456, 484, 512, - 540, 579, 628, 630, 632, 671, 674, 676, - 715, 743, 770, 795, 819, 842, 845, 847, - 873, 899, 925, 951, 978, 1005, 1032, 1060, - 1088, 1116, 1155, 1204, 1206, 1208, 1210, 1259, - 1298, 1301, 1303, 1309, 1313, 1318 + 0, 1, 2, 40, 70, 100, 101, 126, + 150, 172, 193, 213, 215, 216, 239, 262, + 285, 286, 309, 333, 357, 381, 406, 431, + 456, 486, 520, 521, 522, 552, 554, 555, + 585, 610, 634, 656, 677, 697, 699, 700, + 723, 746, 769, 792, 816, 840, 864, 889, + 914, 939, 969, 1003, 1004, 1005, 1006, 1040, + 1070, 1072, 1073, 1077, 1080, 1084, 0 +}; + +static const signed char _use_syllable_machine_indicies[] = { + 1, 2, 4, 5, 6, 7, 8, 1, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 13, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 9, 36, 6, 37, + 39, 40, 38, 38, 38, 41, 42, 43, + 44, 45, 46, 47, 41, 48, 5, 49, + 50, 51, 52, 53, 54, 55, 38, 38, + 38, 56, 57, 58, 59, 40, 39, 40, + 38, 38, 38, 41, 42, 43, 44, 45, + 46, 47, 41, 48, 49, 49, 50, 51, + 52, 53, 54, 55, 38, 38, 38, 56, + 57, 58, 59, 40, 39, 41, 42, 43, + 44, 45, 38, 38, 38, 38, 38, 38, + 50, 51, 52, 53, 54, 55, 38, 38, + 38, 42, 57, 58, 59, 61, 42, 43, + 44, 45, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 53, 54, 55, 38, 38, + 38, 38, 57, 58, 59, 61, 43, 44, + 45, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 57, 58, 59, 44, 45, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 57, 58, + 59, 45, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 57, 58, 59, 57, 58, 58, + 43, 44, 45, 38, 38, 38, 38, 38, + 38, 
38, 38, 38, 53, 54, 55, 38, + 38, 38, 38, 57, 58, 59, 61, 43, + 44, 45, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 54, 55, 38, 38, + 38, 38, 57, 58, 59, 61, 43, 44, + 45, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 55, 38, 38, 38, + 38, 57, 58, 59, 61, 63, 43, 44, + 45, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 57, 58, 59, 61, 42, 43, 44, + 45, 38, 38, 38, 38, 38, 38, 50, + 51, 52, 53, 54, 55, 38, 38, 38, + 42, 57, 58, 59, 61, 42, 43, 44, + 45, 38, 38, 38, 38, 38, 38, 38, + 51, 52, 53, 54, 55, 38, 38, 38, + 42, 57, 58, 59, 61, 42, 43, 44, + 45, 38, 38, 38, 38, 38, 38, 38, + 38, 52, 53, 54, 55, 38, 38, 38, + 42, 57, 58, 59, 61, 41, 42, 43, + 44, 45, 38, 47, 41, 38, 38, 38, + 50, 51, 52, 53, 54, 55, 38, 38, + 38, 42, 57, 58, 59, 61, 41, 42, + 43, 44, 45, 38, 38, 41, 38, 38, + 38, 50, 51, 52, 53, 54, 55, 38, + 38, 38, 42, 57, 58, 59, 61, 41, + 42, 43, 44, 45, 46, 47, 41, 38, + 38, 38, 50, 51, 52, 53, 54, 55, + 38, 38, 38, 42, 57, 58, 59, 61, + 39, 40, 38, 38, 38, 41, 42, 43, + 44, 45, 46, 47, 41, 48, 38, 49, + 50, 51, 52, 53, 54, 55, 38, 38, + 38, 56, 57, 58, 59, 40, 39, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 42, 43, 44, 45, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 53, 54, 55, + 60, 60, 60, 60, 57, 58, 59, 61, + 65, 7, 39, 40, 38, 38, 38, 41, + 42, 43, 44, 45, 46, 47, 41, 48, + 5, 49, 50, 51, 52, 53, 54, 55, + 12, 67, 38, 56, 57, 58, 59, 40, + 12, 67, 67, 1, 70, 69, 69, 69, + 13, 14, 15, 16, 17, 18, 19, 13, + 20, 22, 22, 23, 24, 25, 26, 27, + 28, 69, 69, 69, 32, 33, 34, 35, + 70, 13, 14, 15, 16, 17, 69, 69, + 69, 69, 69, 69, 23, 24, 25, 26, + 27, 28, 69, 69, 69, 14, 33, 34, + 35, 71, 14, 15, 16, 17, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 26, + 27, 28, 69, 69, 69, 69, 33, 34, + 35, 71, 15, 16, 17, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 33, 34, 35, + 16, 17, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 33, 34, 35, 17, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 33, 34, + 35, 33, 34, 34, 15, 16, 17, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 26, 27, 28, 69, 69, 69, 69, 33, + 34, 35, 71, 15, 16, 17, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 27, 28, 69, 69, 69, 69, 33, 34, + 35, 71, 15, 16, 17, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 28, 69, 69, 69, 69, 33, 34, 35, + 71, 15, 16, 17, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 33, 34, 35, 71, + 14, 15, 16, 17, 69, 69, 69, 69, + 69, 69, 23, 24, 25, 26, 27, 28, + 69, 69, 69, 14, 33, 34, 35, 71, + 14, 15, 16, 17, 69, 69, 69, 69, + 69, 69, 69, 24, 25, 26, 27, 28, + 69, 69, 69, 14, 33, 34, 35, 71, + 14, 15, 16, 17, 69, 69, 69, 69, + 69, 69, 69, 69, 25, 26, 27, 28, + 69, 69, 69, 14, 33, 34, 35, 71, + 13, 14, 15, 16, 17, 69, 19, 13, + 69, 69, 69, 23, 24, 25, 26, 27, + 28, 69, 69, 69, 14, 33, 34, 35, + 71, 13, 14, 15, 16, 17, 69, 69, + 13, 69, 69, 69, 23, 24, 25, 26, + 27, 28, 69, 69, 69, 14, 33, 34, + 35, 71, 13, 14, 15, 16, 17, 18, + 19, 13, 69, 69, 69, 23, 24, 25, + 26, 27, 28, 69, 69, 69, 14, 33, + 34, 35, 71, 1, 70, 69, 69, 69, + 13, 14, 15, 16, 17, 18, 19, 13, + 20, 69, 22, 23, 24, 25, 26, 27, + 28, 69, 69, 69, 32, 33, 34, 35, + 70, 1, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 14, 15, 16, 17, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 26, 27, 28, 69, 69, 69, 69, 33, + 34, 35, 71, 1, 73, 10, 5, 69, + 69, 5, 1, 70, 10, 69, 69, 13, + 14, 15, 16, 17, 18, 19, 13, 20, + 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 69, 32, 33, 34, 35, 70, + 1, 70, 69, 69, 69, 13, 14, 15, + 16, 17, 18, 19, 13, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 69, 69, 
+ 69, 32, 33, 34, 35, 70, 29, 30, + 30, 5, 72, 72, 5, 75, 74, 36, + 36, 75, 74, 75, 36, 74, 37, 0 }; -static const char _use_syllable_machine_indicies[] = { - 1, 0, 2, 0, 3, 4, 5, 5, - 6, 7, 5, 5, 5, 5, 5, 1, - 8, 9, 5, 5, 5, 5, 10, 11, - 5, 5, 12, 13, 14, 15, 16, 17, - 18, 12, 19, 20, 21, 22, 23, 24, - 5, 25, 26, 27, 5, 28, 29, 30, - 31, 32, 33, 34, 8, 35, 5, 36, - 5, 38, 39, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 40, 41, 42, 43, - 44, 45, 46, 40, 47, 4, 48, 49, - 50, 51, 37, 52, 53, 54, 37, 37, - 37, 37, 55, 56, 57, 58, 39, 37, - 38, 39, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 40, 41, 42, 43, 44, - 45, 46, 40, 47, 48, 48, 49, 50, - 51, 37, 52, 53, 54, 37, 37, 37, - 37, 55, 56, 57, 58, 39, 37, 38, - 59, 40, 41, 42, 43, 44, 37, 37, - 37, 37, 37, 37, 49, 50, 51, 37, - 52, 53, 54, 37, 37, 37, 37, 41, - 56, 57, 58, 60, 37, 41, 42, 43, - 44, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 52, 53, 54, 37, 37, - 37, 37, 37, 56, 57, 58, 60, 37, - 42, 43, 44, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 56, 57, 58, - 37, 43, 44, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 56, 57, 58, - 37, 44, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 56, 57, 58, 37, - 56, 57, 37, 57, 37, 42, 43, 44, - 37, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 52, 53, 54, 37, 37, 37, - 37, 37, 56, 57, 58, 60, 37, 42, - 43, 44, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 53, 54, 37, - 37, 37, 37, 37, 56, 57, 58, 60, - 37, 42, 43, 44, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 37, 37, 37, - 54, 37, 37, 37, 37, 37, 56, 57, - 58, 60, 37, 62, 61, 42, 43, 44, - 37, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 37, 37, 37, 37, 37, - 37, 37, 56, 57, 58, 60, 37, 41, - 42, 43, 44, 37, 37, 37, 37, 37, - 37, 49, 50, 51, 37, 52, 53, 54, - 37, 37, 37, 37, 41, 56, 57, 58, - 60, 37, 41, 42, 43, 44, 37, 37, - 37, 37, 37, 37, 37, 50, 51, 37, - 52, 53, 54, 37, 37, 37, 37, 41, - 56, 57, 58, 60, 37, 41, 42, 43, - 44, 37, 37, 37, 37, 37, 37, 37, - 37, 51, 37, 52, 53, 54, 37, 37, - 37, 37, 41, 56, 57, 58, 60, 37, - 40, 41, 42, 43, 44, 37, 46, 40, - 37, 37, 37, 49, 50, 51, 37, 52, - 53, 54, 37, 37, 37, 37, 41, 56, - 57, 58, 60, 37, 40, 41, 42, 43, - 44, 37, 37, 40, 37, 37, 37, 49, - 50, 51, 37, 52, 53, 54, 37, 37, - 37, 37, 41, 56, 57, 58, 60, 37, - 40, 41, 42, 43, 44, 45, 46, 40, - 37, 37, 37, 49, 50, 51, 37, 52, - 53, 54, 37, 37, 37, 37, 41, 56, - 57, 58, 60, 37, 38, 39, 37, 37, - 37, 37, 37, 37, 37, 37, 37, 40, - 41, 42, 43, 44, 45, 46, 40, 47, - 37, 48, 49, 50, 51, 37, 52, 53, - 54, 37, 37, 37, 37, 55, 56, 57, - 58, 39, 37, 38, 59, 59, 59, 59, - 59, 59, 59, 59, 59, 59, 59, 59, - 59, 59, 59, 59, 59, 59, 59, 59, - 59, 41, 42, 43, 44, 59, 59, 59, - 59, 59, 59, 59, 59, 59, 59, 52, - 53, 54, 59, 59, 59, 59, 59, 56, - 57, 58, 60, 59, 64, 63, 6, 65, - 38, 39, 37, 37, 37, 37, 37, 37, - 37, 37, 37, 40, 41, 42, 43, 44, - 45, 46, 40, 47, 4, 48, 49, 50, - 51, 37, 52, 53, 54, 37, 11, 66, - 37, 55, 56, 57, 58, 39, 37, 11, - 66, 67, 66, 67, 1, 69, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 12, - 13, 14, 15, 16, 17, 18, 12, 19, - 21, 21, 22, 23, 24, 68, 25, 26, - 27, 68, 68, 68, 68, 31, 32, 33, - 34, 69, 68, 12, 13, 14, 15, 16, - 68, 68, 68, 68, 68, 68, 22, 23, - 24, 68, 25, 26, 27, 68, 68, 68, - 68, 13, 32, 33, 34, 70, 68, 13, - 14, 15, 16, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 25, 26, 27, - 68, 68, 68, 68, 68, 32, 33, 34, - 70, 68, 14, 15, 16, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 32, - 33, 34, 68, 15, 16, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 
68, - 68, 68, 68, 68, 68, 68, 68, 32, - 33, 34, 68, 16, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 32, 33, - 34, 68, 32, 33, 68, 33, 68, 14, - 15, 16, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 25, 26, 27, 68, - 68, 68, 68, 68, 32, 33, 34, 70, - 68, 14, 15, 16, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 26, - 27, 68, 68, 68, 68, 68, 32, 33, - 34, 70, 68, 14, 15, 16, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 27, 68, 68, 68, 68, 68, - 32, 33, 34, 70, 68, 14, 15, 16, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 32, 33, 34, 70, 68, 13, - 14, 15, 16, 68, 68, 68, 68, 68, - 68, 22, 23, 24, 68, 25, 26, 27, - 68, 68, 68, 68, 13, 32, 33, 34, - 70, 68, 13, 14, 15, 16, 68, 68, - 68, 68, 68, 68, 68, 23, 24, 68, - 25, 26, 27, 68, 68, 68, 68, 13, - 32, 33, 34, 70, 68, 13, 14, 15, - 16, 68, 68, 68, 68, 68, 68, 68, - 68, 24, 68, 25, 26, 27, 68, 68, - 68, 68, 13, 32, 33, 34, 70, 68, - 12, 13, 14, 15, 16, 68, 18, 12, - 68, 68, 68, 22, 23, 24, 68, 25, - 26, 27, 68, 68, 68, 68, 13, 32, - 33, 34, 70, 68, 12, 13, 14, 15, - 16, 68, 68, 12, 68, 68, 68, 22, - 23, 24, 68, 25, 26, 27, 68, 68, - 68, 68, 13, 32, 33, 34, 70, 68, - 12, 13, 14, 15, 16, 17, 18, 12, - 68, 68, 68, 22, 23, 24, 68, 25, - 26, 27, 68, 68, 68, 68, 13, 32, - 33, 34, 70, 68, 1, 69, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 12, - 13, 14, 15, 16, 17, 18, 12, 19, - 68, 21, 22, 23, 24, 68, 25, 26, - 27, 68, 68, 68, 68, 31, 32, 33, - 34, 69, 68, 1, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, - 68, 13, 14, 15, 16, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 25, - 26, 27, 68, 68, 68, 68, 68, 32, - 33, 34, 70, 68, 1, 71, 72, 68, - 9, 68, 4, 68, 68, 68, 4, 68, - 68, 68, 68, 68, 1, 69, 9, 68, - 68, 68, 68, 68, 68, 68, 68, 12, - 13, 14, 15, 16, 17, 18, 12, 19, - 20, 21, 22, 23, 24, 68, 25, 26, - 27, 68, 28, 29, 68, 31, 32, 33, - 34, 69, 68, 1, 69, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 12, 13, - 14, 15, 16, 17, 18, 12, 19, 20, - 21, 22, 23, 24, 68, 25, 26, 27, - 68, 68, 68, 68, 31, 32, 33, 34, - 69, 68, 28, 29, 68, 29, 68, 4, - 71, 71, 71, 4, 71, 74, 73, 35, - 73, 35, 74, 73, 74, 73, 35, 73, - 36, 73, 0 +static const signed char _use_syllable_machine_index_defaults[] = { + 0, 0, 6, 38, 38, 60, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 62, 38, 38, 38, 38, 38, 38, 38, + 38, 60, 64, 66, 38, 68, 68, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 72, 69, 69, 69, 69, + 69, 69, 72, 74, 74, 74, 0 }; -static const char _use_syllable_machine_trans_targs[] = { - 2, 31, 42, 2, 3, 2, 26, 28, - 51, 52, 54, 29, 32, 33, 34, 35, - 36, 46, 47, 48, 55, 49, 43, 44, - 45, 39, 40, 41, 56, 57, 58, 50, - 37, 38, 2, 59, 61, 2, 4, 5, - 6, 7, 8, 9, 10, 21, 22, 23, - 24, 18, 19, 20, 13, 14, 15, 25, - 11, 12, 2, 2, 16, 2, 17, 2, - 27, 2, 30, 2, 2, 0, 1, 2, - 53, 2, 60 +static const signed char _use_syllable_machine_cond_targs[] = { + 2, 31, 42, 2, 2, 3, 2, 26, + 28, 51, 52, 54, 29, 32, 33, 34, + 35, 36, 46, 47, 48, 55, 49, 43, + 44, 45, 39, 40, 41, 56, 57, 58, + 50, 37, 38, 2, 59, 61, 2, 4, + 5, 6, 7, 8, 9, 10, 21, 22, + 23, 24, 18, 19, 20, 13, 14, 15, + 25, 11, 12, 2, 2, 16, 2, 17, + 2, 27, 2, 30, 2, 2, 0, 1, + 2, 53, 2, 60, 0 }; -static const char _use_syllable_machine_trans_actions[] = { - 1, 2, 2, 5, 0, 6, 0, 0, - 0, 0, 2, 0, 2, 2, 0, 0, - 0, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 0, 0, 0, 2, - 0, 0, 7, 0, 0, 8, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 9, 10, 0, 11, 0, 12, - 0, 13, 0, 14, 15, 0, 0, 16, - 0, 17, 0 
+static const signed char _use_syllable_machine_cond_actions[] = { + 1, 2, 2, 0, 5, 0, 6, 0, + 0, 0, 0, 2, 0, 2, 2, 0, + 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 0, 0, 0, + 2, 0, 0, 7, 0, 0, 8, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 10, 0, 11, 0, + 12, 0, 13, 0, 14, 15, 0, 0, + 16, 0, 17, 0, 0 }; -static const char _use_syllable_machine_to_state_actions[] = { - 0, 0, 3, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0 +static const signed char _use_syllable_machine_to_state_actions[] = { + 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 }; -static const char _use_syllable_machine_from_state_actions[] = { - 0, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0 +static const signed char _use_syllable_machine_from_state_actions[] = { + 0, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 }; -static const short _use_syllable_machine_eof_trans[] = { - 1, 1, 0, 38, 38, 60, 38, 38, - 38, 38, 38, 38, 38, 38, 38, 38, - 62, 38, 38, 38, 38, 38, 38, 38, - 38, 60, 64, 66, 38, 68, 68, 69, - 69, 69, 69, 69, 69, 69, 69, 69, - 69, 69, 69, 69, 69, 69, 69, 69, - 69, 69, 69, 72, 69, 69, 69, 69, - 69, 69, 72, 74, 74, 74 +static const signed char _use_syllable_machine_eof_trans[] = { + 1, 1, 4, 39, 39, 61, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, + 63, 39, 39, 39, 39, 39, 39, 39, + 39, 61, 65, 67, 39, 69, 69, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 73, 70, 70, 70, 70, + 70, 70, 73, 75, 75, 75, 0 }; static const int use_syllable_machine_start = 2; @@ -304,185 +352,376 @@ static const int use_syllable_machine_error = -1; static const int use_syllable_machine_en_main = 2; -#line 39 "hb-ot-shape-complex-use-machine.rl" +#line 59 "hb-ot-shape-complex-use-machine.rl" -#line 154 "hb-ot-shape-complex-use-machine.rl" +#line 176 "hb-ot-shape-complex-use-machine.rl" #define found_syllable(syllable_type) \ - HB_STMT_START { \ - if (0) fprintf (stderr, "syllable %d..%d %s\n", (*ts).second.first, (*te).second.first, #syllable_type); \ - for (unsigned i = (*ts).second.first; i < (*te).second.first; ++i) \ - info[i].syllable() = (syllable_serial << 4) | use_##syllable_type; \ - syllable_serial++; \ - if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ - } HB_STMT_END +HB_STMT_START { \ + if (0) fprintf (stderr, "syllable %d..%d %s\n", (*ts).second.first, (*te).second.first, #syllable_type); \ + for (unsigned i = (*ts).second.first; i < (*te).second.first; ++i) \ + info[i].syllable() = (syllable_serial << 4) | syllable_type; \ + syllable_serial++; \ + if (unlikely (syllable_serial == 16)) syllable_serial = 1; \ + } HB_STMT_END + + +template <typename Iter> +struct machine_index_t : +hb_iter_with_fallback_t<machine_index_t<Iter>, +typename Iter::item_t> +{ + machine_index_t (const Iter& it) : it (it) {} + machine_index_t (const machine_index_t& o) : it (o.it) {} + + static constexpr bool is_random_access_iterator = Iter::is_random_access_iterator; + static constexpr 
bool is_sorted_iterator = Iter::is_sorted_iterator; + + typename Iter::item_t __item__ () const { return *it; } + typename Iter::item_t __item_at__ (unsigned i) const { return it[i]; } + unsigned __len__ () const { return it.len (); } + void __next__ () { ++it; } + void __forward__ (unsigned n) { it += n; } + void __prev__ () { --it; } + void __rewind__ (unsigned n) { it -= n; } + void operator = (unsigned n) + { unsigned index = (*it).first; if (index < n) it += n - index; else if (index > n) it -= index - n; } + void operator = (const machine_index_t& o) { *this = (*o.it).first; } + bool operator == (const machine_index_t& o) const { return (*it).first == (*o.it).first; } + bool operator != (const machine_index_t& o) const { return !(*this == o); } + + private: + Iter it; +}; +struct +{ + template <typename Iter, + hb_requires (hb_is_iterable (Iter))> + machine_index_t<hb_iter_type<Iter>> + operator () (Iter&& it) const + { return machine_index_t<hb_iter_type<Iter>> (hb_iter (it)); } +} +HB_FUNCOBJ (machine_index); + + static bool not_standard_default_ignorable (const hb_glyph_info_t &i) -{ return !(i.use_category() == USE_O && _hb_glyph_info_is_default_ignorable (&i)); } +{ return !(i.use_category() == USE(O) && _hb_glyph_info_is_default_ignorable (&i)); } -static void +static inline void find_syllables_use (hb_buffer_t *buffer) { - hb_glyph_info_t *info = buffer->info; - auto p = - + hb_iter (info, buffer->len) - | hb_enumerate - | hb_filter ([] (const hb_glyph_info_t &i) { return not_standard_default_ignorable (i); }, - hb_second) - | hb_filter ([&] (const hb_pair_t<unsigned, const hb_glyph_info_t &> p) - { - if (p.second.use_category() == USE_ZWNJ) - for (unsigned i = p.first + 1; i < buffer->len; ++i) - if (not_standard_default_ignorable (info[i])) - return !_hb_glyph_info_is_unicode_mark (&info[i]); - return true; - }) - | hb_enumerate - | machine_index - ; - auto pe = p + p.len (); - auto eof = +pe; - auto ts = +p; - auto te = +p; - unsigned int act HB_UNUSED; - int cs; - -#line 355 "hb-ot-shape-complex-use-machine.hh" + hb_glyph_info_t *info = buffer->info; + auto p = + + hb_iter (info, buffer->len) + | hb_enumerate + | hb_filter ([] (const hb_glyph_info_t &i) { return not_standard_default_ignorable (i); }, + hb_second) + | hb_filter ([&] (const hb_pair_t<unsigned, const hb_glyph_info_t &> p) + { + if (p.second.use_category() == USE(ZWNJ)) + for (unsigned i = p.first + 1; i < buffer->len; ++i) + if (not_standard_default_ignorable (info[i])) + return !_hb_glyph_info_is_unicode_mark (&info[i]); + return true; + }) + | hb_enumerate + | machine_index + ; + auto pe = p + p.len (); + auto eof = +pe; + auto ts = +p; + auto te = +p; + unsigned int act HB_UNUSED; + int cs; + +#line 443 "hb-ot-shape-complex-use-machine.hh" { - cs = use_syllable_machine_start; - ts = 0; - te = 0; - act = 0; + cs = (int)use_syllable_machine_start; + ts = 0; + te = 0; } - -#line 198 "hb-ot-shape-complex-use-machine.rl" - - - unsigned int syllable_serial = 1; - -#line 368 "hb-ot-shape-complex-use-machine.hh" + +#line 260 "hb-ot-shape-complex-use-machine.rl" + + + unsigned int syllable_serial = 1; + +#line 455 "hb-ot-shape-complex-use-machine.hh" { - int _slen; - int _trans; - const unsigned char *_keys; - const char *_inds; - if ( p == pe ) - goto _test_eof; -_resume: - switch ( _use_syllable_machine_from_state_actions[cs] ) { - case 4: + unsigned int _trans = 0; + const unsigned char * _keys; + const signed char * _inds; + int _ic; + _resume: {} + if ( p == pe && p != eof ) + goto _out; + switch ( 
_use_syllable_machine_from_state_actions[cs] ) { + case 4: { + { #line 1 "NONE" - {ts = p;} - break; -#line 382 "hb-ot-shape-complex-use-machine.hh" - } - - _keys = _use_syllable_machine_trans_keys + (cs<<1); - _inds = _use_syllable_machine_indicies + _use_syllable_machine_index_offsets[cs]; - - _slen = _use_syllable_machine_key_spans[cs]; - _trans = _inds[ _slen > 0 && _keys[0] <=( (*p).second.second.use_category()) && - ( (*p).second.second.use_category()) <= _keys[1] ? - ( (*p).second.second.use_category()) - _keys[0] : _slen ]; - -_eof_trans: - cs = _use_syllable_machine_trans_targs[_trans]; - - if ( _use_syllable_machine_trans_actions[_trans] == 0 ) - goto _again; - - switch ( _use_syllable_machine_trans_actions[_trans] ) { - case 2: + {ts = p;}} + +#line 470 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + } + + if ( p == eof ) { + if ( _use_syllable_machine_eof_trans[cs] > 0 ) { + _trans = (unsigned int)_use_syllable_machine_eof_trans[cs] - 1; + } + } + else { + _keys = ( _use_syllable_machine_trans_keys + ((cs<<1))); + _inds = ( _use_syllable_machine_indicies + (_use_syllable_machine_index_offsets[cs])); + + if ( ((*p).second.second.use_category()) <= 52 ) { + _ic = (int)_use_syllable_machine_char_class[(int)((*p).second.second.use_category()) - 0]; + if ( _ic <= (int)(*( _keys+1)) && _ic >= (int)(*( _keys)) ) + _trans = (unsigned int)(*( _inds + (int)( _ic - (int)(*( _keys)) ) )); + else + _trans = (unsigned int)_use_syllable_machine_index_defaults[cs]; + } + else { + _trans = (unsigned int)_use_syllable_machine_index_defaults[cs]; + } + + } + cs = (int)_use_syllable_machine_cond_targs[_trans]; + + if ( _use_syllable_machine_cond_actions[_trans] != 0 ) { + + switch ( _use_syllable_machine_cond_actions[_trans] ) { + case 2: { + { #line 1 "NONE" - {te = p+1;} - break; - case 5: -#line 141 "hb-ot-shape-complex-use-machine.rl" - {te = p+1;{ found_syllable (independent_cluster); }} - break; - case 9: -#line 144 "hb-ot-shape-complex-use-machine.rl" - {te = p+1;{ found_syllable (standard_cluster); }} - break; - case 7: -#line 149 "hb-ot-shape-complex-use-machine.rl" - {te = p+1;{ found_syllable (broken_cluster); }} - break; - case 6: -#line 150 "hb-ot-shape-complex-use-machine.rl" - {te = p+1;{ found_syllable (non_cluster); }} - break; - case 10: -#line 142 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (virama_terminated_cluster); }} - break; - case 11: -#line 143 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (sakot_terminated_cluster); }} - break; - case 8: -#line 144 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (standard_cluster); }} - break; - case 13: -#line 145 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (number_joiner_terminated_cluster); }} - break; - case 12: -#line 146 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (numeral_cluster); }} - break; - case 14: -#line 147 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (symbol_cluster); }} - break; - case 17: -#line 148 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (hieroglyph_cluster); }} - break; - case 15: -#line 149 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (broken_cluster); }} - break; - case 16: -#line 150 "hb-ot-shape-complex-use-machine.rl" - {te = p;p--;{ found_syllable (non_cluster); }} - break; - case 1: -#line 149 "hb-ot-shape-complex-use-machine.rl" - {{p = ((te))-1;}{ found_syllable (broken_cluster); }} - break; -#line 460 
"hb-ot-shape-complex-use-machine.hh" - } - -_again: - switch ( _use_syllable_machine_to_state_actions[cs] ) { - case 3: + {te = p+1;}} + +#line 508 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 5: { + { +#line 163 "hb-ot-shape-complex-use-machine.rl" + {te = p+1;{ +#line 163 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_independent_cluster); } + }} + +#line 521 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 9: { + { +#line 166 "hb-ot-shape-complex-use-machine.rl" + {te = p+1;{ +#line 166 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_standard_cluster); } + }} + +#line 534 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 7: { + { +#line 171 "hb-ot-shape-complex-use-machine.rl" + {te = p+1;{ +#line 171 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_broken_cluster); } + }} + +#line 547 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 6: { + { +#line 172 "hb-ot-shape-complex-use-machine.rl" + {te = p+1;{ +#line 172 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_non_cluster); } + }} + +#line 560 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 10: { + { +#line 164 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 164 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_virama_terminated_cluster); } + }} + +#line 573 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 11: { + { +#line 165 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 165 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_sakot_terminated_cluster); } + }} + +#line 586 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 8: { + { +#line 166 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 166 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_standard_cluster); } + }} + +#line 599 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 13: { + { +#line 167 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 167 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_number_joiner_terminated_cluster); } + }} + +#line 612 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 12: { + { +#line 168 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 168 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_numeral_cluster); } + }} + +#line 625 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 14: { + { +#line 169 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 169 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_symbol_cluster); } + }} + +#line 638 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 17: { + { +#line 170 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 170 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_hieroglyph_cluster); } + }} + +#line 651 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 15: { + { +#line 171 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 171 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_broken_cluster); } + }} + +#line 664 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 16: { + { +#line 172 "hb-ot-shape-complex-use-machine.rl" + {te = p;p = p - 1;{ +#line 172 "hb-ot-shape-complex-use-machine.rl" + found_syllable (use_non_cluster); } + }} + +#line 677 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + case 1: { + { +#line 171 "hb-ot-shape-complex-use-machine.rl" + {p = ((te))-1; + { +#line 171 
"hb-ot-shape-complex-use-machine.rl" + found_syllable (use_broken_cluster); } + }} + +#line 691 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + } + + } + + if ( p == eof ) { + if ( cs >= 2 ) + goto _out; + } + else { + switch ( _use_syllable_machine_to_state_actions[cs] ) { + case 3: { + { #line 1 "NONE" - {ts = 0;} - break; -#line 469 "hb-ot-shape-complex-use-machine.hh" + {ts = 0;}} + +#line 711 "hb-ot-shape-complex-use-machine.hh" + + + break; + } + } + + p += 1; + goto _resume; + } + _out: {} } - - if ( ++p != pe ) - goto _resume; - _test_eof: {} - if ( p == eof ) - { - if ( _use_syllable_machine_eof_trans[cs] > 0 ) { - _trans = _use_syllable_machine_eof_trans[cs] - 1; - goto _eof_trans; - } - } - - } - -#line 203 "hb-ot-shape-complex-use-machine.rl" - + +#line 265 "hb-ot-shape-complex-use-machine.rl" + } #undef found_syllable diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-table.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-table.hh index df3652b18a..a35894ce81 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-table.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use-table.hh @@ -31,56 +31,57 @@ * UnicodeData.txt does not have a header. */ -#include "hb.hh" +#ifndef HB_OT_SHAPE_COMPLEX_USE_TABLE_HH +#define HB_OT_SHAPE_COMPLEX_USE_TABLE_HH -#ifndef HB_NO_OT_SHAPE +#include "hb.hh" -#include "hb-ot-shape-complex-use.hh" +#include "hb-ot-shape-complex-use-machine.hh" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-macros" -#define B USE_B /* BASE */ -#define CS USE_CS /* CONS_WITH_STACKER */ -#define G USE_G /* HIEROGLYPH */ -#define GB USE_GB /* BASE_OTHER */ -#define H USE_H /* HALANT */ -#define HN USE_HN /* HALANT_NUM */ -#define HVM USE_HVM /* HALANT_OR_VOWEL_MODIFIER */ -#define J USE_J /* HIEROGLYPH_JOINER */ -#define N USE_N /* BASE_NUM */ -#define O USE_O /* OTHER */ -#define R USE_R /* REPHA */ -#define S USE_S /* SYM */ -#define SB USE_SB /* HIEROGLYPH_SEGMENT_BEGIN */ -#define SE USE_SE /* HIEROGLYPH_SEGMENT_END */ -#define SUB USE_SUB /* CONS_SUB */ -#define Sk USE_Sk /* SAKOT */ -#define ZWNJ USE_ZWNJ /* ZWNJ */ -#define CMAbv USE_CMAbv -#define CMBlw USE_CMBlw -#define FAbv USE_FAbv -#define FBlw USE_FBlw -#define FPst USE_FPst -#define FMAbv USE_FMAbv -#define FMBlw USE_FMBlw -#define FMPst USE_FMPst -#define MAbv USE_MAbv -#define MBlw USE_MBlw -#define MPst USE_MPst -#define MPre USE_MPre -#define SMAbv USE_SMAbv -#define SMBlw USE_SMBlw -#define VAbv USE_VAbv -#define VBlw USE_VBlw -#define VPst USE_VPst -#define VPre USE_VPre -#define VMAbv USE_VMAbv -#define VMBlw USE_VMBlw -#define VMPst USE_VMPst -#define VMPre USE_VMPre +#define B USE(B) /* BASE */ +#define CS USE(CS) /* CONS_WITH_STACKER */ +#define G USE(G) /* HIEROGLYPH */ +#define GB USE(GB) /* BASE_OTHER */ +#define H USE(H) /* HALANT */ +#define HN USE(HN) /* HALANT_NUM */ +#define HVM USE(HVM) /* HALANT_OR_VOWEL_MODIFIER */ +#define J USE(J) /* HIEROGLYPH_JOINER */ +#define N USE(N) /* BASE_NUM */ +#define O USE(O) /* OTHER */ +#define R USE(R) /* REPHA */ +#define S USE(S) /* SYM */ +#define SB USE(SB) /* HIEROGLYPH_SEGMENT_BEGIN */ +#define SE USE(SE) /* HIEROGLYPH_SEGMENT_END */ +#define SUB USE(SUB) /* CONS_SUB */ +#define Sk USE(Sk) /* SAKOT */ +#define ZWNJ USE(ZWNJ) /* ZWNJ */ +#define CMAbv USE(CMAbv) +#define CMBlw USE(CMBlw) +#define FAbv USE(FAbv) +#define FBlw USE(FBlw) +#define FPst USE(FPst) +#define FMAbv USE(FMAbv) +#define FMBlw USE(FMBlw) +#define FMPst USE(FMPst) +#define MAbv USE(MAbv) +#define MBlw USE(MBlw) +#define MPst 
USE(MPst) +#define MPre USE(MPre) +#define SMAbv USE(SMAbv) +#define SMBlw USE(SMBlw) +#define VAbv USE(VAbv) +#define VBlw USE(VBlw) +#define VPst USE(VPst) +#define VPre USE(VPre) +#define VMAbv USE(VMAbv) +#define VMBlw USE(VMBlw) +#define VMPst USE(VMPst) +#define VMPre USE(VMPre) #pragma GCC diagnostic pop -static const USE_TABLE_ELEMENT_TYPE use_table[] = { +static const uint8_t use_table[] = { #define use_offset_0x0028u 0 @@ -767,7 +768,7 @@ static const USE_TABLE_ELEMENT_TYPE use_table[] = { /* 11700 */ B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, /* 11710 */ B, B, B, B, B, B, B, B, B, B, B, O, O, MBlw, MPre, MAbv, - /* 11720 */ VPst, VPst, VAbv, VAbv, VBlw, VBlw, VPre, VAbv, VBlw, VAbv, VAbv, VMAbv, O, O, O, O, + /* 11720 */ VPst, VPst, VAbv, VAbv, VBlw, VBlw, VPre, VAbv, VBlw, VAbv, VAbv, VAbv, O, O, O, O, /* 11730 */ B, B, B, B, B, B, B, B, B, B, B, B, O, O, O, O, #define use_offset_0x11800u 5848 @@ -1066,7 +1067,7 @@ static const USE_TABLE_ELEMENT_TYPE use_table[] = { }; /* Table items: 8824; occupancy: 79% */ -USE_TABLE_ELEMENT_TYPE +static inline uint8_t hb_use_get_category (hb_codepoint_t u) { switch (u >> 12) @@ -1154,7 +1155,7 @@ hb_use_get_category (hb_codepoint_t u) default: break; } - return USE_O; + return USE(O); } #undef B @@ -1198,5 +1199,5 @@ hb_use_get_category (hb_codepoint_t u) #undef VMPre -#endif +#endif /* HB_OT_SHAPE_COMPLEX_USE_TABLE_HH */ /* == End of generated table == */ diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.cc index 8ac569d8bf..0d0b7e771e 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.cc @@ -30,14 +30,12 @@ #ifndef HB_NO_OT_SHAPE -#include "hb-ot-shape-complex-use.hh" +#include "hb-ot-shape-complex-use-machine.hh" +#include "hb-ot-shape-complex-use-table.hh" #include "hb-ot-shape-complex-arabic.hh" #include "hb-ot-shape-complex-arabic-joining-list.hh" #include "hb-ot-shape-complex-vowel-constraints.hh" -/* buffer var allocations */ -#define use_category() complex_var_u8_1() - /* * Universal Shaping Engine. @@ -69,11 +67,11 @@ use_topographical_features[] = }; /* Same order as use_topographical_features. */ enum joining_form_t { - USE_ISOL, - USE_INIT, - USE_MEDI, - USE_FINA, - _USE_NONE + JOINING_FORM_ISOL, + JOINING_FORM_INIT, + JOINING_FORM_MEDI, + JOINING_FORM_FINA, + _JOINING_FORM_NONE }; static const hb_tag_t use_other_features[] = @@ -186,22 +184,6 @@ data_destroy_use (void *data) free (data); } -enum use_syllable_type_t { - use_independent_cluster, - use_virama_terminated_cluster, - use_sakot_terminated_cluster, - use_standard_cluster, - use_number_joiner_terminated_cluster, - use_numeral_cluster, - use_symbol_cluster, - use_hieroglyph_cluster, - use_broken_cluster, - use_non_cluster, -}; - -#include "hb-ot-shape-complex-use-machine.hh" - - static void setup_masks_use (const hb_ot_shape_plan_t *plan, hb_buffer_t *buffer, @@ -239,7 +221,7 @@ setup_rphf_mask (const hb_ot_shape_plan_t *plan, foreach_syllable (buffer, start, end) { - unsigned int limit = info[start].use_category() == USE_R ? 1 : hb_min (3u, end - start); + unsigned int limit = info[start].use_category() == USE(R) ? 
1 : hb_min (3u, end - start); for (unsigned int i = start; i < start + limit; i++) info[i].mask |= mask; } @@ -253,7 +235,7 @@ setup_topographical_masks (const hb_ot_shape_plan_t *plan, if (use_plan->arabic_plan) return; - static_assert ((USE_INIT < 4 && USE_ISOL < 4 && USE_MEDI < 4 && USE_FINA < 4), ""); + static_assert ((JOINING_FORM_INIT < 4 && JOINING_FORM_ISOL < 4 && JOINING_FORM_MEDI < 4 && JOINING_FORM_FINA < 4), ""); hb_mask_t masks[4], all_masks = 0; for (unsigned int i = 0; i < 4; i++) { @@ -267,7 +249,7 @@ setup_topographical_masks (const hb_ot_shape_plan_t *plan, hb_mask_t other_masks = ~all_masks; unsigned int last_start = 0; - joining_form_t last_form = _USE_NONE; + joining_form_t last_form = _JOINING_FORM_NONE; hb_glyph_info_t *info = buffer->info; foreach_syllable (buffer, start, end) { @@ -279,7 +261,7 @@ setup_topographical_masks (const hb_ot_shape_plan_t *plan, case use_hieroglyph_cluster: case use_non_cluster: /* These don't join. Nothing to do. */ - last_form = _USE_NONE; + last_form = _JOINING_FORM_NONE; break; case use_virama_terminated_cluster: @@ -289,18 +271,18 @@ setup_topographical_masks (const hb_ot_shape_plan_t *plan, case use_numeral_cluster: case use_broken_cluster: - bool join = last_form == USE_FINA || last_form == USE_ISOL; + bool join = last_form == JOINING_FORM_FINA || last_form == JOINING_FORM_ISOL; if (join) { /* Fixup previous syllable's form. */ - last_form = last_form == USE_FINA ? USE_MEDI : USE_INIT; + last_form = last_form == JOINING_FORM_FINA ? JOINING_FORM_MEDI : JOINING_FORM_INIT; for (unsigned int i = last_start; i < start; i++) info[i].mask = (info[i].mask & other_masks) | masks[last_form]; } /* Form for this syllable. */ - last_form = join ? USE_FINA : USE_ISOL; + last_form = join ? JOINING_FORM_FINA : JOINING_FORM_ISOL; for (unsigned int i = start; i < end; i++) info[i].mask = (info[i].mask & other_masks) | masks[last_form]; @@ -336,11 +318,11 @@ record_rphf_use (const hb_ot_shape_plan_t *plan, foreach_syllable (buffer, start, end) { - /* Mark a substituted repha as USE_R. */ + /* Mark a substituted repha as USE(R). 
*/ for (unsigned int i = start; i < end && (info[i].mask & mask); i++) if (_hb_glyph_info_substituted (&info[i])) { - info[i].use_category() = USE_R; + info[i].use_category() = USE(R); break; } } @@ -359,7 +341,7 @@ record_pref_use (const hb_ot_shape_plan_t *plan HB_UNUSED, for (unsigned int i = start; i < end; i++) if (_hb_glyph_info_substituted (&info[i])) { - info[i].use_category() = USE_VPre; + info[i].use_category() = USE(VPre); break; } } @@ -368,7 +350,7 @@ record_pref_use (const hb_ot_shape_plan_t *plan HB_UNUSED, static inline bool is_halant_use (const hb_glyph_info_t &info) { - return (info.use_category() == USE_H || info.use_category() == USE_HVM) && + return (info.use_category() == USE(H) || info.use_category() == USE(HVM)) && !_hb_glyph_info_ligated (&info); } @@ -387,24 +369,24 @@ reorder_syllable_use (hb_buffer_t *buffer, unsigned int start, unsigned int end) hb_glyph_info_t *info = buffer->info; -#define POST_BASE_FLAGS64 (FLAG64 (USE_FAbv) | \ - FLAG64 (USE_FBlw) | \ - FLAG64 (USE_FPst) | \ - FLAG64 (USE_MAbv) | \ - FLAG64 (USE_MBlw) | \ - FLAG64 (USE_MPst) | \ - FLAG64 (USE_MPre) | \ - FLAG64 (USE_VAbv) | \ - FLAG64 (USE_VBlw) | \ - FLAG64 (USE_VPst) | \ - FLAG64 (USE_VPre) | \ - FLAG64 (USE_VMAbv) | \ - FLAG64 (USE_VMBlw) | \ - FLAG64 (USE_VMPst) | \ - FLAG64 (USE_VMPre)) +#define POST_BASE_FLAGS64 (FLAG64 (USE(FAbv)) | \ + FLAG64 (USE(FBlw)) | \ + FLAG64 (USE(FPst)) | \ + FLAG64 (USE(MAbv)) | \ + FLAG64 (USE(MBlw)) | \ + FLAG64 (USE(MPst)) | \ + FLAG64 (USE(MPre)) | \ + FLAG64 (USE(VAbv)) | \ + FLAG64 (USE(VBlw)) | \ + FLAG64 (USE(VPst)) | \ + FLAG64 (USE(VPre)) | \ + FLAG64 (USE(VMAbv)) | \ + FLAG64 (USE(VMBlw)) | \ + FLAG64 (USE(VMPst)) | \ + FLAG64 (USE(VMPre))) /* Move things forward. */ - if (info[start].use_category() == USE_R && end - start > 1) + if (info[start].use_category() == USE(R) && end - start > 1) { /* Got a repha. Reorder it towards the end, but before the first post-base * glyph. */ @@ -441,7 +423,7 @@ reorder_syllable_use (hb_buffer_t *buffer, unsigned int start, unsigned int end) * shift things in between forward. */ j = i + 1; } - else if (((flag) & (FLAG (USE_VPre) | FLAG (USE_VMPre))) && + else if (((flag) & (FLAG (USE(VPre)) | FLAG (USE(VMPre)))) && /* Only move the first component of a MultipleSubst. */ 0 == _hb_glyph_info_get_lig_comp (&info[i]) && j < i) @@ -454,76 +436,22 @@ reorder_syllable_use (hb_buffer_t *buffer, unsigned int start, unsigned int end) } } -static inline void -insert_dotted_circles_use (const hb_ot_shape_plan_t *plan HB_UNUSED, - hb_font_t *font, - hb_buffer_t *buffer) -{ - if (unlikely (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)) - return; - - /* Note: This loop is extra overhead, but should not be measurable. - * TODO Use a buffer scratch flag to remove the loop. 
*/ - bool has_broken_syllables = false; - unsigned int count = buffer->len; - hb_glyph_info_t *info = buffer->info; - for (unsigned int i = 0; i < count; i++) - if ((info[i].syllable() & 0x0F) == use_broken_cluster) - { - has_broken_syllables = true; - break; - } - if (likely (!has_broken_syllables)) - return; - - hb_glyph_info_t dottedcircle = {0}; - if (!font->get_nominal_glyph (0x25CCu, &dottedcircle.codepoint)) - return; - dottedcircle.use_category() = hb_use_get_category (0x25CC); - - buffer->clear_output (); - - buffer->idx = 0; - unsigned int last_syllable = 0; - while (buffer->idx < buffer->len && buffer->successful) - { - unsigned int syllable = buffer->cur().syllable(); - use_syllable_type_t syllable_type = (use_syllable_type_t) (syllable & 0x0F); - if (unlikely (last_syllable != syllable && syllable_type == use_broken_cluster)) - { - last_syllable = syllable; - - hb_glyph_info_t ginfo = dottedcircle; - ginfo.cluster = buffer->cur().cluster; - ginfo.mask = buffer->cur().mask; - ginfo.syllable() = buffer->cur().syllable(); - - /* Insert dottedcircle after possible Repha. */ - while (buffer->idx < buffer->len && buffer->successful && - last_syllable == buffer->cur().syllable() && - buffer->cur().use_category() == USE_R) - buffer->next_glyph (); - - buffer->output_info (ginfo); - } - else - buffer->next_glyph (); - } - buffer->swap_buffers (); -} - static void reorder_use (const hb_ot_shape_plan_t *plan, hb_font_t *font, hb_buffer_t *buffer) { - if (buffer->message (font, "start reordering USE")) { - insert_dotted_circles_use (plan, font, buffer); + if (buffer->message (font, "start reordering USE")) + { + hb_syllabic_insert_dotted_circles (font, buffer, + use_broken_cluster, + USE(B), + USE(R)); - foreach_syllable (buffer, start, end) - reorder_syllable_use (buffer, start, end); + foreach_syllable (buffer, start, end) + reorder_syllable_use (buffer, start, end); - (void) buffer->message (font, "end reordering USE"); + (void) buffer->message (font, "end reordering USE"); } HB_BUFFER_DEALLOCATE_VAR (buffer, use_category); diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.hh deleted file mode 100644 index 788fb6b6ac..0000000000 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-use.hh +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright © 2015 Mozilla Foundation. - * Copyright © 2015 Google, Inc. - * - * This is part of HarfBuzz, a text shaping library. - * - * Permission is hereby granted, without written agreement and without - * license or royalty fees, to use, copy, modify, and distribute this - * software and its documentation for any purpose, provided that the - * above copyright notice and the following two paragraphs appear in - * all copies of this software. - * - * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR - * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES - * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN - * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS - * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO - * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- * - * Mozilla Author(s): Jonathan Kew - * Google Author(s): Behdad Esfahbod - */ - -#ifndef HB_OT_SHAPE_COMPLEX_USE_HH -#define HB_OT_SHAPE_COMPLEX_USE_HH - -#include "hb.hh" - - -#include "hb-ot-shape-complex.hh" - - -#define USE_TABLE_ELEMENT_TYPE uint8_t - -/* Cateories used in the Universal Shaping Engine spec: - * https://docs.microsoft.com/en-us/typography/script-development/use - */ -/* Note: This enum is duplicated in the -machine.rl source file. - * Not sure how to avoid duplication. */ -enum use_category_t { - USE_O = 0, /* OTHER */ - - USE_B = 1, /* BASE */ - USE_N = 4, /* BASE_NUM */ - USE_GB = 5, /* BASE_OTHER */ - USE_SUB = 11, /* CONS_SUB */ - USE_H = 12, /* HALANT */ - - USE_HN = 13, /* HALANT_NUM */ - USE_ZWNJ = 14, /* Zero width non-joiner */ - USE_R = 18, /* REPHA */ - USE_S = 19, /* SYM */ - USE_CS = 43, /* CONS_WITH_STACKER */ - - /* https://github.com/harfbuzz/harfbuzz/issues/1102 */ - USE_HVM = 44, /* HALANT_OR_VOWEL_MODIFIER */ - - USE_Sk = 48, /* SAKOT */ - USE_G = 49, /* HIEROGLYPH */ - USE_J = 50, /* HIEROGLYPH_JOINER */ - USE_SB = 51, /* HIEROGLYPH_SEGMENT_BEGIN */ - USE_SE = 52, /* HIEROGLYPH_SEGMENT_END */ - - USE_FAbv = 24, /* CONS_FINAL_ABOVE */ - USE_FBlw = 25, /* CONS_FINAL_BELOW */ - USE_FPst = 26, /* CONS_FINAL_POST */ - USE_MAbv = 27, /* CONS_MED_ABOVE */ - USE_MBlw = 28, /* CONS_MED_BELOW */ - USE_MPst = 29, /* CONS_MED_POST */ - USE_MPre = 30, /* CONS_MED_PRE */ - USE_CMAbv = 31, /* CONS_MOD_ABOVE */ - USE_CMBlw = 32, /* CONS_MOD_BELOW */ - USE_VAbv = 33, /* VOWEL_ABOVE / VOWEL_ABOVE_BELOW / VOWEL_ABOVE_BELOW_POST / VOWEL_ABOVE_POST */ - USE_VBlw = 34, /* VOWEL_BELOW / VOWEL_BELOW_POST */ - USE_VPst = 35, /* VOWEL_POST UIPC = Right */ - USE_VPre = 22, /* VOWEL_PRE / VOWEL_PRE_ABOVE / VOWEL_PRE_ABOVE_POST / VOWEL_PRE_POST */ - USE_VMAbv = 37, /* VOWEL_MOD_ABOVE */ - USE_VMBlw = 38, /* VOWEL_MOD_BELOW */ - USE_VMPst = 39, /* VOWEL_MOD_POST */ - USE_VMPre = 23, /* VOWEL_MOD_PRE */ - USE_SMAbv = 41, /* SYM_MOD_ABOVE */ - USE_SMBlw = 42, /* SYM_MOD_BELOW */ - USE_FMAbv = 45, /* CONS_FINAL_MOD UIPC = Top */ - USE_FMBlw = 46, /* CONS_FINAL_MOD UIPC = Bottom */ - USE_FMPst = 47, /* CONS_FINAL_MOD UIPC = Not_Applicable */ -}; - -HB_INTERNAL USE_TABLE_ELEMENT_TYPE -hb_use_get_category (hb_codepoint_t u); - -#endif /* HB_OT_SHAPE_COMPLEX_USE_HH */ diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc index 1af546e4fa..1037626998 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc @@ -23,15 +23,15 @@ static void _output_dotted_circle (hb_buffer_t *buffer) { - hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu); - _hb_glyph_info_reset_continuation (&dottedcircle); + (void) buffer->output_glyph (0x25CCu); + _hb_glyph_info_reset_continuation (&buffer->prev()); } static void _output_with_dotted_circle (hb_buffer_t *buffer) { _output_dotted_circle (buffer); - buffer->next_glyph (); + (void) buffer->next_glyph (); } void @@ -51,7 +51,6 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, * * https://github.com/harfbuzz/harfbuzz/issues/1019 */ - bool processed = false; buffer->clear_output (); unsigned int count = buffer->len; switch ((unsigned) buffer->props.script) @@ -97,15 +96,14 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, buffer->idx + 2 < count && 0x0907u == buffer->cur (2).codepoint) { - 
buffer->next_glyph (); + (void) buffer->next_glyph (); matched = true; } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_BENGALI: @@ -124,10 +122,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x09E2u == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_GURMUKHI: @@ -161,10 +158,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_GUJARATI: @@ -186,10 +182,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x0ABEu == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_ORIYA: @@ -205,10 +200,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x0B57u == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_TAMIL: @@ -220,10 +214,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, { matched = true; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_TELUGU: @@ -244,10 +237,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x0C55u == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_KANNADA: @@ -263,10 +255,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x0CCCu == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_MALAYALAM: @@ -290,10 +281,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_SINHALA: @@ -326,10 +316,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_BRAHMI: @@ -348,10 +337,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x11042u == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_KHUDAWADI: @@ -370,10 +358,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_TIRHUTA: @@ -397,10 +384,9 @@ 
_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_MODI: @@ -418,10 +404,9 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; case HB_SCRIPT_TAKRI: @@ -442,21 +427,15 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, matched = 0x116B2u == buffer->cur (1).codepoint; break; } - buffer->next_glyph (); + (void) buffer->next_glyph (); if (matched) _output_with_dotted_circle (buffer); } - processed = true; break; default: break; } - if (processed) - { - if (buffer->idx < count) - buffer->next_glyph (); - buffer->swap_buffers (); - } + buffer->swap_buffers (); } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex.hh index a1a7a6a47b..19e24b9f30 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-complex.hh +++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex.hh @@ -35,8 +35,8 @@ /* buffer var allocations, used by complex shapers */ -#define complex_var_u8_0() var2.u8[2] -#define complex_var_u8_1() var2.u8[3] +#define complex_var_u8_category() var2.u8[2] +#define complex_var_u8_auxiliary() var2.u8[3] #define HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS 32 @@ -186,27 +186,8 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) case HB_SCRIPT_ARABIC: /* Unicode-3.0 additions */ - case HB_SCRIPT_MONGOLIAN: case HB_SCRIPT_SYRIAC: - /* Unicode-5.0 additions */ - case HB_SCRIPT_NKO: - case HB_SCRIPT_PHAGS_PA: - - /* Unicode-6.0 additions */ - case HB_SCRIPT_MANDAIC: - - /* Unicode-7.0 additions */ - case HB_SCRIPT_MANICHAEAN: - case HB_SCRIPT_PSALTER_PAHLAVI: - - /* Unicode-9.0 additions */ - case HB_SCRIPT_ADLAM: - - /* Unicode-11.0 additions */ - case HB_SCRIPT_HANIFI_ROHINGYA: - case HB_SCRIPT_SOGDIAN: - /* For Arabic script, use the Arabic shaper even if no OT script tag was found. * This is because we do fallback shaping for Arabic script (and not others). 
* But note that Arabic shaping is applicable only to horizontal layout; for @@ -284,8 +265,9 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) return &_hb_ot_complex_shaper_myanmar; - /* https://github.com/harfbuzz/harfbuzz/issues/1162 */ +#define HB_SCRIPT_MYANMAR_ZAWGYI ((hb_script_t) HB_TAG ('Q','a','a','g')) case HB_SCRIPT_MYANMAR_ZAWGYI: + /* https://github.com/harfbuzz/harfbuzz/issues/1162 */ return &_hb_ot_complex_shaper_myanmar_zawgyi; @@ -294,7 +276,7 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) case HB_SCRIPT_TIBETAN: /* Unicode-3.0 additions */ - //case HB_SCRIPT_MONGOLIAN: + case HB_SCRIPT_MONGOLIAN: //case HB_SCRIPT_SINHALA: /* Unicode-3.2 additions */ @@ -315,8 +297,8 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) /* Unicode-5.0 additions */ case HB_SCRIPT_BALINESE: - //case HB_SCRIPT_NKO: - //case HB_SCRIPT_PHAGS_PA: + case HB_SCRIPT_NKO: + case HB_SCRIPT_PHAGS_PA: /* Unicode-5.1 additions */ case HB_SCRIPT_CHAM: @@ -337,7 +319,7 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) /* Unicode-6.0 additions */ case HB_SCRIPT_BATAK: case HB_SCRIPT_BRAHMI: - //case HB_SCRIPT_MANDAIC: + case HB_SCRIPT_MANDAIC: /* Unicode-6.1 additions */ case HB_SCRIPT_CHAKMA: @@ -351,10 +333,10 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) case HB_SCRIPT_KHOJKI: case HB_SCRIPT_KHUDAWADI: case HB_SCRIPT_MAHAJANI: - //case HB_SCRIPT_MANICHAEAN: + case HB_SCRIPT_MANICHAEAN: case HB_SCRIPT_MODI: case HB_SCRIPT_PAHAWH_HMONG: - //case HB_SCRIPT_PSALTER_PAHLAVI: + case HB_SCRIPT_PSALTER_PAHLAVI: case HB_SCRIPT_SIDDHAM: case HB_SCRIPT_TIRHUTA: @@ -363,7 +345,7 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) case HB_SCRIPT_MULTANI: /* Unicode-9.0 additions */ - //case HB_SCRIPT_ADLAM: + case HB_SCRIPT_ADLAM: case HB_SCRIPT_BHAIKSUKI: case HB_SCRIPT_MARCHEN: case HB_SCRIPT_NEWA: @@ -376,11 +358,11 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) /* Unicode-11.0 additions */ case HB_SCRIPT_DOGRA: case HB_SCRIPT_GUNJALA_GONDI: - //case HB_SCRIPT_HANIFI_ROHINGYA: + case HB_SCRIPT_HANIFI_ROHINGYA: case HB_SCRIPT_MAKASAR: case HB_SCRIPT_MEDEFAIDRIN: case HB_SCRIPT_OLD_SOGDIAN: - //case HB_SCRIPT_SOGDIAN: + case HB_SCRIPT_SOGDIAN: /* Unicode-12.0 additions */ case HB_SCRIPT_ELYMAIC: diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc b/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc index 3eabae1b45..778b5b8bd8 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc @@ -101,8 +101,9 @@ set_glyph (hb_glyph_info_t &info, hb_font_t *font) static inline void output_char (hb_buffer_t *buffer, hb_codepoint_t unichar, hb_codepoint_t glyph) { + /* This is very confusing indeed. */ buffer->cur().glyph_index() = glyph; - buffer->output_glyph (unichar); /* This is very confusing indeed. 
*/ + (void) buffer->output_glyph (unichar); _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer); } @@ -110,7 +111,7 @@ static inline void next_char (hb_buffer_t *buffer, hb_codepoint_t glyph) { buffer->cur().glyph_index() = glyph; - buffer->next_glyph (); + (void) buffer->next_glyph (); } static inline void @@ -229,30 +230,35 @@ handle_variation_selector_cluster (const hb_ot_shape_normalize_context_t *c, if (font->get_variation_glyph (buffer->cur().codepoint, buffer->cur(+1).codepoint, &buffer->cur().glyph_index())) { hb_codepoint_t unicode = buffer->cur().codepoint; - buffer->replace_glyphs (2, 1, &unicode); + (void) buffer->replace_glyphs (2, 1, &unicode); } else { /* Just pass on the two characters separately, let GSUB do its magic. */ set_glyph (buffer->cur(), font); - buffer->next_glyph (); + (void) buffer->next_glyph (); set_glyph (buffer->cur(), font); - buffer->next_glyph (); + (void) buffer->next_glyph (); } /* Skip any further variation selectors. */ - while (buffer->idx < end && unlikely (buffer->unicode->is_variation_selector (buffer->cur().codepoint))) + while (buffer->idx < end && + buffer->successful && + unlikely (buffer->unicode->is_variation_selector (buffer->cur().codepoint))) { set_glyph (buffer->cur(), font); - buffer->next_glyph (); + (void) buffer->next_glyph (); } - } else { + } + else + { set_glyph (buffer->cur(), font); - buffer->next_glyph (); + (void) buffer->next_glyph (); } } - if (likely (buffer->idx < end)) { + if (likely (buffer->idx < end)) + { set_glyph (buffer->cur(), font); - buffer->next_glyph (); + (void) buffer->next_glyph (); } } @@ -348,7 +354,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, sizeof (buffer->info[0]), &buffer->cur().glyph_index(), sizeof (buffer->info[0])); - buffer->next_glyphs (done); + if (unlikely (!buffer->next_glyphs (done))) break; } while (buffer->idx < end && buffer->successful) decompose_current_character (&c, might_short_circuit); @@ -419,6 +425,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, /* Third round, recompose */ if (!all_simple && + buffer->successful && (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS || mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT)) { @@ -428,8 +435,8 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, buffer->clear_output (); count = buffer->len; unsigned int starter = 0; - buffer->next_glyph (); - while (buffer->idx < count && buffer->successful) + (void) buffer->next_glyph (); + while (buffer->idx < count /* No need for: && buffer->successful */) { hb_codepoint_t composed, glyph; if (/* We don't try to compose a non-mark character with it's preceding starter. @@ -451,9 +458,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, font->get_nominal_glyph (composed, &glyph)) { /* Composes. */ - buffer->next_glyph (); /* Copy to out-buffer. */ - if (unlikely (!buffer->successful)) - return; + if (unlikely (!buffer->next_glyph ())) break; /* Copy to out-buffer. */ buffer->merge_out_clusters (starter, buffer->out_len); buffer->out_len--; /* Remove the second composable. */ /* Modify starter and carry on. */ @@ -466,7 +471,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, } /* Blocked, or doesn't compose. 
*/ - buffer->next_glyph (); + if (unlikely (!buffer->next_glyph ())) break; if (info_cc (buffer->prev()) == 0) starter = buffer->out_len - 1; diff --git a/thirdparty/harfbuzz/src/hb-ot-shape.cc b/thirdparty/harfbuzz/src/hb-ot-shape.cc index 7d90558458..86ab0b4268 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape.cc @@ -534,9 +534,7 @@ hb_insert_dotted_circle (hb_buffer_t *buffer, hb_font_t *font) hb_glyph_info_t info = dottedcircle; info.cluster = buffer->cur().cluster; info.mask = buffer->cur().mask; - buffer->output_info (info); - while (buffer->idx < buffer->len && buffer->successful) - buffer->next_glyph (); + (void) buffer->output_info (info); buffer->swap_buffers (); } diff --git a/thirdparty/harfbuzz/src/hb-ot-shape.h b/thirdparty/harfbuzz/src/hb-ot-shape.h index 7b1bcc0637..afdff72833 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape.h +++ b/thirdparty/harfbuzz/src/hb-ot-shape.h @@ -24,7 +24,7 @@ * Red Hat Author(s): Behdad Esfahbod */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif diff --git a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh index f1c391cf0e..87830b5462 100644 --- a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh @@ -6,8 +6,8 @@ * * on files with these headers: * - * <meta name="updated_at" content="2020-11-17 08:21 AM" /> - * File-Date: 2020-09-29 + * <meta name="updated_at" content="2021-02-12 04:08 PM" /> + * File-Date: 2021-03-05 */ #ifndef HB_OT_TAG_TABLE_HH @@ -169,6 +169,7 @@ static const LangTag ot_languages[] = { {"bko", HB_TAG('B','M','L',' ')}, /* Kwa' -> Bamileke */ {"bla", HB_TAG('B','K','F',' ')}, /* Siksika -> Blackfoot */ {"ble", HB_TAG('B','L','N',' ')}, /* Balanta-Kentohe -> Balante */ + {"blg", HB_TAG('I','B','A',' ')}, /* Balau (retired code) -> Iban */ {"bli", HB_TAG_NONE }, /* Bolia != Baluchi */ {"blk", HB_TAG('B','L','K',' ')}, /* Pa’o Karen */ {"blk", HB_TAG('K','R','N',' ')}, /* Pa'o Karen -> Karen */ @@ -358,6 +359,7 @@ static const LangTag ot_languages[] = { {"czo", HB_TAG('Z','H','S',' ')}, /* Min Zhong Chinese -> Chinese, Simplified */ {"czt", HB_TAG('Q','I','N',' ')}, /* Zotung Chin -> Chin */ {"da", HB_TAG('D','A','N',' ')}, /* Danish */ +/*{"dag", HB_TAG('D','A','G',' ')},*/ /* Dagbani */ {"dao", HB_TAG('Q','I','N',' ')}, /* Daai Chin -> Chin */ {"dap", HB_TAG('N','I','S',' ')}, /* Nisi (India) (retired code) */ /*{"dar", HB_TAG('D','A','R',' ')},*/ /* Dargwa */ @@ -834,6 +836,7 @@ static const LangTag ot_languages[] = { {"lri", HB_TAG('L','U','H',' ')}, /* Marachi -> Luyia */ {"lrm", HB_TAG('L','U','H',' ')}, /* Marama -> Luyia */ {"lrt", HB_TAG('C','P','P',' ')}, /* Larantuka Malay -> Creoles */ + {"lsb", HB_TAG_NONE }, /* Burundian Sign Language != Lower Sorbian */ {"lsm", HB_TAG('L','U','H',' ')}, /* Saamia -> Luyia */ {"lt", HB_TAG('L','T','H',' ')}, /* Lithuanian */ {"ltg", HB_TAG('L','V','I',' ')}, /* Latgalian -> Latvian */ @@ -990,7 +993,7 @@ static const LangTag ot_languages[] = { /*{"nga", HB_TAG('N','G','A',' ')},*/ /* Ngbaka */ {"ngl", HB_TAG('L','M','W',' ')}, /* Lomwe */ {"ngm", HB_TAG('C','P','P',' ')}, /* Ngatik Men's Creole -> Creoles */ - {"ngo", HB_TAG('S','X','T',' ')}, /* Ngoni -> Sutu */ + {"ngo", HB_TAG('S','X','T',' ')}, /* Ngoni (retired code) -> Sutu */ {"ngr", HB_TAG_NONE }, /* Engdewu != Nagari */ {"ngu", HB_TAG('N','A','H',' ')}, /* Guerrero Nahuatl -> Nahuatl */ {"nhc", HB_TAG('N','A','H',' ')}, /* 
Tabasco Nahuatl -> Nahuatl */ @@ -1520,6 +1523,8 @@ static const LangTag ot_languages[] = { {"xmm", HB_TAG('C','P','P',' ')}, /* Manado Malay -> Creoles */ {"xmv", HB_TAG('M','L','G',' ')}, /* Antankarana Malagasy -> Malagasy */ {"xmw", HB_TAG('M','L','G',' ')}, /* Tsimihety Malagasy -> Malagasy */ + {"xnj", HB_TAG('S','X','T',' ')}, /* Ngoni (Tanzania) -> Sutu */ + {"xnq", HB_TAG('S','X','T',' ')}, /* Ngoni (Mozambique) -> Sutu */ {"xnr", HB_TAG('D','G','R',' ')}, /* Kangri -> Dogri (macrolanguage) */ /*{"xog", HB_TAG('X','O','G',' ')},*/ /* Soga */ {"xpe", HB_TAG('X','P','E',' ')}, /* Liberia Kpelle -> Kpelle (Liberia) */ @@ -2808,6 +2813,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("hnd", -1); /* Southern Hindko */ case HB_TAG('H','Y','E',' '): /* Armenian */ return hb_language_from_string ("hyw", -1); /* Western Armenian */ + case HB_TAG('I','B','A',' '): /* Iban */ + return hb_language_from_string ("iba", -1); /* Iban */ case HB_TAG('I','J','O',' '): /* Ijo */ return hb_language_from_string ("ijo", -1); /* Ijo [family] */ case HB_TAG('I','N','U',' '): /* Inuktitut */ @@ -2892,6 +2899,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("sq", -1); /* Albanian [macrolanguage] */ case HB_TAG('S','R','B',' '): /* Serbian */ return hb_language_from_string ("sr", -1); /* Serbian */ + case HB_TAG('S','X','T',' '): /* Sutu */ + return hb_language_from_string ("xnj", -1); /* Ngoni (Tanzania) */ case HB_TAG('S','Y','R',' '): /* Syriac */ return hb_language_from_string ("syr", -1); /* Syriac [macrolanguage] */ case HB_TAG('S','Y','R','E'): /* Syriac, Estrangela script-variant (equivalent to ISO 15924 'Syre') */ diff --git a/thirdparty/harfbuzz/src/hb-ot-tag.cc b/thirdparty/harfbuzz/src/hb-ot-tag.cc index 19bd3639d3..fc145a41f7 100644 --- a/thirdparty/harfbuzz/src/hb-ot-tag.cc +++ b/thirdparty/harfbuzz/src/hb-ot-tag.cc @@ -164,6 +164,15 @@ hb_ot_all_tags_from_script (hb_script_t script, *count = i; } +/** + * hb_ot_tag_to_script: + * @tag: a script tag + * + * Converts a script tag to an #hb_script_t. + * + * Return value: The #hb_script_t corresponding to @tag. + * + **/ hb_script_t hb_ot_tag_to_script (hb_tag_t tag) { @@ -351,13 +360,13 @@ parse_private_use_subtag (const char *private_use_subtag, * hb_ot_tags_from_script_and_language: * @script: an #hb_script_t to convert. * @language: an #hb_language_t to convert. - * @script_count: (allow-none): maximum number of script tags to retrieve (IN) + * @script_count: (inout) (optional): maximum number of script tags to retrieve (IN) * and actual number of script tags retrieved (OUT) - * @script_tags: (out) (allow-none): array of size at least @script_count to store the + * @script_tags: (out) (optional): array of size at least @script_count to store the * script tag results - * @language_count: (allow-none): maximum number of language tags to retrieve + * @language_count: (inout) (optional): maximum number of language tags to retrieve * (IN) and actual number of language tags retrieved (OUT) - * @language_tags: (out) (allow-none): array of size at least @language_count to store + * @language_tags: (out) (optional): array of size at least @language_count to store * the language tag results * * Converts an #hb_script_t and an #hb_language_t to script and language tags. @@ -424,10 +433,12 @@ hb_ot_tags_from_script_and_language (hb_script_t script, /** * hb_ot_tag_to_language: + * @tag: an language tag * + * Converts a language tag to an #hb_language_t. 
* - * - * Return value: (transfer none): + * Return value: (transfer none) (nullable): + * The #hb_language_t corresponding to @tag. * * Since: 0.9.2 **/ @@ -478,9 +489,9 @@ hb_ot_tag_to_language (hb_tag_t tag) * hb_ot_tags_to_script_and_language: * @script_tag: a script tag * @language_tag: a language tag - * @script: (allow-none): the #hb_script_t corresponding to @script_tag (OUT). - * @language: (allow-none): the #hb_language_t corresponding to @script_tag and - * @language_tag (OUT). + * @script: (out) (optional): the #hb_script_t corresponding to @script_tag. + * @language: (out) (optional): the #hb_language_t corresponding to @script_tag and + * @language_tag. * * Converts a script tag and a language tag to an #hb_script_t and an * #hb_language_t. diff --git a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh index 4d4e6dcae4..7e4eaaad95 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh @@ -652,8 +652,8 @@ no_more_gaps: /* apply specified / inferred deltas to points */ for (unsigned int i = 0; i < points.length; i++) { - points[i].x += roundf (deltas[i].x); - points[i].y += roundf (deltas[i].y); + points[i].x += deltas[i].x; + points[i].y += deltas[i].y; } } while (iterator.move_to_next ()); diff --git a/thirdparty/harfbuzz/src/hb-ot-var.cc b/thirdparty/harfbuzz/src/hb-ot-var.cc index 1fe57383c0..6b42b45cd9 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var.cc +++ b/thirdparty/harfbuzz/src/hb-ot-var.cc @@ -56,7 +56,7 @@ * * Tests whether a face includes any OpenType variation data in the `fvar` table. * - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 1.4.2 **/ @@ -87,7 +87,7 @@ hb_ot_var_get_axis_count (hb_face_t *face) * hb_ot_var_get_axes: * @face: #hb_face_t to work upon * @start_offset: offset of the first lookup to retrieve - * @axes_count: (inout) (allow-none): Input = the maximum number of variation axes to return; + * @axes_count: (inout) (optional): Input = the maximum number of variation axes to return; * Output = the actual number of variation axes returned (may be zero) * @axes_array: (out caller-allocates) (array length=axes_count): The array of variation axes found * @@ -133,7 +133,7 @@ hb_ot_var_find_axis (hb_face_t *face, * hb_ot_var_get_axis_infos: * @face: #hb_face_t to work upon * @start_offset: offset of the first lookup to retrieve - * @axes_count: (inout) (allow-none): Input = the maximum number of variation axes to return; + * @axes_count: (inout) (optional): Input = the maximum number of variation axes to return; * Output = the actual number of variation axes returned (may be zero) * @axes_array: (out caller-allocates) (array length=axes_count): The array of variation axes found * @@ -162,7 +162,7 @@ hb_ot_var_get_axis_infos (hb_face_t *face, * Fetches the variation-axis information corresponding to the specified axis tag * in the specified face. 
* - * Return value: true if data found, false otherwise + * Return value: %true if data found, %false otherwise * * Since: 2.2.0 **/ @@ -237,7 +237,7 @@ hb_ot_var_named_instance_get_postscript_name_id (hb_face_t *face, * hb_ot_var_named_instance_get_design_coords: * @face: The #hb_face_t to work on * @instance_index: The index of the named instance to query - * @coords_length: (inout) (allow-none): Input = the maximum number of coordinates to return; + * @coords_length: (inout) (optional): Input = the maximum number of coordinates to return; * Output = the actual number of coordinates returned (may be zero) * @coords: (out) (array length=coords_length): The array of coordinates found for the query * diff --git a/thirdparty/harfbuzz/src/hb-ot-var.h b/thirdparty/harfbuzz/src/hb-ot-var.h index ef2ca0a716..ce201d3b4f 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var.h +++ b/thirdparty/harfbuzz/src/hb-ot-var.h @@ -24,7 +24,7 @@ * Red Hat Author(s): Behdad Esfahbod */ -#ifndef HB_OT_H_IN +#if !defined(HB_OT_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb-ot.h> instead." #endif @@ -36,34 +36,38 @@ HB_BEGIN_DECLS /** - * hb_tag_t: - * @HB_OT_TAG_VAR_AXIS_ITALIC: Registered tag for the roman/italic axis + * HB_OT_TAG_VAR_AXIS_ITALIC: + * + * Registered tag for the roman/italic axis. */ #define HB_OT_TAG_VAR_AXIS_ITALIC HB_TAG('i','t','a','l') /** - * hb_tag_t: - * @HB_OT_TAG_VAR_AXIS_OPTICAL_SIZE: Registered tag for the optical-size axis + * HB_OT_TAG_VAR_AXIS_OPTICAL_SIZE: * + * Registered tag for the optical-size axis. * <note>Note: The optical-size axis supersedes the OpenType `size` feature.</note> */ #define HB_OT_TAG_VAR_AXIS_OPTICAL_SIZE HB_TAG('o','p','s','z') /** - * hb_tag_t: - * @HB_OT_TAG_VAR_AXIS_SLANT: Registered tag for the slant axis + * HB_OT_TAG_VAR_AXIS_SLANT: + * + * Registered tag for the slant axis */ #define HB_OT_TAG_VAR_AXIS_SLANT HB_TAG('s','l','n','t') /** - * hb_tag_t: - * @HB_OT_TAG_VAR_AXIS_WIDTH: Registered tag for the width axis + * HB_OT_TAG_VAR_AXIS_WIDTH: + * + * Registered tag for the width axis. */ #define HB_OT_TAG_VAR_AXIS_WIDTH HB_TAG('w','d','t','h') /** - * hb_tag_t: - * @HB_OT_TAG_VAR_AXIS_WEIGHT: Registered tag for the weight axis + * HB_OT_TAG_VAR_AXIS_WEIGHT: + * + * Registered tag for the weight axis. */ #define HB_OT_TAG_VAR_AXIS_WEIGHT HB_TAG('w','g','h','t') @@ -88,11 +92,14 @@ hb_ot_var_get_axis_count (hb_face_t *face); * hb_ot_var_axis_flags_t: * @HB_OT_VAR_AXIS_FLAG_HIDDEN: The axis should not be exposed directly in user interfaces. * + * Flags for #hb_ot_var_axis_info_t. + * * Since: 2.2.0 */ typedef enum { /*< flags >*/ HB_OT_VAR_AXIS_FLAG_HIDDEN = 0x00000001u, + /*< private >*/ _HB_OT_VAR_AXIS_FLAG_MAX_VALUE= HB_TAG_MAX_SIGNED /*< skip >*/ } hb_ot_var_axis_flags_t; diff --git a/thirdparty/harfbuzz/src/hb-sanitize.hh b/thirdparty/harfbuzz/src/hb-sanitize.hh index 024b4d1c99..1675e8448a 100644 --- a/thirdparty/harfbuzz/src/hb-sanitize.hh +++ b/thirdparty/harfbuzz/src/hb-sanitize.hh @@ -73,7 +73,7 @@ * === The sanitize() contract === * * The sanitize() method of each object type shall return true if it's safe to - * call other methods of the object, and false otherwise. + * call other methods of the object, and %false otherwise. * * Note that what sanitize() checks for might align with what the specification * describes as valid table data, but does not have to be. 
In particular, we @@ -113,8 +113,8 @@ #ifndef HB_SANITIZE_MAX_OPS_MAX #define HB_SANITIZE_MAX_OPS_MAX 0x3FFFFFFF #endif -#ifndef HB_SANITIZE_MAX_SUTABLES -#define HB_SANITIZE_MAX_SUTABLES 0x4000 +#ifndef HB_SANITIZE_MAX_SUBTABLES +#define HB_SANITIZE_MAX_SUBTABLES 0x4000 #endif struct hb_sanitize_context_t : @@ -139,7 +139,7 @@ struct hb_sanitize_context_t : bool visit_subtables (unsigned count) { max_subtables += count; - return max_subtables < HB_SANITIZE_MAX_SUTABLES; + return max_subtables < HB_SANITIZE_MAX_SUBTABLES; } private: diff --git a/thirdparty/harfbuzz/src/hb-serialize.hh b/thirdparty/harfbuzz/src/hb-serialize.hh index 4566153a59..fe29bdf96e 100644 --- a/thirdparty/harfbuzz/src/hb-serialize.hh +++ b/thirdparty/harfbuzz/src/hb-serialize.hh @@ -256,10 +256,11 @@ struct hb_serialize_context_t packed.push (obj); - if (unlikely (packed.in_error ())) { - // obj wasn't successfully added to packed, so clean it up otherwise it's - // links will be leaked. - propagate_error (packed); + if (unlikely (!propagate_error (packed))) + { + /* Obj wasn't successfully added to packed, so clean it up otherwise its + * links will be leaked. When we use constructor/destructors properly, we + * can remove these. */ obj->fini (); return 0; } @@ -523,7 +524,7 @@ struct hb_serialize_context_t template <typename T> void assign_offset (const object_t* parent, const object_t::link_t &link, unsigned offset) { - auto &off = * ((BEInt<T, sizeof (T)> *) (parent->head + link.position)); + auto &off = * ((BEInt<T> *) (parent->head + link.position)); assert (0 == off); check_assign (off, offset); } diff --git a/thirdparty/harfbuzz/src/hb-set.cc b/thirdparty/harfbuzz/src/hb-set.cc index 3b4059ad32..86bf70034c 100644 --- a/thirdparty/harfbuzz/src/hb-set.cc +++ b/thirdparty/harfbuzz/src/hb-set.cc @@ -117,7 +117,7 @@ hb_set_destroy (hb_set_t *set) * @set: A set * @key: The user-data key to set * @data: A pointer to the user data to set - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified set. @@ -162,7 +162,7 @@ hb_set_get_user_data (hb_set_t *set, * * Tests whether memory allocation for a set was successful. * - * Return value: %true if allocation succeeded, false otherwise + * Return value: %true if allocation succeeded, %false otherwise * * Since: 0.9.2 **/ @@ -183,6 +183,9 @@ hb_set_allocation_successful (const hb_set_t *set) void hb_set_clear (hb_set_t *set) { + if (unlikely (hb_object_is_immutable (set))) + return; + set->clear (); } @@ -209,7 +212,7 @@ hb_set_is_empty (const hb_set_t *set) * * Tests whether @codepoint belongs to @set. * - * Return value: %true if @codepoint is in @set, false otherwise + * Return value: %true if @codepoint is in @set, %false otherwise * * Since: 0.9.2 **/ @@ -298,7 +301,7 @@ hb_set_del_range (hb_set_t *set, * Tests whether @set and @other are equal (contain the same * elements). * - * Return value: %TRUE if the two sets are equal, %FALSE otherwise. + * Return value: %true if the two sets are equal, %false otherwise. * * Since: 0.9.7 **/ @@ -316,7 +319,7 @@ hb_set_is_equal (const hb_set_t *set, * * Tests whether @set is a subset of @larger_set. * - * Return value: %TRUE if the @set is a subset of (or equal to) @larger_set, %FALSE otherwise. + * Return value: %true if the @set is a subset of (or equal to) @larger_set, %false otherwise. 
* * Since: 1.8.1 **/ @@ -447,7 +450,7 @@ hb_set_get_population (const hb_set_t *set) * * Finds the smallest element in the set. * - * Return value: minimum of @set, or %HB_SET_VALUE_INVALID if @set is empty. + * Return value: minimum of @set, or #HB_SET_VALUE_INVALID if @set is empty. * * Since: 0.9.7 **/ @@ -463,7 +466,7 @@ hb_set_get_min (const hb_set_t *set) * * Finds the largest element in the set. * - * Return value: maximum of @set, or %HB_SET_VALUE_INVALID if @set is empty. + * Return value: maximum of @set, or #HB_SET_VALUE_INVALID if @set is empty. * * Since: 0.9.7 **/ @@ -481,9 +484,9 @@ hb_set_get_max (const hb_set_t *set) * * Fetches the next element in @set that is greater than current value of @codepoint. * - * Set @codepoint to %HB_SET_VALUE_INVALID to get started. + * Set @codepoint to #HB_SET_VALUE_INVALID to get started. * - * Return value: %true if there was a next value, false otherwise + * Return value: %true if there was a next value, %false otherwise * * Since: 0.9.2 **/ @@ -502,9 +505,9 @@ hb_set_next (const hb_set_t *set, * * Fetches the previous element in @set that is lower than current value of @codepoint. * - * Set @codepoint to %HB_SET_VALUE_INVALID to get started. + * Set @codepoint to #HB_SET_VALUE_INVALID to get started. * - * Return value: %true if there was a previous value, false otherwise + * Return value: %true if there was a previous value, %false otherwise * * Since: 1.8.0 **/ @@ -525,9 +528,9 @@ hb_set_previous (const hb_set_t *set, * Fetches the next consecutive range of elements in @set that * are greater than current value of @last. * - * Set @last to %HB_SET_VALUE_INVALID to get started. + * Set @last to #HB_SET_VALUE_INVALID to get started. * - * Return value: %true if there was a next range, false otherwise + * Return value: %true if there was a next range, %false otherwise * * Since: 0.9.7 **/ @@ -549,9 +552,9 @@ hb_set_next_range (const hb_set_t *set, * Fetches the previous consecutive range of elements in @set that * are greater than current value of @last. * - * Set @first to %HB_SET_VALUE_INVALID to get started. + * Set @first to #HB_SET_VALUE_INVALID to get started. * - * Return value: %true if there was a previous range, false otherwise + * Return value: %true if there was a previous range, %false otherwise * * Since: 1.8.0 **/ diff --git a/thirdparty/harfbuzz/src/hb-set.h b/thirdparty/harfbuzz/src/hb-set.h index cafc36dbad..0ad27f4bbd 100644 --- a/thirdparty/harfbuzz/src/hb-set.h +++ b/thirdparty/harfbuzz/src/hb-set.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -36,7 +36,11 @@ HB_BEGIN_DECLS -/* +/** + * HB_SET_VALUE_INVALID: + * + * Unset #hb_set_t value. 
+ * * Since: 0.9.21 */ #define HB_SET_VALUE_INVALID ((hb_codepoint_t) -1) diff --git a/thirdparty/harfbuzz/src/hb-set.hh b/thirdparty/harfbuzz/src/hb-set.hh index b6e2086a2e..ae8b5eb10f 100644 --- a/thirdparty/harfbuzz/src/hb-set.hh +++ b/thirdparty/harfbuzz/src/hb-set.hh @@ -244,7 +244,7 @@ struct hb_set_t bool resize (unsigned int count) { - if (unlikely (!successful)) return false; + if (unlikely (count > pages.length && !successful)) return false; if (!pages.resize (count) || !page_map.resize (count)) { pages.resize (page_map.length); @@ -256,19 +256,14 @@ struct hb_set_t void reset () { - if (unlikely (hb_object_is_immutable (this))) - return; - clear (); successful = true; + clear (); } void clear () { - if (unlikely (hb_object_is_immutable (this))) - return; - population = 0; - page_map.resize (0); - pages.resize (0); + if (resize (0)) + population = 0; } bool is_empty () const { @@ -278,6 +273,7 @@ struct hb_set_t return false; return true; } + explicit operator bool () const { return !is_empty (); } void dirty () { population = UINT_MAX; } @@ -389,6 +385,11 @@ struct hb_set_t { if (ds <= de) { + // Pre-allocate the workspace that compact() will need so we can bail on allocation failure + // before attempting to rewrite the page map. + hb_vector_t<unsigned> compact_workspace; + if (unlikely (!allocate_compact_workspace (compact_workspace))) return; + unsigned int write_index = 0; for (unsigned int i = 0; i < page_map.length; i++) { @@ -396,11 +397,12 @@ struct hb_set_t if (m < ds || de < m) page_map[write_index++] = page_map[i]; } - compact (write_index); + compact (compact_workspace, write_index); resize (write_index); } } + public: void del_range (hb_codepoint_t a, hb_codepoint_t b) { @@ -512,20 +514,37 @@ struct hb_set_t return true; } - void compact (unsigned int length) + bool allocate_compact_workspace(hb_vector_t<unsigned>& workspace) + { + if (unlikely(!workspace.resize (pages.length))) + { + successful = false; + return false; + } + + return true; + } + + + /* + * workspace should be a pre-sized vector allocated to hold at exactly pages.length + * elements. + */ + void compact (hb_vector_t<unsigned>& workspace, + unsigned int length) { - hb_vector_t<uint32_t> old_index_to_page_map_index; - old_index_to_page_map_index.resize(pages.length); - for (uint32_t i = 0; i < old_index_to_page_map_index.length; i++) - old_index_to_page_map_index[i] = 0xFFFFFFFF; + assert(workspace.length == pages.length); + hb_vector_t<unsigned>& old_index_to_page_map_index = workspace; - for (uint32_t i = 0; i < length; i++) + hb_fill (old_index_to_page_map_index.writer(), 0xFFFFFFFF); + /* TODO(iter) Rewrite as dagger? 
*/ + for (unsigned i = 0; i < length; i++) old_index_to_page_map_index[page_map[i].index] = i; compact_pages (old_index_to_page_map_index); } - void compact_pages (const hb_vector_t<uint32_t>& old_index_to_page_map_index) + void compact_pages (const hb_vector_t<unsigned>& old_index_to_page_map_index) { unsigned int write_index = 0; for (unsigned int i = 0; i < pages.length; i++) @@ -543,6 +562,9 @@ struct hb_set_t template <typename Op> void process (const Op& op, const hb_set_t *other) { + const bool passthru_left = op (1, 0); + const bool passthru_right = op (0, 1); + if (unlikely (!successful)) return; dirty (); @@ -554,11 +576,17 @@ struct hb_set_t unsigned int count = 0, newCount = 0; unsigned int a = 0, b = 0; unsigned int write_index = 0; + + // Pre-allocate the workspace that compact() will need so we can bail on allocation failure + // before attempting to rewrite the page map. + hb_vector_t<unsigned> compact_workspace; + if (!passthru_left && unlikely (!allocate_compact_workspace (compact_workspace))) return; + for (; a < na && b < nb; ) { if (page_map[a].major == other->page_map[b].major) { - if (!Op::passthru_left) + if (!passthru_left) { // Move page_map entries that we're keeping from the left side set // to the front of the page_map vector. This isn't necessary if @@ -575,27 +603,27 @@ struct hb_set_t } else if (page_map[a].major < other->page_map[b].major) { - if (Op::passthru_left) + if (passthru_left) count++; a++; } else { - if (Op::passthru_right) + if (passthru_right) count++; b++; } } - if (Op::passthru_left) + if (passthru_left) count += na - a; - if (Op::passthru_right) + if (passthru_right) count += nb - b; - if (!Op::passthru_left) + if (!passthru_left) { na = write_index; next_page = write_index; - compact (write_index); + compact (compact_workspace, write_index); } if (!resize (count)) @@ -619,7 +647,7 @@ struct hb_set_t else if (page_map[a - 1].major > other->page_map[b - 1].major) { a--; - if (Op::passthru_left) + if (passthru_left) { count--; page_map[count] = page_map[a]; @@ -628,7 +656,7 @@ struct hb_set_t else { b--; - if (Op::passthru_right) + if (passthru_right) { count--; page_map[count].major = other->page_map[b].major; @@ -637,14 +665,14 @@ struct hb_set_t } } } - if (Op::passthru_left) + if (passthru_left) while (a) { a--; count--; page_map[count] = page_map [a]; } - if (Op::passthru_right) + if (passthru_right) while (b) { b--; @@ -655,6 +683,9 @@ struct hb_set_t } assert (!count); if (pages.length > newCount) + // This resize() doesn't need to be checked because we can't get here + // if the set is currently in_error() and this only resizes downwards + // which will always succeed if the set is not in_error(). resize (newCount); } diff --git a/thirdparty/harfbuzz/src/hb-shape-plan.cc b/thirdparty/harfbuzz/src/hb-shape-plan.cc index 65a5fc4512..0d9eaddaa6 100644 --- a/thirdparty/harfbuzz/src/hb-shape-plan.cc +++ b/thirdparty/harfbuzz/src/hb-shape-plan.cc @@ -329,12 +329,12 @@ hb_shape_plan_destroy (hb_shape_plan_t *shape_plan) * @shape_plan: A shaping plan * @key: The user-data key to set * @data: A pointer to the user data - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the given shaping plan. * - * Return value: + * Return value: %true if success, %false otherwise. 
* * Since: 0.9.7 **/ @@ -439,7 +439,7 @@ _hb_shape_plan_execute_internal (hb_shape_plan_t *shape_plan, * Executes the given shaping plan on the specified buffer, using * the given @font and @features. * - * Return value: + * Return value: %true if success, %false otherwise. * * Since: 0.9.7 **/ diff --git a/thirdparty/harfbuzz/src/hb-shape-plan.h b/thirdparty/harfbuzz/src/hb-shape-plan.h index 336524ee2f..fc7c041899 100644 --- a/thirdparty/harfbuzz/src/hb-shape-plan.h +++ b/thirdparty/harfbuzz/src/hb-shape-plan.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif diff --git a/thirdparty/harfbuzz/src/hb-shape.cc b/thirdparty/harfbuzz/src/hb-shape.cc index a3debce397..c442f4403b 100644 --- a/thirdparty/harfbuzz/src/hb-shape.cc +++ b/thirdparty/harfbuzz/src/hb-shape.cc @@ -111,10 +111,10 @@ hb_shape_list_shapers () * hb_shape_full: * @font: an #hb_font_t to use for shaping * @buffer: an #hb_buffer_t to shape - * @features: (array length=num_features) (allow-none): an array of user + * @features: (array length=num_features) (nullable): an array of user * specified #hb_feature_t or %NULL * @num_features: the length of @features array - * @shaper_list: (array zero-terminated=1) (allow-none): a %NULL-terminated + * @shaper_list: (array zero-terminated=1) (nullable): a %NULL-terminated * array of shapers to use or %NULL * * See hb_shape() for details. If @shaper_list is not %NULL, the specified @@ -146,7 +146,7 @@ hb_shape_full (hb_font_t *font, * hb_shape: * @font: an #hb_font_t to use for shaping * @buffer: an #hb_buffer_t to shape - * @features: (array length=num_features) (allow-none): an array of user + * @features: (array length=num_features) (nullable): an array of user * specified #hb_feature_t or %NULL * @num_features: the length of @features array * diff --git a/thirdparty/harfbuzz/src/hb-shape.h b/thirdparty/harfbuzz/src/hb-shape.h index 39507ff744..922f8c011e 100644 --- a/thirdparty/harfbuzz/src/hb-shape.h +++ b/thirdparty/harfbuzz/src/hb-shape.h @@ -26,7 +26,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif diff --git a/thirdparty/harfbuzz/src/hb-style.cc b/thirdparty/harfbuzz/src/hb-style.cc index 86b9f7da5f..2f45d119f9 100644 --- a/thirdparty/harfbuzz/src/hb-style.cc +++ b/thirdparty/harfbuzz/src/hb-style.cc @@ -65,6 +65,7 @@ typedef enum { HB_STYLE_TAG_WIDTH = HB_TAG ('w','d','t','h'), HB_STYLE_TAG_WEIGHT = HB_TAG ('w','g','h','t'), + /*< private >*/ _HB_STYLE_TAG_MAX_VALUE = HB_TAG_MAX_SIGNED /*< skip >*/ } hb_style_tag_t; diff --git a/thirdparty/harfbuzz/src/hb-style.h b/thirdparty/harfbuzz/src/hb-style.h index 1209c79e94..f5776cee58 100644 --- a/thirdparty/harfbuzz/src/hb-style.h +++ b/thirdparty/harfbuzz/src/hb-style.h @@ -22,7 +22,7 @@ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." 
#endif diff --git a/thirdparty/harfbuzz/src/hb-subset-plan.cc b/thirdparty/harfbuzz/src/hb-subset-plan.cc index 24beada3e8..d055783a4d 100644 --- a/thirdparty/harfbuzz/src/hb-subset-plan.cc +++ b/thirdparty/harfbuzz/src/hb-subset-plan.cc @@ -88,10 +88,17 @@ _gsub_closure_glyphs_lookups_features (hb_face_t *face, &lookup_indices); _remap_indexes (&lookup_indices, gsub_lookups); - //closure features + // Collect and prune features hb_set_t feature_indices; - gsub->closure_features (gsub_lookups, &feature_indices); + hb_ot_layout_collect_features (face, + HB_OT_TAG_GSUB, + nullptr, + nullptr, + nullptr, + &feature_indices); + gsub->prune_features (gsub_lookups, &feature_indices); _remap_indexes (&feature_indices, gsub_features); + gsub.destroy (); } @@ -114,9 +121,15 @@ _gpos_closure_lookups_features (hb_face_t *face, &lookup_indices); _remap_indexes (&lookup_indices, gpos_lookups); - //closure features + // Collect and prune features hb_set_t feature_indices; - gpos->closure_features (gpos_lookups, &feature_indices); + hb_ot_layout_collect_features (face, + HB_OT_TAG_GPOS, + nullptr, + nullptr, + nullptr, + &feature_indices); + gpos->prune_features (gpos_lookups, &feature_indices); _remap_indexes (&feature_indices, gpos_features); gpos.destroy (); } @@ -243,7 +256,11 @@ _populate_gids_to_retain (hb_subset_plan_t* plan, #ifndef HB_NO_VAR if (close_over_gdef) - _collect_layout_variation_indices (plan->source, plan->_glyphset, plan->gpos_lookups, plan->layout_variation_indices, plan->layout_variation_idx_map); + _collect_layout_variation_indices (plan->source, + plan->_glyphset_gsub, + plan->gpos_lookups, + plan->layout_variation_indices, + plan->layout_variation_idx_map); #endif #ifndef HB_NO_SUBSET_CFF diff --git a/thirdparty/harfbuzz/src/hb-subset-plan.hh b/thirdparty/harfbuzz/src/hb-subset-plan.hh index e9f603dd1d..cc9cb7a1a2 100644 --- a/thirdparty/harfbuzz/src/hb-subset-plan.hh +++ b/thirdparty/harfbuzz/src/hb-subset-plan.hh @@ -172,12 +172,15 @@ struct hb_subset_plan_t add_table (hb_tag_t tag, hb_blob_t *contents) { - hb_blob_t *source_blob = source->reference_table (tag); - DEBUG_MSG(SUBSET, nullptr, "add table %c%c%c%c, dest %d bytes, source %d bytes", - HB_UNTAG(tag), - hb_blob_get_length (contents), - hb_blob_get_length (source_blob)); - hb_blob_destroy (source_blob); + if (HB_DEBUG_SUBSET) + { + hb_blob_t *source_blob = source->reference_table (tag); + DEBUG_MSG(SUBSET, nullptr, "add table %c%c%c%c, dest %d bytes, source %d bytes", + HB_UNTAG(tag), + hb_blob_get_length (contents), + hb_blob_get_length (source_blob)); + hb_blob_destroy (source_blob); + } return hb_face_builder_add_table (dest, tag, contents); } }; diff --git a/thirdparty/harfbuzz/src/hb-unicode.cc b/thirdparty/harfbuzz/src/hb-unicode.cc index d7f6a6e130..7470bb1b6e 100644 --- a/thirdparty/harfbuzz/src/hb-unicode.cc +++ b/thirdparty/harfbuzz/src/hb-unicode.cc @@ -276,7 +276,7 @@ hb_unicode_funcs_destroy (hb_unicode_funcs_t *ufuncs) * @ufuncs: The Unicode-functions structure * @key: The user-data key * @data: A pointer to the user data - * @destroy: (optional): A callback to call when @data is not needed anymore + * @destroy: (nullable): A callback to call when @data is not needed anymore * @replace: Whether to replace an existing data with the same key * * Attaches a user-data key/data pair to the specified Unicode-functions structure. @@ -340,7 +340,7 @@ hb_unicode_funcs_make_immutable (hb_unicode_funcs_t *ufuncs) * Tests whether the specified Unicode-functions structure * is immutable. 
* - * Return value: %true if @ufuncs is immutable, false otherwise + * Return value: %true if @ufuncs is immutable, %false otherwise * * Since: 0.9.2 **/ @@ -421,7 +421,7 @@ HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE * Calls the composition function of the specified * Unicode-functions structure @ufuncs. * - * Return value: %true if @a and @b composed, false otherwise + * Return value: %true if @a and @b composed, %false otherwise * * Since: 0.9.2 **/ @@ -446,7 +446,7 @@ hb_unicode_compose (hb_unicode_funcs_t *ufuncs, * Calls the decomposition function of the specified * Unicode-functions structure @ufuncs. * - * Return value: %true if @ab was decomposed, false otherwise + * Return value: %true if @ab was decomposed, %false otherwise * * Since: 0.9.2 **/ @@ -469,7 +469,7 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, * Fetches the compatibility decomposition of a Unicode * code point. Deprecated. * - * Return value: + * Return value: length of @decomposed. * * Since: 0.9.2 * Deprecated: 2.0.0 diff --git a/thirdparty/harfbuzz/src/hb-unicode.h b/thirdparty/harfbuzz/src/hb-unicode.h index 7ea0848c0f..c04ee15a09 100644 --- a/thirdparty/harfbuzz/src/hb-unicode.h +++ b/thirdparty/harfbuzz/src/hb-unicode.h @@ -28,7 +28,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -41,7 +41,9 @@ HB_BEGIN_DECLS /** - * HB_UNICODE_MAX + * HB_UNICODE_MAX: + * + * Maximum valid Unicode code point. * * Since: 1.9.0 **/ @@ -427,7 +429,7 @@ typedef hb_script_t (*hb_unicode_script_func_t) (hb_unicode_funcs_t *ufuncs, * The method must return an #hb_bool_t indicating the success * of the composition. * - * Return value: True is @a,@b composed, false otherwise + * Return value: %true is @a,@b composed, %false otherwise * **/ typedef hb_bool_t (*hb_unicode_compose_func_t) (hb_unicode_funcs_t *ufuncs, @@ -451,7 +453,7 @@ typedef hb_bool_t (*hb_unicode_compose_func_t) (hb_unicode_funcs_t *ufuncs, * output parameters (if successful). The method must return an * #hb_bool_t indicating the success of the composition. * - * Return value: True if @ab decomposed, false otherwise + * Return value: %true if @ab decomposed, %false otherwise * **/ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs, @@ -467,7 +469,7 @@ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs, * @ufuncs: A Unicode-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_unicode_combining_class_func_t. * @@ -483,7 +485,7 @@ hb_unicode_funcs_set_combining_class_func (hb_unicode_funcs_t *ufuncs, * @ufuncs: A Unicode-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_unicode_general_category_func_t. 
* @@ -499,7 +501,7 @@ hb_unicode_funcs_set_general_category_func (hb_unicode_funcs_t *ufuncs, * @ufuncs: A Unicode-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_unicode_mirroring_func_t. * @@ -515,7 +517,7 @@ hb_unicode_funcs_set_mirroring_func (hb_unicode_funcs_t *ufuncs, * @ufuncs: A Unicode-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_unicode_script_func_t. * @@ -531,7 +533,7 @@ hb_unicode_funcs_set_script_func (hb_unicode_funcs_t *ufuncs, * @ufuncs: A Unicode-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_unicode_compose_func_t. * @@ -547,7 +549,7 @@ hb_unicode_funcs_set_compose_func (hb_unicode_funcs_t *ufuncs, * @ufuncs: A Unicode-functions structure * @func: (closure user_data) (destroy destroy) (scope notified): The callback function to assign * @user_data: Data to pass to @func - * @destroy: (optional): The function to call when @user_data is not needed anymore + * @destroy: (nullable): The function to call when @user_data is not needed anymore * * Sets the implementation function for #hb_unicode_decompose_func_t. * @@ -624,40 +626,12 @@ HB_EXTERN hb_script_t hb_unicode_script (hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode); -/** - * hb_unicode_compose: - * @ufuncs: The Unicode-functions structure - * @a: The first code point to compose - * @b: The second code point to compose - * @ab: (out): The composed code point - * - * Composes the code point sequence @a,@b by canonical equivalence into - * code point @ab. - * - * Return value: True is @a,@b composed, false otherwise - * - * Since: 0.9.2 - **/ HB_EXTERN hb_bool_t hb_unicode_compose (hb_unicode_funcs_t *ufuncs, hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab); -/** - * hb_unicode_decompose: - * @ufuncs: The Unicode-functions structure - * @ab: The code point to decompose - * @a: (out): The first decomposed code point - * @b: (out): The second decomposed code point - * - * Decomposes code point @ab by canonical equivalence, into code points - * @a and @b. - * - * Return value: True if @ab decomposed, false otherwise - * - * Since: 0.9.2 - **/ HB_EXTERN hb_bool_t hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, hb_codepoint_t ab, diff --git a/thirdparty/harfbuzz/src/hb-vector.hh b/thirdparty/harfbuzz/src/hb-vector.hh index 079b94a6b4..13517a9c29 100644 --- a/thirdparty/harfbuzz/src/hb-vector.hh +++ b/thirdparty/harfbuzz/src/hb-vector.hh @@ -80,7 +80,12 @@ struct hb_vector_t fini (); } - void reset () { resize (0); } + void reset () + { + if (unlikely (in_error ())) + allocated = length; // Big hack! 
+ resize (0); + } hb_vector_t& operator = (const hb_vector_t &o) { @@ -181,7 +186,7 @@ struct hb_vector_t /* Allocate for size but don't adjust length. */ bool alloc (unsigned int size) { - if (unlikely (allocated < 0)) + if (unlikely (in_error ())) return false; if (likely (size <= (unsigned) allocated)) @@ -195,7 +200,7 @@ struct hb_vector_t Type *new_array = nullptr; bool overflows = - (int) new_allocated < 0 || + (int) in_error () || (new_allocated < (unsigned) allocated) || hb_unsigned_mul_overflows (new_allocated, sizeof (Type)); if (likely (!overflows)) diff --git a/thirdparty/harfbuzz/src/hb-version.h b/thirdparty/harfbuzz/src/hb-version.h index da377b9df6..6db58c3f7c 100644 --- a/thirdparty/harfbuzz/src/hb-version.h +++ b/thirdparty/harfbuzz/src/hb-version.h @@ -24,7 +24,7 @@ * Google Author(s): Behdad Esfahbod */ -#ifndef HB_H_IN +#if !defined(HB_H_IN) && !defined(HB_NO_SINGLE_HEADER_ERROR) #error "Include <hb.h> instead." #endif @@ -36,12 +36,41 @@ HB_BEGIN_DECLS +/** + * HB_VERSION_MAJOR: + * + * The major component of the library version available at compile-time. + */ #define HB_VERSION_MAJOR 2 -#define HB_VERSION_MINOR 7 -#define HB_VERSION_MICRO 4 +/** + * HB_VERSION_MINOR: + * + * The minor component of the library version available at compile-time. + */ +#define HB_VERSION_MINOR 8 +/** + * HB_VERSION_MICRO: + * + * The micro component of the library version available at compile-time. + */ +#define HB_VERSION_MICRO 0 -#define HB_VERSION_STRING "2.7.4" +/** + * HB_VERSION_STRING: + * + * A string literal containing the library version available at compile-time. + */ +#define HB_VERSION_STRING "2.8.0" +/** + * HB_VERSION_ATLEAST: + * @major: the major component of the version number + * @minor: the minor component of the version number + * @micro: the micro component of the version number + * + * Tests the library version at compile-time against a minimum value, + * as three integer components. + */ #define HB_VERSION_ATLEAST(major,minor,micro) \ ((major)*10000+(minor)*100+(micro) <= \ HB_VERSION_MAJOR*10000+HB_VERSION_MINOR*100+HB_VERSION_MICRO) diff --git a/thirdparty/harfbuzz/src/hb.hh b/thirdparty/harfbuzz/src/hb.hh index 274a0e98db..18516581c7 100644 --- a/thirdparty/harfbuzz/src/hb.hh +++ b/thirdparty/harfbuzz/src/hb.hh @@ -62,7 +62,6 @@ /* Error. Should never happen. 
*/ #ifndef HB_NO_PRAGMA_GCC_DIAGNOSTIC_ERROR -#pragma GCC diagnostic error "-Wc++11-narrowing" #pragma GCC diagnostic error "-Wcast-align" #pragma GCC diagnostic error "-Wcast-function-type" #pragma GCC diagnostic error "-Wdelete-non-virtual-dtor" @@ -75,6 +74,7 @@ #pragma GCC diagnostic error "-Wmissing-braces" #pragma GCC diagnostic error "-Wmissing-declarations" #pragma GCC diagnostic error "-Wmissing-prototypes" +#pragma GCC diagnostic error "-Wnarrowing" #pragma GCC diagnostic error "-Wnested-externs" #pragma GCC diagnostic error "-Wold-style-definition" #pragma GCC diagnostic error "-Wpointer-arith" @@ -126,6 +126,7 @@ #pragma GCC diagnostic ignored "-Wformat-zero-length" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #pragma GCC diagnostic ignored "-Wpacked" // Erratic impl in clang +#pragma GCC diagnostic ignored "-Wrange-loop-analysis" // https://github.com/harfbuzz/harfbuzz/issues/2834 #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wtype-limits" #pragma GCC diagnostic ignored "-Wc++11-compat" // only gcc raises it @@ -175,15 +176,15 @@ #include "hb-aat.h" #define HB_AAT_H_IN -#include <limits.h> -#include <math.h> -#include <float.h> -#include <stdlib.h> -#include <stddef.h> -#include <string.h> -#include <assert.h> -#include <stdio.h> -#include <stdarg.h> +#include <cassert> +#include <cfloat> +#include <climits> +#include <cmath> +#include <cstdarg> +#include <cstddef> +#include <cstdio> +#include <cstdlib> +#include <cstring> #if (defined(_MSC_VER) && _MSC_VER >= 1500) || defined(__MINGW32__) #ifdef __MINGW32_VERSION @@ -244,12 +245,8 @@ extern "C" void hb_free_impl(void *ptr); #endif #if defined(__GNUC__) && (__GNUC__ >= 3) -#define HB_PURE_FUNC __attribute__((pure)) -#define HB_CONST_FUNC __attribute__((const)) #define HB_PRINTF_FUNC(format_idx, arg_idx) __attribute__((__format__ (__printf__, format_idx, arg_idx))) #else -#define HB_PURE_FUNC -#define HB_CONST_FUNC #define HB_PRINTF_FUNC(format_idx, arg_idx) #endif #if defined(__GNUC__) && (__GNUC__ >= 4) || (__clang__) @@ -394,7 +391,7 @@ extern "C" void hb_free_impl(void *ptr); #endif #ifndef HB_NO_ERRNO -# include <errno.h> +# include <cerrno> #else static int HB_UNUSED _hb_errno = 0; # undef errno @@ -440,181 +437,12 @@ static int HB_UNUSED _hb_errno = 0; #define HB_STMT_START do #define HB_STMT_END while (0) -/* Static-assert as expression. */ -template <unsigned int cond> class hb_assert_constant_t; -template <> class hb_assert_constant_t<1> {}; -#define ASSERT_STATIC_EXPR_ZERO(_cond) (0 * (unsigned int) sizeof (hb_assert_constant_t<_cond>)) - /* Lets assert int types. Saves trouble down the road. 
*/ -static_assert ((sizeof (int8_t) == 1), ""); -static_assert ((sizeof (uint8_t) == 1), ""); -static_assert ((sizeof (int16_t) == 2), ""); -static_assert ((sizeof (uint16_t) == 2), ""); -static_assert ((sizeof (int32_t) == 4), ""); -static_assert ((sizeof (uint32_t) == 4), ""); -static_assert ((sizeof (int64_t) == 8), ""); -static_assert ((sizeof (uint64_t) == 8), ""); static_assert ((sizeof (hb_codepoint_t) == 4), ""); static_assert ((sizeof (hb_position_t) == 4), ""); static_assert ((sizeof (hb_mask_t) == 4), ""); static_assert ((sizeof (hb_var_int_t) == 4), ""); -#define HB_DELETE_COPY_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ - void operator=(const TypeName&) = delete -#define HB_DELETE_CREATE_COPY_ASSIGN(TypeName) \ - TypeName() = delete; \ - TypeName(const TypeName&) = delete; \ - void operator=(const TypeName&) = delete - - -/* Flags */ - -/* Enable bitwise ops on enums marked as flags_t */ -/* To my surprise, looks like the function resolver is happy to silently cast - * one enum to another... So this doesn't provide the type-checking that I - * originally had in mind... :(. - * - * For MSVC warnings, see: https://github.com/harfbuzz/harfbuzz/pull/163 - */ -#ifdef _MSC_VER -# pragma warning(disable:4200) -# pragma warning(disable:4800) -#endif -#define HB_MARK_AS_FLAG_T(T) \ - extern "C++" { \ - static inline T operator | (T l, T r) { return T ((unsigned) l | (unsigned) r); } \ - static inline T operator & (T l, T r) { return T ((unsigned) l & (unsigned) r); } \ - static inline T operator ^ (T l, T r) { return T ((unsigned) l ^ (unsigned) r); } \ - static inline T operator ~ (T r) { return T (~(unsigned int) r); } \ - static inline T& operator |= (T &l, T r) { l = l | r; return l; } \ - static inline T& operator &= (T& l, T r) { l = l & r; return l; } \ - static inline T& operator ^= (T& l, T r) { l = l ^ r; return l; } \ - } \ - static_assert (true, "") - -/* Useful for set-operations on small enums. - * For example, for testing "x ∈ {x1, x2, x3}" use: - * (FLAG_UNSAFE(x) & (FLAG(x1) | FLAG(x2) | FLAG(x3))) - */ -#define FLAG(x) (ASSERT_STATIC_EXPR_ZERO ((unsigned)(x) < 32) + (((uint32_t) 1U) << (unsigned)(x))) -#define FLAG_UNSAFE(x) ((unsigned)(x) < 32 ? (((uint32_t) 1U) << (unsigned)(x)) : 0) -#define FLAG_RANGE(x,y) (ASSERT_STATIC_EXPR_ZERO ((x) < (y)) + FLAG(y+1) - FLAG(x)) -#define FLAG64(x) (ASSERT_STATIC_EXPR_ZERO ((unsigned)(x) < 64) + (((uint64_t) 1ULL) << (unsigned)(x))) -#define FLAG64_UNSAFE(x) ((unsigned)(x) < 64 ? (((uint64_t) 1ULL) << (unsigned)(x)) : 0) - - -/* Size signifying variable-sized array */ -#ifndef HB_VAR_ARRAY -#define HB_VAR_ARRAY 1 -#endif - -static inline float -_hb_roundf (float x) { return floorf (x + .5f); } -#define roundf(x) _hb_roundf(x) - -/* Endian swap, used in Windows related backends */ -static inline uint16_t hb_uint16_swap (const uint16_t v) -{ return (v >> 8) | (v << 8); } -static inline uint32_t hb_uint32_swap (const uint32_t v) -{ return (hb_uint16_swap (v) << 16) | hb_uint16_swap (v >> 16); } - -/* - * Big-endian integers. Here because fundamental. 
- */ - -template <typename Type, int Bytes> struct BEInt; - -template <typename Type> -struct BEInt<Type, 1> -{ - public: - BEInt<Type, 1>& operator = (Type V) - { - v = V; - return *this; - } - operator Type () const { return v; } - private: uint8_t v; -}; -template <typename Type> -struct BEInt<Type, 2> -{ - public: - BEInt<Type, 2>& operator = (Type V) - { - v[0] = (V >> 8) & 0xFF; - v[1] = (V ) & 0xFF; - return *this; - } - operator Type () const - { -#if ((defined(__GNUC__) && __GNUC__ >= 5) || defined(__clang__)) && \ - defined(__BYTE_ORDER) && \ - (__BYTE_ORDER == __LITTLE_ENDIAN || __BYTE_ORDER == __BIG_ENDIAN) - /* Spoon-feed the compiler a big-endian integer with alignment 1. - * https://github.com/harfbuzz/harfbuzz/pull/1398 */ - struct __attribute__((packed)) packed_uint16_t { uint16_t v; }; -#if __BYTE_ORDER == __LITTLE_ENDIAN - return __builtin_bswap16 (((packed_uint16_t *) this)->v); -#else /* __BYTE_ORDER == __BIG_ENDIAN */ - return ((packed_uint16_t *) this)->v; -#endif -#endif - return (v[0] << 8) - + (v[1] ); - } - private: uint8_t v[2]; -}; -template <typename Type> -struct BEInt<Type, 3> -{ - public: - BEInt<Type, 3>& operator = (Type V) - { - v[0] = (V >> 16) & 0xFF; - v[1] = (V >> 8) & 0xFF; - v[2] = (V ) & 0xFF; - return *this; - } - operator Type () const - { - return (v[0] << 16) - + (v[1] << 8) - + (v[2] ); - } - private: uint8_t v[3]; -}; -template <typename Type> -struct BEInt<Type, 4> -{ - public: - BEInt<Type, 4>& operator = (Type V) - { - v[0] = (V >> 24) & 0xFF; - v[1] = (V >> 16) & 0xFF; - v[2] = (V >> 8) & 0xFF; - v[3] = (V ) & 0xFF; - return *this; - } - operator Type () const - { - return (v[0] << 24) - + (v[1] << 16) - + (v[2] << 8) - + (v[3] ); - } - private: uint8_t v[4]; -}; - - -/* - * For lack of a better place, put Zawgyi script hack here. - * https://github.com/harfbuzz/harfbuzz/issues/1162 - */ - -#define HB_SCRIPT_MYANMAR_ZAWGYI ((hb_script_t) HB_TAG ('Q','a','a','g')) - /* Headers we include for everyone. Keep topologically sorted by dependency. * They express dependency amongst themselves, but no other file should include diff --git a/thirdparty/icu4c/APIChangeReport.md b/thirdparty/icu4c/APIChangeReport.md deleted file mode 100644 index 5385904fd1..0000000000 --- a/thirdparty/icu4c/APIChangeReport.md +++ /dev/null @@ -1,396 +0,0 @@ - - -<!-- - Copyright © 2019 and later: Unicode, Inc. and others. - License & terms of use: http://www.unicode.org/copyright.html ---> - -# ICU4C API Comparison: ICU 67 with ICU 68 - -> _Note_ Markdown format of this document is new for ICU 65. 
- -- [Removed from ICU 67](#removed) -- [Deprecated or Obsoleted in ICU 68](#deprecated) -- [Changed in ICU 68](#changed) -- [Promoted to stable in ICU 68](#promoted) -- [Added in ICU 68](#added) -- [Other existing drafts in ICU 68](#other) -- [Signature Simplifications](#simplifications) - -## Removed - -Removed from ICU 67 - -| File | API | ICU 67 | ICU 68 | -|---|---|---|---| -| fmtable.h | const UFormattable* icu::Formattable::toUFormattable() | StableICU 52 | (missing) -| measunit.h | LocalArray<MeasureUnit> icu::MeasureUnit::splitToSingleUnits(int32_t&, UErrorCode&) const | InternalICU 67 | (missing) -| measunit.h | int32_t icu::MeasureUnit::getIndex() const | Internal | (missing) -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::resolveUnitPerUnit(const MeasureUnit&, const MeasureUnit&, bool*) | Internal | (missing) -| measunit.h | <tt>static</tt> int32_t icu::MeasureUnit::getIndexCount() | Internal | (missing) -| measunit.h | <tt>static</tt> int32_t icu::MeasureUnit::internalGetIndexForTypeAndSubtype(const char*, const char*) | Internal | (missing) -| nounit.h | UClassID icu::NoUnit::getDynamicClassID() const | DraftICU 60 | (missing) -| nounit.h | icu::NoUnit::NoUnit(const NoUnit&) | DraftICU 60 | (missing) -| nounit.h | icu::NoUnit::~NoUnit() | DraftICU 60 | (missing) -| nounit.h | <tt>static</tt> NoUnit icu::NoUnit::base() | DraftICU 60 | (missing) -| nounit.h | <tt>static</tt> NoUnit icu::NoUnit::percent() | DraftICU 60 | (missing) -| nounit.h | <tt>static</tt> NoUnit icu::NoUnit::permille() | DraftICU 60 | (missing) -| nounit.h | <tt>static</tt> UClassID icu::NoUnit::getStaticClassID() | DraftICU 60 | (missing) -| nounit.h | void* icu::NoUnit::clone() const | DraftICU 60 | (missing) -| uniset.h | const USet* icu::UnicodeSet::toUSet() | StableICU 4.2 | (missing) - -## Deprecated - -Deprecated or Obsoleted in ICU 68 - -| File | API | ICU 67 | ICU 68 | -|---|---|---|---| -| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getFirstDecimal(UErrorCode&) const | DraftICU 63 | DeprecatedICU 68 -| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getSecondDecimal(UErrorCode&) const | DraftICU 63 | DeprecatedICU 68 -| umachine.h | <tt>#define</tt> FALSE | StableICU 2.0 | DeprecatedICU 68 -| umachine.h | <tt>#define</tt> TRUE | StableICU 2.0 | DeprecatedICU 68 - -## Changed - -Changed in ICU 68 (old, new) - - - -| File | API | ICU 67 | ICU 68 | -|---|---|---|---| -| bytestrie.h | BytesTrie& icu::BytesTrie::resetToState64(uint64_t) | Draft→StableICU 65 -| bytestrie.h | uint64_t icu::BytesTrie::getState64() const | Draft→StableICU 65 -| listformatter.h | <tt>static</tt> ListFormatter* icu::ListFormatter::createInstance(const Locale&, UListFormatterType, UListFormatterWidth, UErrorCode&) | Draft→StableICU 67 -| localebuilder.h | UBool icu::LocaleBuilder::copyErrorTo(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::addSupportedLocale(const Locale&) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::operator=(Builder&&) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setDefaultLocale(const Locale*) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag) | Draft→StableICU 65 -| localematcher.h | Builder& 
icu::LocaleMatcher::Builder::setSupportedLocales(Iter, Iter) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator&) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocalesFromListString(StringPiece) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocalesViaConverter(Iter, Iter, Conv) | Draft→StableICU 65 -| localematcher.h | Locale icu::LocaleMatcher::Result::makeResolvedLocale(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | LocaleMatcher icu::LocaleMatcher::Builder::build(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | LocaleMatcher& icu::LocaleMatcher::operator=(LocaleMatcher&&) | Draft→StableICU 65 -| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(Locale::Iterator&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(const Locale&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | Result& icu::LocaleMatcher::Result::operator=(Result&&) | Draft→StableICU 65 -| localematcher.h | UBool icu::LocaleMatcher::Builder::copyErrorTo(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::Result::getDesiredLocale() const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::Result::getSupportedLocale() const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(Locale::Iterator&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(const Locale&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatchForListString(StringPiece, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_NONE | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_REGION | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_LANGUAGE | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_SCRIPT | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Builder::Builder() | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Builder::Builder(Builder&&) | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Builder::~Builder() | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::LocaleMatcher(LocaleMatcher&&) | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Result::Result(Result&&) | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Result::~Result() | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::~LocaleMatcher() | Draft→StableICU 65 -| localematcher.h | int32_t icu::LocaleMatcher::Result::getDesiredIndex() const | Draft→StableICU 65 -| localematcher.h | int32_t icu::LocaleMatcher::Result::getSupportedIndex() const | Draft→StableICU 65 -| locid.h | UBool icu::Locale::ConvertingIterator< Iter, Conv >::hasNext() const override | Draft→StableICU 65 -| locid.h | UBool icu::Locale::Iterator::hasNext() const | Draft→StableICU 65 -| locid.h | UBool icu::Locale::RangeIterator< Iter >::hasNext() const override | Draft→StableICU 65 -| locid.h | const Locale& icu::Locale::ConvertingIterator< Iter, Conv >::next() override | Draft→StableICU 65 -| locid.h | const Locale& icu::Locale::Iterator::next() | Draft→StableICU 65 -| locid.h | const Locale& 
icu::Locale::RangeIterator< Iter >::next() override | Draft→StableICU 65 -| locid.h | icu::Locale::ConvertingIterator< Iter, Conv >::ConvertingIterator(Iter, Iter, Conv) | Draft→StableICU 65 -| locid.h | icu::Locale::Iterator::~Iterator() | Draft→StableICU 65 -| locid.h | icu::Locale::RangeIterator< Iter >::RangeIterator(Iter, Iter) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getBar() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDecade() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerCentimeter() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerInch() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getEm() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getMegapixel() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPascal() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixel() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerCentimeter() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerInch() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getThermUs() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createBar(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDecade(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerCentimeter(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerInch(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createEm(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createMegapixel(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPascal(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixel(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerCentimeter(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerInch(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createThermUs(UErrorCode&) | Draft→StableICU 65 -| numberformatter.h | StringClass icu::number::FormattedNumber::toDecimalNumber(UErrorCode&) const | Draft→StableICU 65 -| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getFirstDecimal(UErrorCode&) const | DraftICU 63 | DeprecatedICU 68 -| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getSecondDecimal(UErrorCode&) const | DraftICU 63 | DeprecatedICU 68 -| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_HOUR | Draft→StableICU 65 -| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_MINUTE | Draft→StableICU 65 -| stringpiece.h | icu::StringPiece::StringPiece(T) | Draft→StableICU 65 -| ucal.h | int32_t ucal_getHostTimeZone(UChar*, int32_t, UErrorCode*) | Draft→StableICU 65 -| ucharstrie.h | UCharsTrie& icu::UCharsTrie::resetToState64(uint64_t) | Draft→StableICU 65 -| ucharstrie.h | uint64_t 
icu::UCharsTrie::getState64() const | Draft→StableICU 65 -| ulistformatter.h | UListFormatter* ulistfmt_openForType(const char*, UListFormatterType, UListFormatterWidth, UErrorCode*) | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_AND | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_OR | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_UNITS | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_NARROW | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_SHORT | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_WIDE | Draft→StableICU 67 -| uloc.h | UEnumeration* uloc_openAvailableByType(ULocAvailableType, UErrorCode*) | Draft→StableICU 65 -| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_DEFAULT | Draft→StableICU 65 -| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_ONLY_LEGACY_ALIASES | Draft→StableICU 65 -| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_WITH_LEGACY_ALIASES | Draft→StableICU 65 -| umachine.h | <tt>#define</tt> FALSE | StableICU 2.0 | DeprecatedICU 68 -| umachine.h | <tt>#define</tt> TRUE | StableICU 2.0 | DeprecatedICU 68 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_BUNDLE | Draft→StableICU 65 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_DATA_FILE | Draft→StableICU 65 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_RES_FILE | Draft→StableICU 65 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_START | Draft→StableICU 65 - -## Promoted - -Promoted to stable in ICU 68 - -| File | API | ICU 67 | ICU 68 | -|---|---|---|---| -| bytestrie.h | BytesTrie& icu::BytesTrie::resetToState64(uint64_t) | Draft→StableICU 65 -| bytestrie.h | uint64_t icu::BytesTrie::getState64() const | Draft→StableICU 65 -| fmtable.h | UFormattable* icu::Formattable::toUFormattable() | (missing) | StableICU 52 -| listformatter.h | <tt>static</tt> ListFormatter* icu::ListFormatter::createInstance(const Locale&, UListFormatterType, UListFormatterWidth, UErrorCode&) | Draft→StableICU 67 -| localebuilder.h | UBool icu::LocaleBuilder::copyErrorTo(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::addSupportedLocale(const Locale&) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::operator=(Builder&&) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setDefaultLocale(const Locale*) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocales(Iter, Iter) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator&) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocalesFromListString(StringPiece) | Draft→StableICU 65 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setSupportedLocalesViaConverter(Iter, Iter, Conv) | Draft→StableICU 65 -| localematcher.h | Locale icu::LocaleMatcher::Result::makeResolvedLocale(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | LocaleMatcher 
icu::LocaleMatcher::Builder::build(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | LocaleMatcher& icu::LocaleMatcher::operator=(LocaleMatcher&&) | Draft→StableICU 65 -| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(Locale::Iterator&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(const Locale&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | Result& icu::LocaleMatcher::Result::operator=(Result&&) | Draft→StableICU 65 -| localematcher.h | UBool icu::LocaleMatcher::Builder::copyErrorTo(UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::Result::getDesiredLocale() const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::Result::getSupportedLocale() const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(Locale::Iterator&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(const Locale&, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatchForListString(StringPiece, UErrorCode&) const | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_NONE | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_REGION | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_LANGUAGE | Draft→StableICU 65 -| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_SCRIPT | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Builder::Builder() | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Builder::Builder(Builder&&) | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Builder::~Builder() | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::LocaleMatcher(LocaleMatcher&&) | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Result::Result(Result&&) | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::Result::~Result() | Draft→StableICU 65 -| localematcher.h | icu::LocaleMatcher::~LocaleMatcher() | Draft→StableICU 65 -| localematcher.h | int32_t icu::LocaleMatcher::Result::getDesiredIndex() const | Draft→StableICU 65 -| localematcher.h | int32_t icu::LocaleMatcher::Result::getSupportedIndex() const | Draft→StableICU 65 -| locid.h | UBool icu::Locale::ConvertingIterator< Iter, Conv >::hasNext() const override | Draft→StableICU 65 -| locid.h | UBool icu::Locale::Iterator::hasNext() const | Draft→StableICU 65 -| locid.h | UBool icu::Locale::RangeIterator< Iter >::hasNext() const override | Draft→StableICU 65 -| locid.h | const Locale& icu::Locale::ConvertingIterator< Iter, Conv >::next() override | Draft→StableICU 65 -| locid.h | const Locale& icu::Locale::Iterator::next() | Draft→StableICU 65 -| locid.h | const Locale& icu::Locale::RangeIterator< Iter >::next() override | Draft→StableICU 65 -| locid.h | icu::Locale::ConvertingIterator< Iter, Conv >::ConvertingIterator(Iter, Iter, Conv) | Draft→StableICU 65 -| locid.h | icu::Locale::Iterator::~Iterator() | Draft→StableICU 65 -| locid.h | icu::Locale::RangeIterator< Iter >::RangeIterator(Iter, Iter) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getBar() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDecade() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit 
icu::MeasureUnit::getDotPerCentimeter() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerInch() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getEm() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getMegapixel() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPascal() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixel() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerCentimeter() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerInch() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getThermUs() | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createBar(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDecade(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerCentimeter(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerInch(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createEm(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createMegapixel(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPascal(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixel(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerCentimeter(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerInch(UErrorCode&) | Draft→StableICU 65 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createThermUs(UErrorCode&) | Draft→StableICU 65 -| numberformatter.h | StringClass icu::number::FormattedNumber::toDecimalNumber(UErrorCode&) const | Draft→StableICU 65 -| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_HOUR | Draft→StableICU 65 -| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_MINUTE | Draft→StableICU 65 -| stringpiece.h | icu::StringPiece::StringPiece(T) | Draft→StableICU 65 -| ucal.h | int32_t ucal_getHostTimeZone(UChar*, int32_t, UErrorCode*) | Draft→StableICU 65 -| ucharstrie.h | UCharsTrie& icu::UCharsTrie::resetToState64(uint64_t) | Draft→StableICU 65 -| ucharstrie.h | uint64_t icu::UCharsTrie::getState64() const | Draft→StableICU 65 -| ulistformatter.h | UListFormatter* ulistfmt_openForType(const char*, UListFormatterType, UListFormatterWidth, UErrorCode*) | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_AND | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_OR | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_UNITS | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_NARROW | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_SHORT | Draft→StableICU 67 -| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_WIDE | Draft→StableICU 67 -| uloc.h | UEnumeration* uloc_openAvailableByType(ULocAvailableType, UErrorCode*) | 
Draft→StableICU 65 -| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_DEFAULT | Draft→StableICU 65 -| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_ONLY_LEGACY_ALIASES | Draft→StableICU 65 -| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_WITH_LEGACY_ALIASES | Draft→StableICU 65 -| uniset.h | USet* icu::UnicodeSet::toUSet() | (missing) | StableICU 4.2 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_BUNDLE | Draft→StableICU 65 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_DATA_FILE | Draft→StableICU 65 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_RES_FILE | Draft→StableICU 65 -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_START | Draft→StableICU 65 - -## Added - -Added in ICU 68 - -| File | API | ICU 67 | ICU 68 | -|---|---|---|---| -| dtitvfmt.h | UDisplayContext icu::DateIntervalFormat::getContext(UDisplayContextType, UErrorCode&) const | (missing) | DraftICU 68 -| dtitvfmt.h | void icu::DateIntervalFormat::setContext(UDisplayContext, UErrorCode&) | (missing) | DraftICU 68 -| dtptngen.h | <tt>static</tt> DateTimePatternGenerator* icu::DateTimePatternGenerator::createInstanceNoStdPat(const Locale&, UErrorCode&) | (missing) | Internal -| fmtable.h | UFormattable* icu::Formattable::toUFormattable() | (missing) | StableICU 52 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setMaxDistance(const Locale&, const Locale&) | (missing) | DraftICU 68 -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setNoDefaultLocale() | (missing) | DraftICU 68 -| localematcher.h | UBool icu::LocaleMatcher::isMatch(const Locale&, const Locale&, UErrorCode&) const | (missing) | DraftICU 68 -| measunit.h | int32_t icu::MeasureUnit::getOffset() const | (missing) | Internal -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getCandela() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDessertSpoon() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDessertSpoonImperial() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDot() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDram() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDrop() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getEarthRadius() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getGrain() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getJigger() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getLumen() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPinch() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getQuartImperial() | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createCandela(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDessertSpoon(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDessertSpoonImperial(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDot(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* 
icu::MeasureUnit::createDram(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDrop(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createEarthRadius(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createGrain(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createJigger(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createLumen(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPinch(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createQuartImperial(UErrorCode&) | (missing) | DraftICU 68 -| measunit.h | std::pair< LocalArray< MeasureUnit >, int32_t > icu::MeasureUnit::splitToSingleUnits(UErrorCode&) const | (missing) | DraftICU 68 -| numberformatter.h | Derived icu::number::NumberFormatterSettings< Derived >::usage(StringPiece) const& | (missing) | DraftICU 68 -| numberformatter.h | Derived icu::number::NumberFormatterSettings< Derived >::usage(StringPiece)&& | (missing) | DraftICU 68 -| numberformatter.h | MeasureUnit icu::number::FormattedNumber::getOutputUnit(UErrorCode&) const | (missing) | DraftICU 68 -| numberformatter.h | Usage& icu::number::impl::Usage::operator=(Usage&&) | (missing) | Internal -| numberformatter.h | Usage& icu::number::impl::Usage::operator=(const Usage&) | (missing) | Internal -| numberformatter.h | bool icu::number::impl::Usage::isSet() const | (missing) | Internal -| numberformatter.h | icu::number::impl::Usage::Usage(Usage&&) | (missing) | Internal -| numberformatter.h | icu::number::impl::Usage::Usage(const Usage&) | (missing) | Internal -| numberformatter.h | icu::number::impl::Usage::~Usage() | (missing) | Internal -| numberformatter.h | int16_t icu::number::impl::Usage::length() const | (missing) | Internal -| numberformatter.h | void icu::number::impl::Usage::set(StringPiece) | (missing) | Internal -| numberrangeformatter.h | std::pair< StringClass, StringClass > icu::number::FormattedNumberRange::getDecimalNumbers(UErrorCode&) const | (missing) | DraftICU 68 -| plurrule.h | UnicodeString icu::PluralRules::select(const number::FormattedNumberRange&, UErrorCode&) const | (missing) | DraftICU 68 -| plurrule.h | UnicodeString icu::PluralRules::select(const number::impl::UFormattedNumberRangeData*, UErrorCode&) const | (missing) | Internal -| plurrule.h | int32_t icu::PluralRules::getSamples(const UnicodeString&, FixedDecimal*, int32_t, UErrorCode&) | (missing) | Internal -| timezone.h | <tt>static</tt> TimeZone* icu::TimeZone::forLocaleOrDefault(const Locale&) | (missing) | Internal -| ucurr.h | <tt>enum</tt> UCurrNameStyle::UCURR_FORMAL_SYMBOL_NAME | (missing) | DraftICU 68 -| ucurr.h | <tt>enum</tt> UCurrNameStyle::UCURR_VARIANT_SYMBOL_NAME | (missing) | DraftICU 68 -| udateintervalformat.h | UDisplayContext udtitvfmt_getContext(const UDateIntervalFormat*, UDisplayContextType, UErrorCode*) | (missing) | DraftICU 68 -| udateintervalformat.h | void udtitvfmt_setContext(UDateIntervalFormat*, UDisplayContext, UErrorCode*) | (missing) | DraftICU 68 -| umachine.h | <tt>#define</tt> U_DEFINE_FALSE_AND_TRUE | (missing) | InternalICU 68 -| uniset.h | USet* icu::UnicodeSet::toUSet() | (missing) | StableICU 4.2 -| unum.h | <tt>enum</tt> 
UNumberFormatMinimumGroupingDigits::UNUM_MINIMUM_GROUPING_DIGITS_AUTO | (missing) | DraftICU 68 -| unum.h | <tt>enum</tt> UNumberFormatMinimumGroupingDigits::UNUM_MINIMUM_GROUPING_DIGITS_MIN2 | (missing) | DraftICU 68 -| unumberformatter.h | <tt>enum</tt> UNumberUnitWidth::UNUM_UNIT_WIDTH_FORMAL | (missing) | DraftICU 68 -| unumberformatter.h | <tt>enum</tt> UNumberUnitWidth::UNUM_UNIT_WIDTH_VARIANT | (missing) | DraftICU 68 -| unumberformatter.h | int32_t unumf_resultToDecimalNumber(const UFormattedNumber*, char*, int32_t, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | UFormattedNumberRange* unumrf_openResult(UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | UNumberRangeFormatter* unumrf_openForSkeletonWithCollapseAndIdentityFallback(const UChar*, int32_t, UNumberRangeCollapse, UNumberRangeIdentityFallback, const char*, UParseError*, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | UNumberRangeIdentityResult unumrf_resultGetIdentityResult(const UFormattedNumberRange*, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | const UFormattedValue* unumrf_resultAsValue(const UFormattedNumberRange*, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | int32_t unumrf_resultGetFirstDecimalNumber(const UFormattedNumberRange*, char*, int32_t, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | int32_t unumrf_resultGetSecondDecimalNumber(const UFormattedNumberRange*, char*, int32_t, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | void unumrf_close(UNumberRangeFormatter*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | void unumrf_closeResult(UFormattedNumberRange*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | void unumrf_formatDecimalRange(const UNumberRangeFormatter*, const char*, int32_t, const char*, int32_t, UFormattedNumberRange*, UErrorCode*) | (missing) | DraftICU 68 -| unumberrangeformatter.h | void unumrf_formatDoubleRange(const UNumberRangeFormatter*, double, double, UFormattedNumberRange*, UErrorCode*) | (missing) | DraftICU 68 -| upluralrules.h | int32_t uplrules_selectForRange(const UPluralRules*, const struct UFormattedNumberRange*, UChar*, int32_t, UErrorCode*) | (missing) | DraftICU 68 - -## Other - -Other existing drafts in ICU 68 - -| File | API | ICU 67 | ICU 68 | -|---|---|---|---| -| bytestream.h | void icu::ByteSink::AppendU8(const char*, int32_t) | DraftICU 67 | -| bytestream.h | void icu::ByteSink::AppendU8(const char8_t*, int32_t) | DraftICU 67 | -| dtptngen.h | UDateFormatHourCycle icu::DateTimePatternGenerator::getDefaultHourCycle(UErrorCode&) const | DraftICU 67 | -| localematcher.h | Builder& icu::LocaleMatcher::Builder::setDirection(ULocMatchDirection) | DraftICU 67 | -| localematcher.h | <tt>enum</tt> ULocMatchDirection::ULOCMATCH_DIRECTION_ONLY_TWO_WAY | DraftICU 67 | -| localematcher.h | <tt>enum</tt> ULocMatchDirection::ULOCMATCH_DIRECTION_WITH_ONE_WAY | DraftICU 67 | -| locid.h | void icu::Locale::canonicalize(UErrorCode&) | DraftICU 67 | -| measfmt.h | void icu::MeasureFormat::parseObject(const UnicodeString&, Formattable&, ParsePosition&) const | DraftICU 53 | -| measunit.h | MeasureUnit icu::MeasureUnit::product(const MeasureUnit&, UErrorCode&) const | DraftICU 67 | -| measunit.h | MeasureUnit icu::MeasureUnit::reciprocal(UErrorCode&) const | DraftICU 67 | -| measunit.h | MeasureUnit icu::MeasureUnit::withDimensionality(int32_t, UErrorCode&) const | DraftICU 67 | -| measunit.h | MeasureUnit 
icu::MeasureUnit::withSIPrefix(UMeasureSIPrefix, UErrorCode&) const | DraftICU 67 | -| measunit.h | MeasureUnit& icu::MeasureUnit::operator=(MeasureUnit&&) noexcept | DraftICU 67 | -| measunit.h | UMeasureSIPrefix icu::MeasureUnit::getSIPrefix(UErrorCode&) const | DraftICU 67 | -| measunit.h | UMeasureUnitComplexity icu::MeasureUnit::getComplexity(UErrorCode&) const | DraftICU 67 | -| measunit.h | const char* icu::MeasureUnit::getIdentifier() const | DraftICU 67 | -| measunit.h | icu::MeasureUnit::MeasureUnit(MeasureUnit&&) noexcept | DraftICU 67 | -| measunit.h | int32_t icu::MeasureUnit::getDimensionality(UErrorCode&) const | DraftICU 67 | -| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::forIdentifier(StringPiece, UErrorCode&) | DraftICU 67 | -| stringpiece.h | icu::StringPiece::StringPiece(const char8_t*) | DraftICU 67 | -| stringpiece.h | icu::StringPiece::StringPiece(const char8_t*, int32_t) | DraftICU 67 | -| stringpiece.h | icu::StringPiece::StringPiece(const std::u8string&) | DraftICU 67 | -| stringpiece.h | icu::StringPiece::StringPiece(std::nullptr_t) | DraftICU 67 | -| stringpiece.h | int32_t icu::StringPiece::compare(StringPiece) | DraftICU 67 | -| stringpiece.h | int32_t icu::StringPiece::find(StringPiece, int32_t) | DraftICU 67 | -| stringpiece.h | void icu::StringPiece::set(const char8_t*) | DraftICU 67 | -| stringpiece.h | void icu::StringPiece::set(const char8_t*, int32_t) | DraftICU 67 | -| udat.h | <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_11 | DraftICU 67 | -| udat.h | <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_12 | DraftICU 67 | -| udat.h | <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_23 | DraftICU 67 | -| udat.h | <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_24 | DraftICU 67 | -| udateintervalformat.h | void udtitvfmt_formatCalendarToResult(const UDateIntervalFormat*, UCalendar*, UCalendar*, UFormattedDateInterval*, UErrorCode*) | DraftICU 67 | -| udateintervalformat.h | void udtitvfmt_formatToResult(const UDateIntervalFormat*, UDate, UDate, UFormattedDateInterval*, UErrorCode*) | DraftICU 67 | -| udatpg.h | UDateFormatHourCycle udatpg_getDefaultHourCycle(const UDateTimePatternGenerator*, UErrorCode*) | DraftICU 67 | -| uregex.h | <tt>enum</tt> URegexpFlag::UREGEX_CANON_EQ | DraftICU 2.4 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_BREAK_ENGINE | DraftICU 67 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_CHARACTER | DraftICU 67 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_LINE | DraftICU 67 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_SENTENCE | DraftICU 67 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_TITLE | DraftICU 67 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_WORD | DraftICU 67 | -| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_START | DraftICU 67 | - -## Simplifications - -This section shows cases where the signature was "simplified" for the sake of comparison. The simplified form is in bold, followed by - all possible variations in "original" form. - - -## Colophon - -Contents generated by StableAPI tool on Fri Oct 23 11:32:42 PDT 2020 - -Copyright © 2019 and later: Unicode, Inc. and others. -License & terms of use: http://www.unicode.org/copyright.html -
\ No newline at end of file diff --git a/thirdparty/icu4c/common/bytestriebuilder.cpp b/thirdparty/icu4c/common/bytestriebuilder.cpp index ec1ab7d8f5..28256f272a 100644 --- a/thirdparty/icu4c/common/bytestriebuilder.cpp +++ b/thirdparty/icu4c/common/bytestriebuilder.cpp @@ -474,31 +474,39 @@ BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { U_ASSERT(i>=0); if(i<=BytesTrie::kMaxOneByteDelta) { return write(i); + } else { + char intBytes[5]; + return write(intBytes, internalEncodeDelta(i, intBytes)); } - char intBytes[5]; - int32_t length; +} + +int32_t +BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) { + U_ASSERT(i>=0); + if(i<=BytesTrie::kMaxOneByteDelta) { + intBytes[0]=(char)i; + return 1; + } + int32_t length=1; if(i<=BytesTrie::kMaxTwoByteDelta) { intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); - length=1; } else { if(i<=BytesTrie::kMaxThreeByteDelta) { intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); - length=2; } else { if(i<=0xffffff) { intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; - length=3; } else { intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; intBytes[1]=(char)(i>>24); - length=4; + length=2; } - intBytes[1]=(char)(i>>16); + intBytes[length++]=(char)(i>>16); } - intBytes[1]=(char)(i>>8); + intBytes[length++]=(char)(i>>8); } intBytes[length++]=(char)i; - return write(intBytes, length); + return length; } U_NAMESPACE_END diff --git a/thirdparty/icu4c/common/charstr.cpp b/thirdparty/icu4c/common/charstr.cpp index 318a185b3f..c35622882c 100644 --- a/thirdparty/icu4c/common/charstr.cpp +++ b/thirdparty/icu4c/common/charstr.cpp @@ -14,6 +14,8 @@ * created by: Markus W. Scherer */ +#include <cstdlib> + #include "unicode/utypes.h" #include "unicode/putil.h" #include "charstr.h" @@ -141,6 +143,38 @@ CharString &CharString::append(const char *s, int32_t sLength, UErrorCode &error return *this; } +CharString &CharString::appendNumber(int32_t number, UErrorCode &status) { + if (number < 0) { + this->append('-', status); + if (U_FAILURE(status)) { + return *this; + } + } + + if (number == 0) { + this->append('0', status); + return *this; + } + + int32_t numLen = 0; + while (number != 0) { + int32_t residue = number % 10; + number /= 10; + this->append(std::abs(residue) + '0', status); + numLen++; + if (U_FAILURE(status)) { + return *this; + } + } + + int32_t start = this->length() - numLen, end = this->length() - 1; + while(start < end) { + std::swap(this->data()[start++], this->data()[end--]); + } + + return *this; +} + char *CharString::getAppendBuffer(int32_t minCapacity, int32_t desiredCapacityHint, int32_t &resultCapacity, diff --git a/thirdparty/icu4c/common/charstr.h b/thirdparty/icu4c/common/charstr.h index 6619faac61..175acd1c0a 100644 --- a/thirdparty/icu4c/common/charstr.h +++ b/thirdparty/icu4c/common/charstr.h @@ -127,6 +127,9 @@ public: return append(s.data(), s.length(), errorCode); } CharString &append(const char *s, int32_t sLength, UErrorCode &status); + + CharString &appendNumber(int32_t number, UErrorCode &status); + /** * Returns a writable buffer for appending and writes the buffer's capacity to * resultCapacity. Guarantees resultCapacity>=minCapacity if U_SUCCESS(). 
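Note on the `CharString::appendNumber()` addition in the `charstr.cpp`/`charstr.h` hunks above: it emits the decimal digits least-significant first and then reverses them in place, which avoids computing the digit count up front and handles negative values (including `INT32_MIN`) by taking `abs()` of each residue rather than negating the whole number. The following standalone sketch is an illustration only, with assumed names and `std::string` standing in for `CharString`, and error-code plumbing omitted; it is not ICU API:

```cpp
// Standalone illustration (assumed names, not ICU API): format an int32_t by
// appending digits least-significant first, then swapping them into place,
// mirroring the reverse-and-swap approach of CharString::appendNumber() above.
#include <cstdint>
#include <cstdlib>
#include <string>
#include <utility>

static std::string formatNumber(int32_t number) {
    std::string out;
    if (number < 0) {
        out += '-';                     // sign first, digits follow
    }
    if (number == 0) {
        out += '0';
        return out;
    }
    std::size_t start = out.size();     // position of the first digit
    while (number != 0) {
        int32_t residue = number % 10;  // negative for negative inputs
        number /= 10;
        out += static_cast<char>(std::abs(residue) + '0');
    }
    // Digits were emitted in reverse order; swap them into place.
    for (std::size_t end = out.size() - 1; start < end; ++start, --end) {
        std::swap(out[start], out[end]);
    }
    return out;                         // e.g. formatNumber(-305) == "-305"
}
```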
diff --git a/thirdparty/icu4c/common/cmemory.h b/thirdparty/icu4c/common/cmemory.h index a9d9424b4e..f03b7dcce6 100644 --- a/thirdparty/icu4c/common/cmemory.h +++ b/thirdparty/icu4c/common/cmemory.h @@ -31,14 +31,63 @@ #include <stddef.h> #include <string.h> #include "unicode/localpointer.h" +#include "uassert.h" #if U_DEBUG && defined(UPRV_MALLOC_COUNT) #include <stdio.h> #endif - -#define uprv_memcpy(dst, src, size) U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size) -#define uprv_memmove(dst, src, size) U_STANDARD_CPP_NAMESPACE memmove(dst, src, size) +// uprv_memcpy and uprv_memmove +#if defined(__clang__) +#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \ + /* Suppress warnings about addresses that will never be NULL */ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Waddress\"") \ + U_ASSERT(dst != NULL); \ + U_ASSERT(src != NULL); \ + _Pragma("clang diagnostic pop") \ + U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \ +} UPRV_BLOCK_MACRO_END +#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \ + /* Suppress warnings about addresses that will never be NULL */ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Waddress\"") \ + U_ASSERT(dst != NULL); \ + U_ASSERT(src != NULL); \ + _Pragma("clang diagnostic pop") \ + U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \ +} UPRV_BLOCK_MACRO_END +#elif defined(__GNUC__) +#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \ + /* Suppress warnings about addresses that will never be NULL */ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Waddress\"") \ + U_ASSERT(dst != NULL); \ + U_ASSERT(src != NULL); \ + _Pragma("GCC diagnostic pop") \ + U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \ +} UPRV_BLOCK_MACRO_END +#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \ + /* Suppress warnings about addresses that will never be NULL */ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Waddress\"") \ + U_ASSERT(dst != NULL); \ + U_ASSERT(src != NULL); \ + _Pragma("GCC diagnostic pop") \ + U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \ +} UPRV_BLOCK_MACRO_END +#else +#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \ + U_ASSERT(dst != NULL); \ + U_ASSERT(src != NULL); \ + U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \ +} UPRV_BLOCK_MACRO_END +#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \ + U_ASSERT(dst != NULL); \ + U_ASSERT(src != NULL); \ + U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \ +} UPRV_BLOCK_MACRO_END +#endif /** * \def UPRV_LENGTHOF diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp index b42cdf03fa..44285755f3 100644 --- a/thirdparty/icu4c/common/dictbe.cpp +++ b/thirdparty/icu4c/common/dictbe.cpp @@ -265,13 +265,9 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text, goto foundBest; } do { - int32_t wordsMatched = 1; if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { - if (wordsMatched < 2) { - // Followed by another dictionary word; mark first word as a good candidate - words[wordsFound%THAI_LOOKAHEAD].markCurrent(); - wordsMatched = 2; - } + // Followed by another dictionary word; mark first word as a good candidate + words[wordsFound%THAI_LOOKAHEAD].markCurrent(); // If we're already at the end of the range, we're done if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { @@ -503,13 +499,9 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text, goto foundBest; } 
do { - int32_t wordsMatched = 1; if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { - if (wordsMatched < 2) { - // Followed by another dictionary word; mark first word as a good candidate - words[wordsFound%LAO_LOOKAHEAD].markCurrent(); - wordsMatched = 2; - } + // Followed by another dictionary word; mark first word as a good candidate + words[wordsFound%LAO_LOOKAHEAD].markCurrent(); // If we're already at the end of the range, we're done if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { @@ -699,13 +691,9 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text, goto foundBest; } do { - int32_t wordsMatched = 1; if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { - if (wordsMatched < 2) { - // Followed by another dictionary word; mark first word as a good candidate - words[wordsFound%BURMESE_LOOKAHEAD].markCurrent(); - wordsMatched = 2; - } + // Followed by another dictionary word; mark first word as a good candidate + words[wordsFound%BURMESE_LOOKAHEAD].markCurrent(); // If we're already at the end of the range, we're done if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { @@ -908,13 +896,9 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text, goto foundBest; } do { - int32_t wordsMatched = 1; if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { - if (wordsMatched < 2) { - // Followed by another dictionary word; mark first word as a good candidate - words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); - wordsMatched = 2; - } + // Followed by another dictionary word; mark first word as a good candidate + words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); // If we're already at the end of the range, we're done if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { diff --git a/thirdparty/icu4c/common/edits.cpp b/thirdparty/icu4c/common/edits.cpp index 95f0c19a72..92ca36fb5d 100644 --- a/thirdparty/icu4c/common/edits.cpp +++ b/thirdparty/icu4c/common/edits.cpp @@ -86,6 +86,7 @@ Edits &Edits::moveArray(Edits &src) U_NOEXCEPT { } Edits &Edits::operator=(const Edits &other) { + if (this == &other) { return *this; } // self-assignment: no-op length = other.length; delta = other.delta; numChanges = other.numChanges; diff --git a/thirdparty/icu4c/common/filteredbrk.cpp b/thirdparty/icu4c/common/filteredbrk.cpp index c07128cbce..25080f9d33 100644 --- a/thirdparty/icu4c/common/filteredbrk.cpp +++ b/thirdparty/icu4c/common/filteredbrk.cpp @@ -20,6 +20,7 @@ #include "ubrkimpl.h" // U_ICUDATA_BRKITR #include "uvector.h" #include "cmemory.h" +#include "umutex.h" U_NAMESPACE_BEGIN @@ -139,13 +140,30 @@ class SimpleFilteredSentenceBreakData : public UMemory { public: SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } - SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } - SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } - virtual ~SimpleFilteredSentenceBreakData(); + SimpleFilteredSentenceBreakData *incr() { + umtx_atomic_inc(&refcount); + return this; + } + SimpleFilteredSentenceBreakData *decr() { + if(umtx_atomic_dec(&refcount) <= 0) { + delete this; + } + return 0; + } + virtual ~SimpleFilteredSentenceBreakData(); + + bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); } + bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); } - LocalPointer<UCharsTrie> 
fForwardsPartialTrie; // Has ".a" for "a.M." - LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. - int32_t refcount; + const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; } + const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; } + +private: + // These tries own their data arrays. + // They are shared and must therefore not be modified. + LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." + LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. + u_atomic_int32_t refcount; }; SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} @@ -244,7 +262,13 @@ SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIt fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), fDelegate(adopt) { - // all set.. + if (fData == nullptr) { + delete forwards; + delete backwards; + if (U_SUCCESS(status)) { + status = U_MEMORY_ALLOCATION_ERROR; + } + } } SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { @@ -261,59 +285,62 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { int32_t bestValue = -1; // loops while 'n' points to an exception. utext_setNativeIndex(fText.getAlias(), n); // from n.. - fData->fBackwardsTrie->reset(); - UChar32 uch; //if(debug2) u_printf(" n@ %d\n", n); // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") - if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? + if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here?? // TODO only do this the 1st time? //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); } else { //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); - uch = utext_next32(fText.getAlias()); + utext_next32(fText.getAlias()); //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); } - UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; - - while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. - USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie - if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far - bestPosn = utext_getNativeIndex(fText.getAlias()); - bestValue = fData->fBackwardsTrie->getValue(); - } - //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); + { + // Do not modify the shared trie! + UCharsTrie iter(fData->getBackwardsTrie()); + UChar32 uch; + while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards + UStringTrieResult r = iter.nextForCodePoint(uch); + if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far + bestPosn = utext_getNativeIndex(fText.getAlias()); + bestValue = iter.getValue(); + } + if(!USTRINGTRIE_HAS_NEXT(r)) { + break; + } + //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); + } } - if(USTRINGTRIE_MATCHES(r)) { // exact match? - //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); - bestValue = fData->fBackwardsTrie->getValue(); - bestPosn = utext_getNativeIndex(fText.getAlias()); - //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); - } + //if(bestValue >= 0) { + //if(debug2) u_printf("rev<+/%C/+end of seq.. 
r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); + //} if(bestPosn>=0) { //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? - //int32_t bestValue = fBackwardsTrie->getValue(); + //int32_t bestValue = iter.getValue(); ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); if(bestValue == kMATCH) { // exact match! //if(debug2) u_printf(" exact backward match\n"); return kExceptionHere; // See if the next is another exception. } else if(bestValue == kPARTIAL - && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie + && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie //if(debug2) u_printf(" partial backward match\n"); // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie // to see if it matches something going forward. - fData->fForwardsPartialTrie->reset(); UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. //if(debug2) u_printf("Retrying at %d\n", bestPosn); + // Do not modify the shared trie! + UCharsTrie iter(fData->getForwardsPartialTrie()); + UChar32 uch; while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && - USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { + USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) { //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); } if(USTRINGTRIE_MATCHES(rfwd)) { @@ -339,7 +366,7 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { int32_t SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { if(n == UBRK_DONE || // at end or - fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions + !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions return n; } // OK, do we need to break here? @@ -369,7 +396,7 @@ SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { int32_t SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { if(n == 0 || n == UBRK_DONE || // at end or - fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions + !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions return n; } // OK, do we need to break here? 
@@ -420,7 +447,7 @@ SimpleFilteredSentenceBreakIterator::previous(void) { UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { if (!fDelegate->isBoundary(offset)) return false; // no break to suppress - if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions + if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions UErrorCode status = U_ZERO_ERROR; resetState(status); diff --git a/thirdparty/icu4c/common/hash.h b/thirdparty/icu4c/common/hash.h index f02cb7087a..b927ddb3c3 100644 --- a/thirdparty/icu4c/common/hash.h +++ b/thirdparty/icu4c/common/hash.h @@ -85,16 +85,22 @@ public: inline int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status); + inline int32_t putiAllowZero(const UnicodeString& key, int32_t value, UErrorCode& status); + inline void* get(const UnicodeString& key) const; inline int32_t geti(const UnicodeString& key) const; + inline int32_t getiAndFound(const UnicodeString& key, UBool &found) const; + inline void* remove(const UnicodeString& key); inline int32_t removei(const UnicodeString& key); inline void removeAll(void); + inline UBool containsKey(const UnicodeString& key) const; + inline const UHashElement* find(const UnicodeString& key) const; /** @@ -203,6 +209,11 @@ inline int32_t Hashtable::puti(const UnicodeString& key, int32_t value, UErrorCo return uhash_puti(hash, new UnicodeString(key), value, &status); } +inline int32_t Hashtable::putiAllowZero(const UnicodeString& key, int32_t value, + UErrorCode& status) { + return uhash_putiAllowZero(hash, new UnicodeString(key), value, &status); +} + inline void* Hashtable::get(const UnicodeString& key) const { return uhash_get(hash, &key); } @@ -211,6 +222,10 @@ inline int32_t Hashtable::geti(const UnicodeString& key) const { return uhash_geti(hash, &key); } +inline int32_t Hashtable::getiAndFound(const UnicodeString& key, UBool &found) const { + return uhash_getiAndFound(hash, &key, &found); +} + inline void* Hashtable::remove(const UnicodeString& key) { return uhash_remove(hash, &key); } @@ -219,6 +234,10 @@ inline int32_t Hashtable::removei(const UnicodeString& key) { return uhash_removei(hash, &key); } +inline UBool Hashtable::containsKey(const UnicodeString& key) const { + return uhash_containsKey(hash, &key); +} + inline const UHashElement* Hashtable::find(const UnicodeString& key) const { return uhash_find(hash, &key); } diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp index 5795cbf87e..132aee290e 100644 --- a/thirdparty/icu4c/common/localematcher.cpp +++ b/thirdparty/icu4c/common/localematcher.cpp @@ -345,9 +345,8 @@ UBool compareLSRs(const UHashTok t1, const UHashTok t2) { int32_t LocaleMatcher::putIfAbsent(const LSR &lsr, int32_t i, int32_t suppLength, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return suppLength; } - int32_t index = uhash_geti(supportedLsrToIndex, &lsr); - if (index == 0) { - uhash_puti(supportedLsrToIndex, const_cast<LSR *>(&lsr), i + 1, &errorCode); + if (!uhash_containsKey(supportedLsrToIndex, &lsr)) { + uhash_putiAllowZero(supportedLsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode); if (U_SUCCESS(errorCode)) { supportedLSRs[suppLength] = &lsr; supportedIndexes[suppLength++] = i; @@ -685,12 +684,11 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai int32_t bestSupportedLsrIndex = -1; for (int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);;) { // Quick check for exact maximized LSR. 
- // Returns suppIndex+1 where 0 means not found. if (supportedLsrToIndex != nullptr) { desiredLSR.setHashCode(); - int32_t index = uhash_geti(supportedLsrToIndex, &desiredLSR); - if (index != 0) { - int32_t suppIndex = index - 1; + UBool found = false; + int32_t suppIndex = uhash_getiAndFound(supportedLsrToIndex, &desiredLSR, &found); + if (found) { if (remainingIter != nullptr) { remainingIter->rememberCurrent(desiredIndex, errorCode); } diff --git a/thirdparty/icu4c/common/localeprioritylist.cpp b/thirdparty/icu4c/common/localeprioritylist.cpp index 8916b121be..4455eedb75 100644 --- a/thirdparty/icu4c/common/localeprioritylist.cpp +++ b/thirdparty/icu4c/common/localeprioritylist.cpp @@ -187,17 +187,18 @@ bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &e if (U_FAILURE(errorCode)) { return false; } } LocalPointer<Locale> clone; - int32_t index = uhash_geti(map, &locale); - if (index != 0) { + UBool found = false; + int32_t index = uhash_getiAndFound(map, &locale, &found); + if (found) { // Duplicate: Remove the old item and append it anew. - LocaleAndWeight &lw = list->array[index - 1]; + LocaleAndWeight &lw = list->array[index]; clone.adoptInstead(lw.locale); lw.locale = nullptr; lw.weight = 0; ++numRemoved; } if (weight <= 0) { // do not add q=0 - if (index != 0) { + if (found) { // Not strictly necessary but cleaner. uhash_removei(map, &locale); } @@ -217,7 +218,7 @@ bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &e return false; } } - uhash_puti(map, clone.getAlias(), listLength + 1, &errorCode); + uhash_putiAllowZero(map, clone.getAlias(), listLength, &errorCode); if (U_FAILURE(errorCode)) { return false; } LocaleAndWeight &lw = list->array[listLength]; lw.locale = clone.orphan(); diff --git a/thirdparty/icu4c/common/locdispnames.cpp b/thirdparty/icu4c/common/locdispnames.cpp index 47c0667417..96af3f9aa8 100644 --- a/thirdparty/icu4c/common/locdispnames.cpp +++ b/thirdparty/icu4c/common/locdispnames.cpp @@ -698,7 +698,7 @@ uloc_getDisplayName(const char *locale, } /* end switch */ if (len>0) { - /* we addeed a component, so add separator and write it if there's room. */ + /* we added a component, so add separator and write it if there's room. 
*/ if(len+sepLen<=cap) { const UChar * plimit = p + len; for (; p < plimit; p++) { diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp index 874e4a7055..0d506293a9 100644 --- a/thirdparty/icu4c/common/locid.cpp +++ b/thirdparty/icu4c/common/locid.cpp @@ -254,7 +254,7 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale) Locale::~Locale() { - if (baseName != fullName) { + if ((baseName != fullName) && (baseName != fullNameBuffer)) { uprv_free(baseName); } baseName = NULL; @@ -466,7 +466,7 @@ Locale& Locale::operator=(const Locale& other) { } Locale& Locale::operator=(Locale&& other) U_NOEXCEPT { - if (baseName != fullName) uprv_free(baseName); + if ((baseName != fullName) && (baseName != fullNameBuffer)) uprv_free(baseName); if (fullName != fullNameBuffer) uprv_free(fullName); if (other.fullName == other.fullNameBuffer) { @@ -524,7 +524,7 @@ static const char* const KNOWN_CANONICALIZED[] = { "km", "km_KH", "kn", "kn_IN", "ko", "ko_KR", "ky", "ky_KG", "lo", "lo_LA", "lt", "lt_LT", "lv", "lv_LV", "mk", "mk_MK", "ml", "ml_IN", "mn", "mn_MN", "mr", "mr_IN", "ms", "ms_MY", "my", "my_MM", "nb", "nb_NO", "ne", "ne_NP", - "nl", "nl_NL", "or", "or_IN", "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF", + "nl", "nl_NL", "no", "or", "or_IN", "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF", "pt", "pt_BR", "pt_PT", "ro", "ro_RO", "ru", "ru_RU", "sd", "sd_IN", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "so", "so_SO", "sq", "sq_AL", "sr", "sr_Cyrl_RS", "sr_Latn", "sr_RS", "sv", "sv_SE", "sw", "sw_TZ", "ta", @@ -627,6 +627,17 @@ private: LocalMemory<const char*>& types, LocalMemory<int32_t>& replacementIndexes, int32_t &length, UErrorCode &status); + + // Read the subdivisionAlias data from alias to + // strings+types+replacementIndexes + // Allocate length items for types, to store the type field. + // Allocate length items for replacementIndexes, + // to store the index in the strings for the replacement variant. + void readSubdivisionAlias(UResourceBundle* alias, + UniqueCharStrings* strings, + LocalMemory<const char*>& types, + LocalMemory<int32_t>& replacementIndexes, + int32_t &length, UErrorCode &status); }; /** @@ -647,6 +658,7 @@ public: const CharStringMap& scriptMap() const { return script; } const CharStringMap& territoryMap() const { return territory; } const CharStringMap& variantMap() const { return variant; } + const CharStringMap& subdivisionMap() const { return subdivision; } static void U_CALLCONV loadData(UErrorCode &status); static UBool U_CALLCONV cleanup(); @@ -658,11 +670,13 @@ private: CharStringMap scriptMap, CharStringMap territoryMap, CharStringMap variantMap, + CharStringMap subdivisionMap, CharString* strings) : language(std::move(languageMap)), script(std::move(scriptMap)), territory(std::move(territoryMap)), variant(std::move(variantMap)), + subdivision(std::move(subdivisionMap)), strings(strings) { } @@ -676,6 +690,7 @@ private: CharStringMap script; CharStringMap territory; CharStringMap variant; + CharStringMap subdivision; CharString* strings; friend class AliasDataBuilder; @@ -867,6 +882,34 @@ AliasDataBuilder::readVariantAlias( } /** + * Read the subdivisionAlias data from alias to strings+types+replacementIndexes. + * Allocate length items for types, to store the type field. Allocate length + * items for replacementIndexes, to store the index in the strings for the + * replacement regions. 
+ */ +void +AliasDataBuilder::readSubdivisionAlias( + UResourceBundle* alias, + UniqueCharStrings* strings, + LocalMemory<const char*>& types, + LocalMemory<int32_t>& replacementIndexes, + int32_t &length, + UErrorCode &status) +{ + return readAlias( + alias, strings, types, replacementIndexes, length, +#if U_DEBUG + [](const char* type) { + U_ASSERT(uprv_strlen(type) >= 3 && uprv_strlen(type) <= 8); + }, +#else + [](const char*) {}, +#endif + [](const UnicodeString&) { }, + status); +} + +/** * Initializes the alias data from the ICU resource bundles. The alias data * contains alias of language, country, script and variants. * @@ -905,12 +948,14 @@ AliasDataBuilder::build(UErrorCode &status) { ures_getByKey(metadataAlias.getAlias(), "territory", nullptr, &status)); LocalUResourceBundlePointer variantAlias( ures_getByKey(metadataAlias.getAlias(), "variant", nullptr, &status)); + LocalUResourceBundlePointer subdivisionAlias( + ures_getByKey(metadataAlias.getAlias(), "subdivision", nullptr, &status)); if (U_FAILURE(status)) { return nullptr; } int32_t languagesLength = 0, scriptLength = 0, territoryLength = 0, - variantLength = 0; + variantLength = 0, subdivisionLength = 0; // Read the languageAlias into languageTypes, languageReplacementIndexes // and strings @@ -955,6 +1000,16 @@ AliasDataBuilder::build(UErrorCode &status) { variantReplacementIndexes, variantLength, status); + // Read the subdivisionAlias into subdivisionTypes, subdivisionReplacementIndexes + // and strings + LocalMemory<const char*> subdivisionTypes; + LocalMemory<int32_t> subdivisionReplacementIndexes; + readSubdivisionAlias(subdivisionAlias.getAlias(), + &strings, + subdivisionTypes, + subdivisionReplacementIndexes, + subdivisionLength, status); + if (U_FAILURE(status)) { return nullptr; } @@ -994,6 +1049,14 @@ AliasDataBuilder::build(UErrorCode &status) { status); } + // Build the subdivisionMap from subdivisionTypes & subdivisionReplacementIndexes. + CharStringMap subdivisionMap(2, status); + for (int32_t i = 0; U_SUCCESS(status) && i < subdivisionLength; i++) { + subdivisionMap.put(subdivisionTypes[i], + strings.get(subdivisionReplacementIndexes[i]), + status); + } + if (U_FAILURE(status)) { return nullptr; } @@ -1004,6 +1067,7 @@ AliasDataBuilder::build(UErrorCode &status) { std::move(scriptMap), std::move(territoryMap), std::move(variantMap), + std::move(subdivisionMap), strings.orphanCharStrings()); if (data == nullptr) { @@ -1105,6 +1169,14 @@ private: // Replace by using variantAlias. bool replaceVariant(UErrorCode& status); + + // Replace by using subdivisionAlias. + bool replaceSubdivision(StringPiece subdivision, + CharString& output, UErrorCode& status); + + // Replace transformed extensions. + bool replaceTransformedExtensions( + CharString& transformedExtensions, CharString& output, UErrorCode& status); }; CharString& @@ -1294,7 +1366,6 @@ AliasReplacer::replaceLanguage( } } if (replacedExtensions != nullptr) { - // TODO(ICU-21292) // DO NOTHING // UTS35 does not specifiy what should we do if we have extensions in the // replacement. 
Currently we know only the following 4 "BCP47 LegacyRules" have @@ -1435,6 +1506,106 @@ AliasReplacer::replaceVariant(UErrorCode& status) return false; } +bool +AliasReplacer::replaceSubdivision( + StringPiece subdivision, CharString& output, UErrorCode& status) +{ + if (U_FAILURE(status)) { + return false; + } + const char *replacement = data->subdivisionMap().get(subdivision.data()); + if (replacement != nullptr) { + const char* firstSpace = uprv_strchr(replacement, ' '); + // Found replacement data for this subdivision. + size_t len = (firstSpace != nullptr) ? + (firstSpace - replacement) : uprv_strlen(replacement); + if (2 <= len && len <= 8) { + output.append(replacement, (int32_t)len, status); + if (2 == len) { + // Add 'zzzz' based on changes to UTS #35 for CLDR-14312. + output.append("zzzz", 4, status); + } + } + return true; + } + return false; +} + +bool +AliasReplacer::replaceTransformedExtensions( + CharString& transformedExtensions, CharString& output, UErrorCode& status) +{ + // The content of the transformedExtensions will be modified in this + // function to NULL-terminating (tkey-tvalue) pairs. + if (U_FAILURE(status)) { + return false; + } + int32_t len = transformedExtensions.length(); + const char* str = transformedExtensions.data(); + const char* tkey = ultag_getTKeyStart(str); + int32_t tlangLen = (tkey == str) ? 0 : + ((tkey == nullptr) ? len : static_cast<int32_t>((tkey - str - 1))); + CharStringByteSink sink(&output); + if (tlangLen > 0) { + Locale tlang = LocaleBuilder() + .setLanguageTag(StringPiece(str, tlangLen)) + .build(status); + tlang.canonicalize(status); + tlang.toLanguageTag(sink, status); + if (U_FAILURE(status)) { + return false; + } + T_CString_toLowerCase(output.data()); + } + if (tkey != nullptr) { + // We need to sort the tfields by tkey + UVector tfields(status); + if (U_FAILURE(status)) { + return false; + } + do { + const char* tvalue = uprv_strchr(tkey, '-'); + if (tvalue == nullptr) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + const char* nextTKey = ultag_getTKeyStart(tvalue); + if (nextTKey != nullptr) { + *((char*)(nextTKey-1)) = '\0'; // NULL terminate tvalue + } + tfields.insertElementAt((void*)tkey, tfields.size(), status); + if (U_FAILURE(status)) { + return false; + } + tkey = nextTKey; + } while (tkey != nullptr); + tfields.sort([](UElement e1, UElement e2) -> int8_t { + // uprv_strcmp return int and in some platform, such as arm64-v8a, + // it may return positive values > 127 which cause the casted value + // of int8_t negative. + int res = uprv_strcmp( + (const char*)e1.pointer, (const char*)e2.pointer); + return (res == 0) ? 0 : ((res > 0) ? 1 : -1); + }, status); + for (int32_t i = 0; i < tfields.size(); i++) { + if (output.length() > 0) { + output.append('-', status); + } + const char* tfield = (const char*) tfields.elementAt(i); + const char* tvalue = uprv_strchr(tfield, '-'); + // Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue. + U_ASSERT(tvalue != nullptr); + *((char*)tvalue++) = '\0'; // NULL terminate tkey + output.append(tfield, status).append('-', status); + const char* bcpTValue = ulocimp_toBcpType(tfield, tvalue, nullptr, nullptr); + output.append((bcpTValue == nullptr) ? 
tvalue : bcpTValue, status); + } + } + if (U_FAILURE(status)) { + return false; + } + return true; +} + CharString& AliasReplacer::outputToString( CharString& out, UErrorCode status) @@ -1453,8 +1624,12 @@ AliasReplacer::outputToString( out.append(SEP_CHAR, status); } variants.sort([](UElement e1, UElement e2) -> int8_t { - return uprv_strcmp( + // uprv_strcmp return int and in some platform, such as arm64-v8a, + // it may return positive values > 127 which cause the casted value + // of int8_t negative. + int res = uprv_strcmp( (const char*)e1.pointer, (const char*)e2.pointer); + return (res == 0) ? 0 : ((res > 0) ? 1 : -1); }, status); int32_t variantsStart = out.length(); for (int32_t i = 0; i < variants.size(); i++) { @@ -1497,7 +1672,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status region = nullptr; } const char* variantsStr = locale.getVariant(); - const char* extensionsStr = locale_getKeywordsStart(locale.getName()); CharString variantsBuff(variantsStr, -1, status); if (!variantsBuff.isEmpty()) { if (U_FAILURE(status)) { return false; } @@ -1516,8 +1690,12 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status // Sort the variants variants.sort([](UElement e1, UElement e2) -> int8_t { - return uprv_strcmp( + // uprv_strcmp return int and in some platform, such as arm64-v8a, + // it may return positive values > 127 which cause the casted value + // of int8_t negative. + int res = uprv_strcmp( (const char*)e1.pointer, (const char*)e2.pointer); + return (res == 0) ? 0 : ((res > 0) ? 1 : -1); }, status); // A changed count to assert when loop too many times. @@ -1561,11 +1739,52 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status if (U_FAILURE(status)) { return false; } // Nothing changed and we know the order of the vaiants are not change // because we have no variant or only one. - if (changed == 0 && variants.size() <= 1) { + const char* extensionsStr = locale_getKeywordsStart(locale.getName()); + if (changed == 0 && variants.size() <= 1 && extensionsStr == nullptr) { return false; } outputToString(out, status); + if (U_FAILURE(status)) { + return false; + } if (extensionsStr != nullptr) { + changed = 0; + Locale temp(locale); + LocalPointer<icu::StringEnumeration> iter(locale.createKeywords(status)); + if (U_SUCCESS(status) && !iter.isNull()) { + const char* key; + while ((key = iter->next(nullptr, status)) != nullptr) { + if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 || + uprv_strcmp("t", key) == 0) { + CharString value; + CharStringByteSink valueSink(&value); + locale.getKeywordValue(key, valueSink, status); + if (U_FAILURE(status)) { + status = U_ZERO_ERROR; + continue; + } + CharString replacement; + if (uprv_strlen(key) == 2) { + if (replaceSubdivision(value.toStringPiece(), replacement, status)) { + changed++; + temp.setKeywordValue(key, replacement.data(), status); + } + } else { + U_ASSERT(uprv_strcmp(key, "t") == 0); + if (replaceTransformedExtensions(value, replacement, status)) { + changed++; + temp.setKeywordValue(key, replacement.data(), status); + } + } + if (U_FAILURE(status)) { + return false; + } + } + } + } + if (changed != 0) { + extensionsStr = locale_getKeywordsStart(temp.getName()); + } out.append(extensionsStr, status); } if (U_FAILURE(status)) { @@ -1573,8 +1792,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status } // If the tag is not changed, return. 
if (uprv_strcmp(out.data(), locale.getName()) == 0) { - U_ASSERT(changed == 0); - U_ASSERT(variants.size() > 1); out.clear(); return false; } @@ -1636,7 +1853,7 @@ Locale& Locale::init(const char* localeID, UBool canonicalize) { fIsBogus = FALSE; /* Free our current storage */ - if (baseName != fullName) { + if ((baseName != fullName) && (baseName != fullNameBuffer)) { uprv_free(baseName); } baseName = NULL; @@ -1672,6 +1889,7 @@ Locale& Locale::init(const char* localeID, UBool canonicalize) uloc_getName(localeID, fullName, sizeof(fullNameBuffer), &err); if(err == U_BUFFER_OVERFLOW_ERROR || length >= (int32_t)sizeof(fullNameBuffer)) { + U_ASSERT(baseName == nullptr); /*Go to heap for the fullName if necessary*/ fullName = (char *)uprv_malloc(sizeof(char)*(length + 1)); if(fullName == 0) { @@ -1825,7 +2043,7 @@ Locale::hashCode() const void Locale::setToBogus() { /* Free our current storage */ - if(baseName != fullName) { + if((baseName != fullName) && (baseName != fullNameBuffer)) { uprv_free(baseName); } baseName = NULL; diff --git a/thirdparty/icu4c/common/loclikelysubtags.cpp b/thirdparty/icu4c/common/loclikelysubtags.cpp index a031bfa587..aa592e6ea8 100644 --- a/thirdparty/icu4c/common/loclikelysubtags.cpp +++ b/thirdparty/icu4c/common/loclikelysubtags.cpp @@ -320,7 +320,8 @@ XLikelySubtags::~XLikelySubtags() { LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const { const char *name = locale.getName(); if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=") - // Private use language tag x-subtag-subtag... + // Private use language tag x-subtag-subtag... which CLDR changes to + // und-x-subtag-subtag... return LSR(name, "", "", LSR::EXPLICIT_LSR); } return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), diff --git a/thirdparty/icu4c/common/norm2allmodes.h b/thirdparty/icu4c/common/norm2allmodes.h index e8bd52c6ae..584835da57 100644 --- a/thirdparty/icu4c/common/norm2allmodes.h +++ b/thirdparty/icu4c/common/norm2allmodes.h @@ -38,7 +38,7 @@ public: virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, - UErrorCode &errorCode) const { + UErrorCode &errorCode) const U_OVERRIDE { if(U_FAILURE(errorCode)) { dest.setToBogus(); return dest; @@ -64,13 +64,13 @@ public: virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, - UErrorCode &errorCode) const { + UErrorCode &errorCode) const U_OVERRIDE { return normalizeSecondAndAppend(first, second, true, errorCode); } virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, - UErrorCode &errorCode) const { + UErrorCode &errorCode) const U_OVERRIDE { return normalizeSecondAndAppend(first, second, false, errorCode); } UnicodeString & @@ -107,7 +107,7 @@ public: UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0; virtual UBool - getDecomposition(UChar32 c, UnicodeString &decomposition) const { + getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE { UChar buffer[4]; int32_t length; const UChar *d=impl.getDecomposition(c, buffer, length); @@ -122,7 +122,7 @@ public: return true; } virtual UBool - getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { + getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE { UChar buffer[30]; int32_t length; const UChar *d=impl.getRawDecomposition(c, buffer, length); @@ -137,18 +137,18 @@ public: return true; } virtual UChar32 
- composePair(UChar32 a, UChar32 b) const { + composePair(UChar32 a, UChar32 b) const U_OVERRIDE { return impl.composePair(a, b); } virtual uint8_t - getCombiningClass(UChar32 c) const { + getCombiningClass(UChar32 c) const U_OVERRIDE { return impl.getCC(impl.getNorm16(c)); } // quick checks virtual UBool - isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { + isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE { if(U_FAILURE(errorCode)) { return false; } @@ -161,11 +161,11 @@ public: return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode); } virtual UNormalizationCheckResult - quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { + quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE { return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO; } virtual int32_t - spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { + spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE { if(U_FAILURE(errorCode)) { return 0; } @@ -194,27 +194,57 @@ public: private: virtual void normalize(const UChar *src, const UChar *limit, - ReorderingBuffer &buffer, UErrorCode &errorCode) const { + ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE { impl.decompose(src, limit, &buffer, errorCode); } using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function. virtual void normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize, UnicodeString &safeMiddle, - ReorderingBuffer &buffer, UErrorCode &errorCode) const { + ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE { impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode); } + + void + normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, + Edits *edits, UErrorCode &errorCode) const U_OVERRIDE { + if (U_FAILURE(errorCode)) { + return; + } + if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { + edits->reset(); + } + const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data()); + impl.decomposeUTF8(options, s, s + src.length(), &sink, edits, errorCode); + sink.Flush(); + } + virtual UBool + isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const U_OVERRIDE { + if(U_FAILURE(errorCode)) { + return false; + } + const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data()); + const uint8_t *sLimit = s + sp.length(); + return sLimit == impl.decomposeUTF8(0, s, sLimit, nullptr, nullptr, errorCode); + } + virtual const UChar * - spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const { + spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE { return impl.decompose(src, limit, NULL, errorCode); } using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function. - virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const { + virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const U_OVERRIDE { return impl.isDecompYes(impl.getNorm16(c)) ? 
UNORM_YES : UNORM_NO; } - virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundaryBefore(c); } - virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundaryAfter(c); } - virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); } + virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE { + return impl.hasDecompBoundaryBefore(c); + } + virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE { + return impl.hasDecompBoundaryAfter(c); + } + virtual UBool isInert(UChar32 c) const U_OVERRIDE { + return impl.isDecompInert(c); + } }; class ComposeNormalizer2 : public Normalizer2WithImpl { @@ -321,24 +351,30 @@ public: private: virtual void normalize(const UChar *src, const UChar *limit, - ReorderingBuffer &buffer, UErrorCode &errorCode) const { + ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE { impl.makeFCD(src, limit, &buffer, errorCode); } using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function. virtual void normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize, UnicodeString &safeMiddle, - ReorderingBuffer &buffer, UErrorCode &errorCode) const { + ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE { impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode); } virtual const UChar * - spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const { + spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE { return impl.makeFCD(src, limit, NULL, errorCode); } using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function. - virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasFCDBoundaryBefore(c); } - virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasFCDBoundaryAfter(c); } - virtual UBool isInert(UChar32 c) const { return impl.isFCDInert(c); } + virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE { + return impl.hasFCDBoundaryBefore(c); + } + virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE { + return impl.hasFCDBoundaryAfter(c); + } + virtual UBool isInert(UChar32 c) const U_OVERRIDE { + return impl.isFCDInert(c); + } }; struct Norm2AllModes : public UMemory { diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp index cbf6b4d980..c0ad5c69f3 100644 --- a/thirdparty/icu4c/common/normalizer2impl.cpp +++ b/thirdparty/icu4c/common/normalizer2impl.cpp @@ -731,9 +731,131 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode); } +// Dual functionality: +// sink != nullptr: normalize +// sink == nullptr: isNormalized/spanQuickCheckYes +const uint8_t * +Normalizer2Impl::decomposeUTF8(uint32_t options, + const uint8_t *src, const uint8_t *limit, + ByteSink *sink, Edits *edits, UErrorCode &errorCode) const { + U_ASSERT(limit != nullptr); + UnicodeString s16; + uint8_t minNoLead = leadByteForCP(minDecompNoCP); + + const uint8_t *prevBoundary = src; + // only for quick check + uint8_t prevCC = 0; + + for (;;) { + // Fast path: Scan over a sequence of characters below the minimum "no" code point, + // or with (decompYes && ccc==0) properties. 
+ const uint8_t *fastStart = src; + const uint8_t *prevSrc; + uint16_t norm16 = 0; + + for (;;) { + if (src == limit) { + if (prevBoundary != limit && sink != nullptr) { + ByteSinkUtil::appendUnchanged(prevBoundary, limit, + *sink, options, edits, errorCode); + } + return src; + } + if (*src < minNoLead) { + ++src; + } else { + prevSrc = src; + UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); + if (!isMostDecompYesAndZeroCC(norm16)) { + break; + } + } + } + // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo, + // and the current character at [prevSrc..src[ is not a common case with cc=0 + // (MIN_NORMAL_MAYBE_YES or JAMO_VT). + // It could still be a maybeYes with cc=0. + if (prevSrc != fastStart) { + // The fast path looped over yes/0 characters before the current one. + if (sink != nullptr && + !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { + break; + } + prevBoundary = prevSrc; + prevCC = 0; + } + + // Medium-fast path: Quick check. + if (isMaybeOrNonZeroCC(norm16)) { + // Does not decompose. + uint8_t cc = getCCFromYesOrMaybe(norm16); + if (prevCC <= cc || cc == 0) { + prevCC = cc; + if (cc <= 1) { + if (sink != nullptr && + !ByteSinkUtil::appendUnchanged(prevBoundary, src, + *sink, options, edits, errorCode)) { + break; + } + prevBoundary = src; + } + continue; + } + } + if (sink == nullptr) { + return prevBoundary; // quick check: "no" or cc out of order + } + + // Slow path + // Decompose up to and including the current character. + if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) { + if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, + *sink, options, edits, errorCode)) { + break; + } + prevBoundary = prevSrc; + } + ReorderingBuffer buffer(*this, s16, errorCode); + if (U_FAILURE(errorCode)) { + break; + } + decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */, + buffer, errorCode); + // Decompose until the next boundary. + if (buffer.getLastCC() > 1) { + src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */, + buffer, errorCode); + } + if (U_FAILURE(errorCode)) { + break; + } + if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals() + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + break; + } + // We already know there was a change if the original character decomposed; + // otherwise compare. + if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) { + if (!ByteSinkUtil::appendUnchanged(prevBoundary, src, + *sink, options, edits, errorCode)) { + break; + } + } else { + if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(), + *sink, edits, errorCode)) { + break; + } + } + prevBoundary = src; + prevCC = 0; + } + return src; +} + const uint8_t * Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, - UBool stopAtCompBoundary, UBool onlyContiguous, + StopAt stopAt, UBool onlyContiguous, ReorderingBuffer &buffer, UErrorCode &errorCode) const { if (U_FAILURE(errorCode)) { return nullptr; @@ -746,21 +868,28 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, UChar32 c = U_SENTINEL; if (norm16 >= limitNoNo) { if (isMaybeOrNonZeroCC(norm16)) { - // No boundaries around this character. + // No comp boundaries around this character. 
+ uint8_t cc = getCCFromYesOrMaybe(norm16); + if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { + return prevSrc; + } c = codePointFromValidUTF8(prevSrc, src); - if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) { + if (!buffer.append(c, cc, errorCode)) { return nullptr; } + if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) { + return src; + } continue; } // Maps to an isCompYesAndZeroCC. - if (stopAtCompBoundary) { + if (stopAt != STOP_AT_LIMIT) { return prevSrc; } c = codePointFromValidUTF8(prevSrc, src); c = mapAlgorithmic(c, norm16); norm16 = getRawNorm16(c); - } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) { + } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) { return prevSrc; } // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8. @@ -768,7 +897,8 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, // its norm16==INERT is normalization-inert, // so it gets copied unchanged in the fast path, // and we stop the slow path where invalid UTF-8 begins. - U_ASSERT(norm16 != INERT); + // c >= 0 is the result of an algorithmic mapping. + U_ASSERT(c >= 0 || norm16 != INERT); if (norm16 < minYesNo) { if (c < 0) { c = codePointFromValidUTF8(prevSrc, src); @@ -798,11 +928,15 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, } else { leadCC = 0; } + if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { + return prevSrc; + } if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) { return nullptr; } } - if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) || + (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) { return src; } } @@ -1954,10 +2088,10 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, break; } // We know there is not a boundary here. - decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous, + decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous, buffer, errorCode); // Decompose until the next boundary. 
- src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous, + src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous, buffer, errorCode); if (U_FAILURE(errorCode)) { break; diff --git a/thirdparty/icu4c/common/normalizer2impl.h b/thirdparty/icu4c/common/normalizer2impl.h index 4218a30a34..bdb6767a92 100644 --- a/thirdparty/icu4c/common/normalizer2impl.h +++ b/thirdparty/icu4c/common/normalizer2impl.h @@ -491,6 +491,12 @@ public: UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; + + /** sink==nullptr: isNormalized()/spanQuickCheckYes() */ + const uint8_t *decomposeUTF8(uint32_t options, + const uint8_t *src, const uint8_t *limit, + ByteSink *sink, Edits *edits, UErrorCode &errorCode) const; + UBool compose(const UChar *src, const UChar *limit, UBool onlyContiguous, UBool doCompose, @@ -649,6 +655,9 @@ private: UChar32 minNeedDataCP, ReorderingBuffer *buffer, UErrorCode &errorCode) const; + + enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY }; + const UChar *decomposeShort(const UChar *src, const UChar *limit, UBool stopAtCompBoundary, UBool onlyContiguous, ReorderingBuffer &buffer, UErrorCode &errorCode) const; @@ -656,7 +665,7 @@ private: ReorderingBuffer &buffer, UErrorCode &errorCode) const; const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit, - UBool stopAtCompBoundary, UBool onlyContiguous, + StopAt stopAt, UBool onlyContiguous, ReorderingBuffer &buffer, UErrorCode &errorCode) const; static int32_t combine(const uint16_t *list, UChar32 trail); diff --git a/thirdparty/icu4c/common/pluralmap.h b/thirdparty/icu4c/common/pluralmap.h index d898ac4671..2a14a07af1 100644 --- a/thirdparty/icu4c/common/pluralmap.h +++ b/thirdparty/icu4c/common/pluralmap.h @@ -24,7 +24,7 @@ class U_COMMON_API PluralMapBase : public UMemory { public: /** * The names of all the plural categories. NONE is not an actual plural - * category, but rather represents the absense of a plural category. + * category, but rather represents the absence of a plural category. */ enum Category { NONE = -1, diff --git a/thirdparty/icu4c/common/putil.cpp b/thirdparty/icu4c/common/putil.cpp index 3ed6a05d22..ffcbbcce59 100644 --- a/thirdparty/icu4c/common/putil.cpp +++ b/thirdparty/icu4c/common/putil.cpp @@ -1139,7 +1139,7 @@ uprv_tzname(int n) #endif if (tzid != NULL && isValidOlsonID(tzid) #if U_PLATFORM == U_PF_SOLARIS - /* When TZ equals localtime on Solaris, check the /etc/localtime file. */ + /* Don't misinterpret TZ "localtime" on Solaris as a time zone name. */ && uprv_strcmp(tzid, TZ_ENV_CHECK) != 0 #endif ) { @@ -1361,7 +1361,7 @@ uprv_pathIsAbsolute(const char *path) /* Backup setting of ICU_DATA_DIR_PREFIX_ENV_VAR (needed for some Darwin ICU build environments) */ -#if U_PLATFORM_IS_DARWIN_BASED && TARGET_OS_SIMULATOR +#if U_PLATFORM_IS_DARWIN_BASED && defined(TARGET_OS_SIMULATOR) && TARGET_OS_SIMULATOR # if !defined(ICU_DATA_DIR_PREFIX_ENV_VAR) # define ICU_DATA_DIR_PREFIX_ENV_VAR "IPHONE_SIMULATOR_ROOT" # endif diff --git a/thirdparty/icu4c/common/putilimp.h b/thirdparty/icu4c/common/putilimp.h index a325c6c359..5b95a68418 100644 --- a/thirdparty/icu4c/common/putilimp.h +++ b/thirdparty/icu4c/common/putilimp.h @@ -527,7 +527,7 @@ U_CAPI void * U_EXPORT2 uprv_maximumPtr(void *base); * on the destination pointer and capacity cannot overflow. 
* * The pinned capacity must fulfill the following conditions (for positive capacities): - * - dest + capacity is a valid pointer according to the machine arcitecture (AS/400, 64-bit, etc.) + * - dest + capacity is a valid pointer according to the machine architecture (AS/400, 64-bit, etc.) * - (dest + capacity) >= dest * - The size (in bytes) of T[capacity] does not exceed 0x7fffffff * diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp index 9b7e70c3cf..b821ca4463 100644 --- a/thirdparty/icu4c/common/rbbi.cpp +++ b/thirdparty/icu4c/common/rbbi.cpp @@ -812,7 +812,7 @@ int32_t RuleBasedBreakIterator::handleNext() { } #endif - // handleNext alway sets the break tag value. + // handleNext always sets the break tag value. // Set the default for it. fRuleStatusIndex = 0; diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp index 63ff3001c7..44f19d8697 100644 --- a/thirdparty/icu4c/common/rbbi_cache.cpp +++ b/thirdparty/icu4c/common/rbbi_cache.cpp @@ -258,7 +258,7 @@ void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode previous(status); } else { // seek() leaves the BreakCache positioned at the preceding boundary - // if the requested position is between two bounaries. + // if the requested position is between two boundaries. // current() pushes the BreakCache position out to the BreakIterator itself. U_ASSERT(startPos > fTextIdx); current(); diff --git a/thirdparty/icu4c/common/rbbiscan.cpp b/thirdparty/icu4c/common/rbbiscan.cpp index 9c406af671..45911b1cfe 100644 --- a/thirdparty/icu4c/common/rbbiscan.cpp +++ b/thirdparty/icu4c/common/rbbiscan.cpp @@ -284,7 +284,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action) case doEndAssign: { - // We have reached the end of an assignement statement. + // We have reached the end of an assignment statement. // Current scan char is the ';' that terminates the assignment. // Terminate expression, leaves expression parse tree rooted in TOS node. @@ -856,6 +856,10 @@ UChar32 RBBIRuleScanner::nextCharLL() { return (UChar32)-1; } ch = fRB->fRules.char32At(fNextIndex); + if (U_IS_SURROGATE(ch)) { + error(U_ILLEGAL_CHAR_FOUND); + return U_SENTINEL; + } fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); if (ch == chCR || diff --git a/thirdparty/icu4c/common/rbbitblb.cpp b/thirdparty/icu4c/common/rbbitblb.cpp index 70e260fc08..dd76337bc6 100644 --- a/thirdparty/icu4c/common/rbbitblb.cpp +++ b/thirdparty/icu4c/common/rbbitblb.cpp @@ -151,7 +151,7 @@ void RBBITableBuilder::buildForwardTable() { // // calculate the functions nullable, firstpos, lastpos and followpos on // nodes in the parse tree. - // See the alogrithm description in Aho. + // See the algorithm description in Aho. // Understanding how this works by looking at the code alone will be // nearly impossible. // diff --git a/thirdparty/icu4c/common/resource.h b/thirdparty/icu4c/common/resource.h index 3795694412..48f5b9fa6e 100644 --- a/thirdparty/icu4c/common/resource.h +++ b/thirdparty/icu4c/common/resource.h @@ -274,8 +274,10 @@ public: * * @param key The key string of the enumeration-start resource. * Empty if the enumeration starts at the top level of the bundle. - * @param value Call getArray() or getTable() as appropriate. - * Then reuse for output values from Array and Table getters. + * @param value Call getArray() or getTable() as appropriate. Then reuse for + * output values from Array and Table getters. 
Note: ResourceTable and + * ResourceArray instances must outlive the ResourceValue instance for + * ResourceTracer to be happy. * @param noFallback true if the bundle has no parent; * that is, its top-level table has the nofallback attribute, * or it is the root bundle of a locale tree. diff --git a/thirdparty/icu4c/common/restrace.cpp b/thirdparty/icu4c/common/restrace.cpp index 5c6498850e..1f83372d68 100644 --- a/thirdparty/icu4c/common/restrace.cpp +++ b/thirdparty/icu4c/common/restrace.cpp @@ -54,6 +54,9 @@ void ResourceTracer::traceOpen() const { CharString& ResourceTracer::getFilePath(CharString& output, UErrorCode& status) const { if (fResB) { + // Note: if you get a segfault around here, check that ResourceTable and + // ResourceArray instances outlive ResourceValue instances referring to + // their contents: output.append(fResB->fData->fPath, status); output.append('/', status); output.append(fResB->fData->fName, status); diff --git a/thirdparty/icu4c/common/servnotf.h b/thirdparty/icu4c/common/servnotf.h index 305570c1e6..73ce38c772 100644 --- a/thirdparty/icu4c/common/servnotf.h +++ b/thirdparty/icu4c/common/servnotf.h @@ -82,7 +82,7 @@ public: /** * Add a listener to be notified when notifyChanged is called. * The listener must not be null. AcceptsListener must return - * true for the listener. Attempts to concurrently + * true for the listener. Attempts to concurrently * register the identical listener more than once will be * silently ignored. */ @@ -90,7 +90,7 @@ public: /** * Stop notifying this listener. The listener must - * not be null. Attemps to remove a listener that is + * not be null. Attempts to remove a listener that is * not registered will be silently ignored. */ virtual void removeListener(const EventListener* l, UErrorCode& status); diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp index f8bdf5a6b6..bb5bdd1b50 100644 --- a/thirdparty/icu4c/common/ubrk.cpp +++ b/thirdparty/icu4c/common/ubrk.cpp @@ -174,6 +174,18 @@ ubrk_safeClone( return (UBreakIterator *)newBI; } +U_CAPI UBreakIterator * U_EXPORT2 +ubrk_clone(const UBreakIterator *bi, UErrorCode *status) { + if (U_FAILURE(*status)) { + return nullptr; + } + BreakIterator *newBI = ((BreakIterator *)bi)->clone(); + if (newBI == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + return (UBreakIterator *)newBI; +} U_CAPI void U_EXPORT2 diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp index 2b142f5bc2..4f4c274d60 100644 --- a/thirdparty/icu4c/common/ucase.cpp +++ b/thirdparty/icu4c/common/ucase.cpp @@ -681,7 +681,7 @@ ucase_isCaseSensitive(UChar32 c) { * - In [CoreProps], C has one of the properties Uppercase, or Lowercase * - Given D = NFD(C), then it is not the case that: * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) - * (This third criterium does not add any characters to the list + * (This third criterion does not add any characters to the list * for Unicode 3.2. Ignored.) * * D2. A character C is defined to be case-ignorable diff --git a/thirdparty/icu4c/common/uchar.cpp b/thirdparty/icu4c/common/uchar.cpp index eb14e4c75d..61e9c3d900 100644 --- a/thirdparty/icu4c/common/uchar.cpp +++ b/thirdparty/icu4c/common/uchar.cpp @@ -194,7 +194,7 @@ u_isISOControl(UChar32 c) { /* Some control characters that are used as space. 
*/ #define IS_THAT_CONTROL_SPACE(c) \ - (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL)) + (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==0x85)) /* Java has decided that U+0085 New Line is not whitespace any more. */ #define IS_THAT_ASCII_CONTROL_SPACE(c) \ @@ -677,14 +677,14 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { sa->add(sa->set, CR+1); /* range TAB..CR */ sa->add(sa->set, 0x1c); sa->add(sa->set, 0x1f+1); - USET_ADD_CP_AND_NEXT(sa, NL); + USET_ADD_CP_AND_NEXT(sa, 0x85); // NEXT LINE (NEL) /* add for u_isIDIgnorable() what was not added above */ - sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */ + sa->add(sa->set, 0x7f); /* range DEL..NBSP-1, NBSP added below */ sa->add(sa->set, HAIRSP); sa->add(sa->set, RLM+1); - sa->add(sa->set, INHSWAP); - sa->add(sa->set, NOMDIG+1); + sa->add(sa->set, 0x206a); // INHIBIT SYMMETRIC SWAPPING + sa->add(sa->set, 0x206f+1); // NOMINAL DIGIT SHAPES USET_ADD_CP_AND_NEXT(sa, ZWNBSP); /* add no-break spaces for u_isWhitespace() what was not added above */ @@ -693,23 +693,25 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { USET_ADD_CP_AND_NEXT(sa, NNBSP); /* add for u_digit() */ - sa->add(sa->set, U_a); - sa->add(sa->set, U_z+1); - sa->add(sa->set, U_A); - sa->add(sa->set, U_Z+1); - sa->add(sa->set, U_FW_a); - sa->add(sa->set, U_FW_z+1); - sa->add(sa->set, U_FW_A); - sa->add(sa->set, U_FW_Z+1); + sa->add(sa->set, u'a'); + sa->add(sa->set, u'z'+1); + sa->add(sa->set, u'A'); + sa->add(sa->set, u'Z'+1); + // fullwidth + sa->add(sa->set, u'ï½'); + sa->add(sa->set, u'z'+1); + sa->add(sa->set, u'A'); + sa->add(sa->set, u'Z'+1); /* add for u_isxdigit() */ - sa->add(sa->set, U_f+1); - sa->add(sa->set, U_F+1); - sa->add(sa->set, U_FW_f+1); - sa->add(sa->set, U_FW_F+1); + sa->add(sa->set, u'f'+1); + sa->add(sa->set, u'F'+1); + // fullwidth + sa->add(sa->set, u'f'+1); + sa->add(sa->set, u'F'+1); /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ - sa->add(sa->set, WJ); /* range WJ..NOMDIG */ + sa->add(sa->set, 0x2060); /* range 2060..206f */ sa->add(sa->set, 0xfff0); sa->add(sa->set, 0xfffb+1); sa->add(sa->set, 0xe0000); diff --git a/thirdparty/icu4c/common/ucnv2022.cpp b/thirdparty/icu4c/common/ucnv2022.cpp index 169ad4c526..1726440b94 100644 --- a/thirdparty/icu4c/common/ucnv2022.cpp +++ b/thirdparty/icu4c/common/ucnv2022.cpp @@ -820,7 +820,7 @@ getKey_2022(char c,int32_t* key,int32_t* offset){ return INVALID_2022; } -/*runs through a state machine to determine the escape sequence - codepage correspondance +/*runs through a state machine to determine the escape sequence - codepage correspondence */ static void changeState_2022(UConverter* _this, @@ -1424,7 +1424,7 @@ toUnicodeCallback(UConverter *cnv, * KSC5601 : alias to ibm-949 mapping table * GB2312 : alias to ibm-1386 mapping table * ISO-8859-1 : Algorithmic implemented as LATIN1 case -* ISO-8859-7 : alisas to ibm-9409 mapping table +* ISO-8859-7 : alias to ibm-9409 mapping table */ /* preference order of JP charsets */ @@ -2324,7 +2324,7 @@ endloop: /*************************************************************** * Rules for ISO-2022-KR encoding * i) The KSC5601 designator sequence should appear only once in a file, -* at the begining of a line before any KSC5601 characters. This usually +* at the beginning of a line before any KSC5601 characters. 
This usually * means that it appears by itself on the first line of the file * ii) There are only 2 shifting sequences SO to shift into double byte mode * and SI to shift into single byte mode diff --git a/thirdparty/icu4c/common/ucnv_bld.cpp b/thirdparty/icu4c/common/ucnv_bld.cpp index 0e198892f1..d08eec7369 100644 --- a/thirdparty/icu4c/common/ucnv_bld.cpp +++ b/thirdparty/icu4c/common/ucnv_bld.cpp @@ -427,7 +427,7 @@ getAlgorithmicTypeFromName(const char *realName) #define UCNV_CACHE_LOAD_FACTOR 2 /* Puts the shared data in the static hashtable SHARED_DATA_HASHTABLE */ -/* Will always be called with the cnvCacheMutex alrady being held */ +/* Will always be called with the cnvCacheMutex already being held */ /* by the calling function. */ /* Stores the shared data in the SHARED_DATA_HASHTABLE * @param data The shared data diff --git a/thirdparty/icu4c/common/ucnv_err.cpp b/thirdparty/icu4c/common/ucnv_err.cpp index 6b738face5..e1f2b934aa 100644 --- a/thirdparty/icu4c/common/ucnv_err.cpp +++ b/thirdparty/icu4c/common/ucnv_err.cpp @@ -321,7 +321,7 @@ UCNV_FROM_U_CALLBACK_ESCAPE ( case UCNV_PRV_ESCAPE_CSS2: valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); - /* Always add space character, becase the next character might be whitespace, + /* Always add space character, because the next character might be whitespace, which would erroneously be considered the termination of the escape sequence. */ valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT; break; diff --git a/thirdparty/icu4c/common/ucnv_lmb.cpp b/thirdparty/icu4c/common/ucnv_lmb.cpp index 168392837b..41317d1cc0 100644 --- a/thirdparty/icu4c/common/ucnv_lmb.cpp +++ b/thirdparty/icu4c/common/ucnv_lmb.cpp @@ -81,7 +81,7 @@ [G] D1 [D2] That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2 - data bytes. The maximum size of a LMBCS chjaracter is 3 bytes: + data bytes. The maximum size of a LMBCS character is 3 bytes: */ #define ULMBCS_CHARSIZE_MAX 3 /* @@ -164,7 +164,7 @@ beginning of internal 'system' range names: */ /* Then we needed a place to put all the other ansi control characters that must be moved to different values because LMBCS reserves those values for other purposes. To represent the control characters, we start -with a first byte of 0xF & add the control chaarcter value as the +with a first byte of 0xF & add the control character value as the second byte */ #define ULMBCS_GRP_CTRL 0x0F diff --git a/thirdparty/icu4c/common/ucnv_u7.cpp b/thirdparty/icu4c/common/ucnv_u7.cpp index 87ba8cf37e..de9f3f42ec 100644 --- a/thirdparty/icu4c/common/ucnv_u7.cpp +++ b/thirdparty/icu4c/common/ucnv_u7.cpp @@ -814,7 +814,7 @@ const UConverterSharedData _UTF7Data= * the use of "~" in some servers as a home directory indicator. * * 5) UTF-7 permits multiple alternate forms to represent the same - * string; in particular, printable US-ASCII chararacters can be + * string; in particular, printable US-ASCII characters can be * represented in encoded form. 
* * In modified UTF-7, printable US-ASCII characters except for "&" diff --git a/thirdparty/icu4c/common/ucnvisci.cpp b/thirdparty/icu4c/common/ucnvisci.cpp index 44a7c05a3c..ffb8c7ac3e 100644 --- a/thirdparty/icu4c/common/ucnvisci.cpp +++ b/thirdparty/icu4c/common/ucnvisci.cpp @@ -992,7 +992,7 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC( if (converterData->currentDeltaFromUnicode == PNJ_DELTA) { if (sourceChar == PNJ_TIPPI) { - /* Make sure Tippi is converterd to Bindi. */ + /* Make sure Tippi is converted to Bindi. */ sourceChar = PNJ_BINDI; } else if (sourceChar == PNJ_ADHAK) { /* This is for consonant cluster handling. */ @@ -1147,7 +1147,7 @@ static const uint16_t lookupTable[][2]={ /* is the code point valid in current script? */ \ if(sourceChar> ASCII_END && \ (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){ \ - /* Vocallic RR is assigne in ISCII Telugu and Unicode */ \ + /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \ if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \ targetUniChar!=VOCALLIC_RR){ \ targetUniChar=missingCharMarker; \ @@ -1272,7 +1272,7 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCo goto CALLBACK; } else if (*contextCharToUnicode==ISCII_INV) { if (sourceChar==ISCII_HALANT) { - targetUniChar = 0x0020; /* replace with space accoding to Indic FAQ */ + targetUniChar = 0x0020; /* replace with space according to Indic FAQ */ } else { targetUniChar = ZWJ; } diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp index 0e14cddcff..20bbd51488 100644 --- a/thirdparty/icu4c/common/ucurr.cpp +++ b/thirdparty/icu4c/common/ucurr.cpp @@ -844,7 +844,7 @@ typedef struct { #endif -// Comparason function used in quick sort. +// Comparison function used in quick sort. static int U_CALLCONV currencyNameComparator(const void* a, const void* b) { const CurrencyNameStruct* currName_1 = (const CurrencyNameStruct*)a; const CurrencyNameStruct* currName_2 = (const CurrencyNameStruct*)b; @@ -1530,7 +1530,7 @@ uprv_parseCurrency(const char* locale, int32_t max = 0; int32_t matchIndex = -1; - // case in-sensitive comparision against currency names + // case in-sensitive comparison against currency names searchCurrencyName(currencyNames, total_currency_name_count, upperText, textLen, partialMatchLen, &max, &matchIndex); diff --git a/thirdparty/icu4c/common/uhash.cpp b/thirdparty/icu4c/common/uhash.cpp index 86311ceb0b..67c7c36354 100644 --- a/thirdparty/icu4c/common/uhash.cpp +++ b/thirdparty/icu4c/common/uhash.cpp @@ -133,8 +133,10 @@ static const float RESIZE_POLICY_RATIO_TABLE[6] = { * or a pointer. If a hint bit is zero, then the associated * token is assumed to be an integer. */ +#define HINT_BOTH_INTEGERS (0) #define HINT_KEY_POINTER (1) #define HINT_VALUE_POINTER (2) +#define HINT_ALLOW_ZERO (4) /******************************************************************** * PRIVATE Implementation @@ -479,8 +481,9 @@ _uhash_put(UHashtable *hash, goto err; } U_ASSERT(hash != NULL); - /* Cannot always check pointer here or iSeries sees NULL every time. */ - if ((hint & HINT_VALUE_POINTER) && value.pointer == NULL) { + if ((hint & HINT_VALUE_POINTER) ? + value.pointer == NULL : + value.integer == 0 && (hint & HINT_ALLOW_ZERO) == 0) { /* Disallow storage of NULL values, since NULL is returned by * get() to indicate an absent key. Storing NULL == removing. 
*/ @@ -687,6 +690,28 @@ uhash_igeti(const UHashtable *hash, return _uhash_find(hash, keyholder, hash->keyHasher(keyholder))->value.integer; } +U_CAPI int32_t U_EXPORT2 +uhash_getiAndFound(const UHashtable *hash, + const void *key, + UBool *found) { + UHashTok keyholder; + keyholder.pointer = (void *)key; + const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder)); + *found = !IS_EMPTY_OR_DELETED(e->hashcode); + return e->value.integer; +} + +U_CAPI int32_t U_EXPORT2 +uhash_igetiAndFound(const UHashtable *hash, + int32_t key, + UBool *found) { + UHashTok keyholder; + keyholder.integer = key; + const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder)); + *found = !IS_EMPTY_OR_DELETED(e->hashcode); + return e->value.integer; +} + U_CAPI void* U_EXPORT2 uhash_put(UHashtable *hash, void* key, @@ -736,7 +761,34 @@ uhash_iputi(UHashtable *hash, keyholder.integer = key; valueholder.integer = value; return _uhash_put(hash, keyholder, valueholder, - 0, /* neither is a ptr */ + HINT_BOTH_INTEGERS, + status).integer; +} + +U_CAPI int32_t U_EXPORT2 +uhash_putiAllowZero(UHashtable *hash, + void *key, + int32_t value, + UErrorCode *status) { + UHashTok keyholder, valueholder; + keyholder.pointer = key; + valueholder.integer = value; + return _uhash_put(hash, keyholder, valueholder, + HINT_KEY_POINTER | HINT_ALLOW_ZERO, + status).integer; +} + + +U_CAPI int32_t U_EXPORT2 +uhash_iputiAllowZero(UHashtable *hash, + int32_t key, + int32_t value, + UErrorCode *status) { + UHashTok keyholder, valueholder; + keyholder.integer = key; + valueholder.integer = value; + return _uhash_put(hash, keyholder, valueholder, + HINT_BOTH_INTEGERS | HINT_ALLOW_ZERO, status).integer; } @@ -785,6 +837,29 @@ uhash_removeAll(UHashtable *hash) { U_ASSERT(hash->count == 0); } +U_CAPI UBool U_EXPORT2 +uhash_containsKey(const UHashtable *hash, const void *key) { + UHashTok keyholder; + keyholder.pointer = (void *)key; + const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder)); + return !IS_EMPTY_OR_DELETED(e->hashcode); +} + +/** + * Returns true if the UHashtable contains an item with this integer key. + * + * @param hash The target UHashtable. + * @param key An integer key stored in a hashtable + * @return true if the key is found. + */ +U_CAPI UBool U_EXPORT2 +uhash_icontainsKey(const UHashtable *hash, int32_t key) { + UHashTok keyholder; + keyholder.integer = key; + const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder)); + return !IS_EMPTY_OR_DELETED(e->hashcode); +} + U_CAPI const UHashElement* U_EXPORT2 uhash_find(const UHashtable *hash, const void* key) { UHashTok keyholder; diff --git a/thirdparty/icu4c/common/uhash.h b/thirdparty/icu4c/common/uhash.h index b59d2711bb..af75999860 100644 --- a/thirdparty/icu4c/common/uhash.h +++ b/thirdparty/icu4c/common/uhash.h @@ -23,7 +23,7 @@ /** * UHashtable stores key-value pairs and does moderately fast lookup * based on keys. It provides a good tradeoff between access time and - * storage space. As elements are added to it, it grows to accomodate + * storage space. As elements are added to it, it grows to accommodate * them. By default, the table never shrinks, even if all elements * are removed from it. * @@ -54,6 +54,13 @@ * uhash_remove() on that key. This keeps uhash_get(), uhash_count(), * and uhash_nextElement() consistent with one another. * + * Keys and values can be integers. + * Functions that work with an integer key have an "i" prefix. 
+ * Functions that work with an integer value have an "i" suffix. + * As with putting a NULL value pointer, putting a zero value integer removes the item. + * Except, there are pairs of functions that allow setting zero values + * and fetching (value, found) pairs. + * * To see everything in a hashtable, use uhash_nextElement() to * iterate through its contents. Each call to this function returns a * UHashElement pointer. A hash element contains a key, value, and @@ -406,6 +413,44 @@ uhash_iputi(UHashtable *hash, UErrorCode *status); /** + * Put a (key=pointer, value=integer) item in a UHashtable. If the + * keyDeleter is non-NULL, then the hashtable owns 'key' after this + * call. valueDeleter must be NULL. + * Storing a 0 value is possible; call uhash_igetiAndFound() to retrieve values including zero. + * + * @param hash The target UHashtable. + * @param key The key to store. + * @param value The integer value to store. + * @param status A pointer to an UErrorCode to receive any errors. + * @return The previous value, or 0 if none. + * @see uhash_getiAndFound + */ +U_CAPI int32_t U_EXPORT2 +uhash_putiAllowZero(UHashtable *hash, + void *key, + int32_t value, + UErrorCode *status); + +/** + * Put a (key=integer, value=integer) item in a UHashtable. If the + * keyDeleter is non-NULL, then the hashtable owns 'key' after this + * call. valueDeleter must be NULL. + * Storing a 0 value is possible; call uhash_igetiAndFound() to retrieve values including zero. + * + * @param hash The target UHashtable. + * @param key The key to store. + * @param value The integer value to store. + * @param status A pointer to an UErrorCode to receive any errors. + * @return The previous value, or 0 if none. + * @see uhash_igetiAndFound + */ +U_CAPI int32_t U_EXPORT2 +uhash_iputiAllowZero(UHashtable *hash, + int32_t key, + int32_t value, + UErrorCode *status); + +/** * Retrieve a pointer value from a UHashtable using a pointer key, * as previously stored by uhash_put(). * @param hash The target UHashtable. @@ -449,6 +494,34 @@ uhash_igeti(const UHashtable *hash, int32_t key); /** + * Retrieves an integer value from a UHashtable using a pointer key, + * as previously stored by uhash_putiAllowZero() or uhash_puti(). + * + * @param hash The target UHashtable. + * @param key A pointer key stored in a hashtable + * @param found A pointer to a boolean which will be set for whether the key was found. + * @return The requested item, or 0 if not found. + */ +U_CAPI int32_t U_EXPORT2 +uhash_getiAndFound(const UHashtable *hash, + const void *key, + UBool *found); + +/** + * Retrieves an integer value from a UHashtable using an integer key, + * as previously stored by uhash_iputiAllowZero() or uhash_iputi(). + * + * @param hash The target UHashtable. + * @param key An integer key stored in a hashtable + * @param found A pointer to a boolean which will be set for whether the key was found. + * @return The requested item, or 0 if not found. + */ +U_CAPI int32_t U_EXPORT2 +uhash_igetiAndFound(const UHashtable *hash, + int32_t key, + UBool *found); + +/** * Remove an item from a UHashtable stored by uhash_put(). * @param hash The target UHashtable. * @param key A key stored in a hashtable @@ -496,6 +569,26 @@ U_CAPI void U_EXPORT2 uhash_removeAll(UHashtable *hash); /** + * Returns true if the UHashtable contains an item with this pointer key. + * + * @param hash The target UHashtable. + * @param key A pointer key stored in a hashtable + * @return true if the key is found. 
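For context, the zero-value-aware uhash helpers added above (`uhash_putiAllowZero`, `uhash_iputiAllowZero`, `uhash_getiAndFound`, `uhash_igetiAndFound`, `uhash_containsKey`, `uhash_icontainsKey`) let callers distinguish a stored zero from an absent key, which plain `uhash_iputi()`/`uhash_igeti()` cannot. A minimal sketch (not part of the upstream patch), assuming the pre-existing `uhash_open`, `uhash_hashLong`, `uhash_compareLong` and `uhash_close` helpers from this same internal header:

```
#include "uhash.h"              // ICU-internal header (common/uhash.h)
#include "unicode/utypes.h"

static void sketchZeroValues() {
    UErrorCode status = U_ZERO_ERROR;
    // Integer keys, integer values: hash and compare the key as a "long".
    UHashtable *map = uhash_open(uhash_hashLong, uhash_compareLong, nullptr, &status);
    if (U_FAILURE(status)) { return; }

    // With plain uhash_iputi(), storing 0 is the same as removing the entry.
    // The new *AllowZero / *AndFound functions make a stored zero observable.
    uhash_iputiAllowZero(map, /*key=*/42, /*value=*/0, &status);

    UBool found = false;
    int32_t value = uhash_igetiAndFound(map, 42, &found);
    // Here: found != 0 and value == 0; uhash_icontainsKey(map, 42) also reports true.
    (void)value;

    uhash_close(map);
}
```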
+ */ +U_CAPI UBool U_EXPORT2 +uhash_containsKey(const UHashtable *hash, const void *key); + +/** + * Returns true if the UHashtable contains an item with this integer key. + * + * @param hash The target UHashtable. + * @param key An integer key stored in a hashtable + * @return true if the key is found. + */ +U_CAPI UBool U_EXPORT2 +uhash_icontainsKey(const UHashtable *hash, int32_t key); + +/** * Locate an element of a UHashtable. The caller must not modify the * returned object. The primary use of this function is to obtain the * stored key when it may not be identical to the search key. For diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp index ebfbb50650..d96e79b8fd 100644 --- a/thirdparty/icu4c/common/uloc.cpp +++ b/thirdparty/icu4c/common/uloc.cpp @@ -143,7 +143,7 @@ static const char * const LANGUAGES[] = { "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde", "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga", "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk", - "ml", "mn", "mnc", "mni", "mo", + "ml", "mn", "mnc", "mni", "moh", "mos", "mr", "mrj", "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv", "my", "mye", "myv", "mzn", @@ -166,9 +166,9 @@ static const char * const LANGUAGES[] = { "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms", "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr", "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux", - "sv", "sw", "swb", "swc", "syc", "syr", "szl", + "sv", "sw", "swb", "syc", "syr", "szl", "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg", - "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl", + "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi", "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt", "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm", @@ -181,7 +181,7 @@ static const char * const LANGUAGES[] = { "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu", "zun", "zxx", "zza", NULL, - "in", "iw", "ji", "jw", "sh", /* obsolete language codes */ + "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", /* obsolete language codes */ NULL }; @@ -260,7 +260,7 @@ static const char * const LANGUAGES_3[] = { "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde", "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga", "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd", - "mal", "mon", "mnc", "mni", "mol", + "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj", "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv", "mya", "mye", "myv", "mzn", @@ -283,9 +283,9 @@ static const char * const LANGUAGES_3[] = { "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms", "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr", "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux", - "swe", "swa", "swb", "swc", "syc", "syr", "szl", + "swe", "swa", "swb", "syc", "syr", "szl", "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk", - "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl", + "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi", "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt", "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm", @@ -298,8 +298,8 @@ static const char * const LANGUAGES_3[] = { "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul", "zun", "zxx", "zza", NULL, -/* "in", "iw", "ji", "jw", "sh", */ - "ind", "heb", "yid", "jaw", "srp", +/* "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", */ + "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl", NULL }; @@ -334,13 +334,13 @@ static const char * 
const COUNTRIES[] = { "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR", - "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK", - "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER", + "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK", + "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER", "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", - "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS", + "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS", "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", @@ -357,7 +357,7 @@ static const char * const COUNTRIES[] = { "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", - "WS", "YE", "YT", "ZA", "ZM", "ZW", + "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", NULL, "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */ NULL @@ -397,10 +397,10 @@ static const char * const COUNTRIES_3[] = { "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG", /* "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR", */ "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI", -/* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK", */ - "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK", -/* "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER", */ - "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI", +/* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK", */ + "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK", +/* "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER", */ + "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI", /* "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", */ "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA", /* "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", */ @@ -409,8 +409,8 @@ static const char * const COUNTRIES_3[] = { "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM", /* "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", */ "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN", -/* "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS" */ - "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", +/* "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS" */ + "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", */ "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR", /* "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", */ @@ -443,8 +443,8 @@ static const char * const COUNTRIES_3[] = { "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB", /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */ "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF", -/* "WS", "YE", "YT", "ZA", "ZM", "ZW", */ - "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE", +/* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */ + "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", NULL, /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */ "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR", diff --git a/thirdparty/icu4c/common/uloc_keytype.cpp b/thirdparty/icu4c/common/uloc_keytype.cpp index 019da058cf..c289ebe76f 100644 --- 
a/thirdparty/icu4c/common/uloc_keytype.cpp +++ b/thirdparty/icu4c/common/uloc_keytype.cpp @@ -271,7 +271,7 @@ initFromResourceBundle(UErrorCode& sts) { if (U_FAILURE(sts)) { break; } - // check if this is an alias of canoncal legacy type + // check if this is an alias of canonical legacy type if (uprv_compareInvWithUChar(NULL, legacyTypeId, -1, to, toLen) == 0) { const char* from = ures_getKey(typeAliasDataEntry.getAlias()); if (isTZ) { diff --git a/thirdparty/icu4c/common/uloc_tag.cpp b/thirdparty/icu4c/common/uloc_tag.cpp index 7f7fd9119e..1235081bf3 100644 --- a/thirdparty/icu4c/common/uloc_tag.cpp +++ b/thirdparty/icu4c/common/uloc_tag.cpp @@ -129,7 +129,6 @@ static const char* const LEGACY[] = { // Legacy tags with no preferred value in the IANA // registry. Kept for now for the backward compatibility // because ICU has mapped them this way. - "cel-gaulish", "xtg-x-cel-gaulish", "i-default", "en-x-i-default", "i-enochian", "und-x-i-enochian", "i-mingo", "see-x-i-mingo", @@ -647,6 +646,22 @@ _isTKey(const char* s, int32_t len) return FALSE; } +U_CAPI const char * U_EXPORT2 +ultag_getTKeyStart(const char *localeID) { + const char *result = localeID; + const char *sep; + while((sep = uprv_strchr(result, SEP)) != nullptr) { + if (_isTKey(result, static_cast<int32_t>(sep - result))) { + return result; + } + result = ++sep; + } + if (_isTKey(result, -1)) { + return result; + } + return nullptr; +} + static UBool _isTValue(const char* s, int32_t len) { @@ -671,9 +686,13 @@ _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len) const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here. const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end + + if (len < 0) { + len = (int32_t)uprv_strlen(s); + } switch (state) { case kStart: - if (ultag_isLanguageSubtag(s, len)) { + if (ultag_isLanguageSubtag(s, len) && len != 4) { state = kGotLanguage; return TRUE; } @@ -1775,11 +1794,6 @@ _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) return; } - /* Determine if variants already exists */ - if (ultag_getVariantsSize(langtag)) { - posixVariant = TRUE; - } - n = ultag_getExtensionsSize(langtag); /* resolve locale keywords and reordering keys */ @@ -1787,6 +1801,11 @@ _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) key = ultag_getExtensionKey(langtag, i); type = ultag_getExtensionValue(langtag, i); if (*key == LDMLEXT) { + /* Determine if variants already exists */ + if (ultag_getVariantsSize(langtag)) { + posixVariant = TRUE; + } + _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status); if (U_FAILURE(*status)) { break; @@ -2028,7 +2047,10 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } - uprv_memcpy(tagBuf, tag, tagLen); + + if (tagLen > 0) { + uprv_memcpy(tagBuf, tag, tagLen); + } *(tagBuf + tagLen) = 0; /* create a ULanguageTag */ @@ -2692,8 +2714,7 @@ ulocimp_toLanguageTag(const char* localeID, if (U_SUCCESS(tmpStatus)) { if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) { /* return private use only tag */ - static const char PREFIX[] = { PRIVATEUSE, SEP }; - sink.Append(PREFIX, sizeof(PREFIX)); + sink.Append("und-x-", 6); sink.Append(buf.data(), buf.length()); done = TRUE; } else if (strict) { diff --git a/thirdparty/icu4c/common/ulocimp.h b/thirdparty/icu4c/common/ulocimp.h index 5691fe9a77..1f796aa213 100644 --- a/thirdparty/icu4c/common/ulocimp.h +++ 
b/thirdparty/icu4c/common/ulocimp.h @@ -286,6 +286,9 @@ ultag_isUnicodeLocaleType(const char* s, int32_t len); U_CFUNC UBool ultag_isVariantSubtags(const char* s, int32_t len); +U_CAPI const char * U_EXPORT2 +ultag_getTKeyStart(const char *localeID); + U_CFUNC const char* ulocimp_toBcpKey(const char* key); diff --git a/thirdparty/icu4c/common/unicode/bytestream.h b/thirdparty/icu4c/common/unicode/bytestream.h index 044f7a77e7..9735ee0bf8 100644 --- a/thirdparty/icu4c/common/unicode/bytestream.h +++ b/thirdparty/icu4c/common/unicode/bytestream.h @@ -71,7 +71,6 @@ public: */ virtual void Append(const char* bytes, int32_t n) = 0; -#ifndef U_HIDE_DRAFT_API /** * Appends n bytes to this. Same as Append(). * Call AppendU8() with u8"string literals" which are const char * in C++11 @@ -81,7 +80,7 @@ public: * * @param bytes the pointer to the bytes * @param n the number of bytes; must be non-negative - * @draft ICU 67 + * @stable ICU 67 */ inline void AppendU8(const char* bytes, int32_t n) { Append(bytes, n); @@ -97,13 +96,12 @@ public: * * @param bytes the pointer to the bytes * @param n the number of bytes; must be non-negative - * @draft ICU 67 + * @stable ICU 67 */ inline void AppendU8(const char8_t* bytes, int32_t n) { Append(reinterpret_cast<const char*>(bytes), n); } #endif -#endif // U_HIDE_DRAFT_API /** * Returns a writable buffer for appending and writes the buffer's capacity to diff --git a/thirdparty/icu4c/common/unicode/bytestrie.h b/thirdparty/icu4c/common/unicode/bytestrie.h index 85f802df42..271a81d1b4 100644 --- a/thirdparty/icu4c/common/unicode/bytestrie.h +++ b/thirdparty/icu4c/common/unicode/bytestrie.h @@ -30,6 +30,8 @@ #include "unicode/uobject.h" #include "unicode/ustringtrie.h" +class BytesTrieTest; + U_NAMESPACE_BEGIN class ByteSink; @@ -378,6 +380,7 @@ public: private: friend class BytesTrieBuilder; + friend class ::BytesTrieTest; /** * Constructs a BytesTrie reader instance. diff --git a/thirdparty/icu4c/common/unicode/bytestriebuilder.h b/thirdparty/icu4c/common/unicode/bytestriebuilder.h index cae16e48b4..3cff89e443 100644 --- a/thirdparty/icu4c/common/unicode/bytestriebuilder.h +++ b/thirdparty/icu4c/common/unicode/bytestriebuilder.h @@ -30,6 +30,8 @@ #include "unicode/stringpiece.h" #include "unicode/stringtriebuilder.h" +class BytesTrieTest; + U_NAMESPACE_BEGIN class BytesTrieElement; @@ -125,6 +127,8 @@ public: BytesTrieBuilder &clear(); private: + friend class ::BytesTrieTest; + BytesTrieBuilder(const BytesTrieBuilder &other); // no copy constructor BytesTrieBuilder &operator=(const BytesTrieBuilder &other); // no assignment operator @@ -168,6 +172,7 @@ private: virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal); virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node); virtual int32_t writeDeltaTo(int32_t jumpTarget); + static int32_t internalEncodeDelta(int32_t i, char intBytes[]); CharString *strings; // Pointer not object so we need not #include internal charstr.h. 
BytesTrieElement *elements; diff --git a/thirdparty/icu4c/common/unicode/docmain.h b/thirdparty/icu4c/common/unicode/docmain.h index edcb5d4e83..e82678c95f 100644 --- a/thirdparty/icu4c/common/unicode/docmain.h +++ b/thirdparty/icu4c/common/unicode/docmain.h @@ -15,7 +15,7 @@ * \file * \brief (Non API- contains Doxygen definitions) * - * This file contains documentation for Doxygen and doesnot have + * This file contains documentation for Doxygen and does not have * any significance with respect to C or C++ API */ @@ -74,7 +74,7 @@ * </tr> * <tr> * <td>Strings and Character Iteration</td> - * <td>ustring.h, utf8.h, utf16.h, UText, UCharIterator</td> + * <td>ustring.h, utf8.h, utf16.h, icu::StringPiece, UText, UCharIterator, icu::ByteSink</td> * <td>icu::UnicodeString, icu::CharacterIterator, icu::Appendable, icu::StringPiece,icu::ByteSink</td> * </tr> * <tr> @@ -128,9 +128,9 @@ * <td>icu::Normalizer2</td> * </tr> * <tr> - * <td>Calendars</td> + * <td>Calendars and Time Zones</td> * <td>ucal.h</td> - * <td>icu::Calendar</td> + * <td>icu::Calendar, icu::TimeZone</td> * </tr> * <tr> * <td>Date and Time Formatting</td> diff --git a/thirdparty/icu4c/common/unicode/icuplug.h b/thirdparty/icu4c/common/unicode/icuplug.h index 52f810da57..205af360d4 100644 --- a/thirdparty/icu4c/common/unicode/icuplug.h +++ b/thirdparty/icu4c/common/unicode/icuplug.h @@ -117,14 +117,13 @@ /* === Basic types === */ #ifndef U_HIDE_INTERNAL_API +struct UPlugData; /** * @{ - * Opaque structure passed to/from a plugin. - * use the APIs to access it. + * Typedef for opaque structure passed to/from a plugin. + * Use the APIs to access it. * @internal ICU 4.4 Technology Preview */ - -struct UPlugData; typedef struct UPlugData UPlugData; /** @} */ diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h index 63a68b0b7f..0cd068ef32 100644 --- a/thirdparty/icu4c/common/unicode/localematcher.h +++ b/thirdparty/icu4c/common/unicode/localematcher.h @@ -91,8 +91,6 @@ enum ULocMatchDemotion { typedef enum ULocMatchDemotion ULocMatchDemotion; #endif -#ifndef U_FORCE_HIDE_DRAFT_API - /** * Builder option for whether to include or ignore one-way (fallback) match data. * The LocaleMatcher uses CLDR languageMatch data which includes fallback (oneway=true) entries. @@ -108,20 +106,20 @@ typedef enum ULocMatchDemotion ULocMatchDemotion; * but not if it is merely a fallback. * * @see LocaleMatcher::Builder#setDirection(ULocMatchDirection) - * @draft ICU 67 + * @stable ICU 67 */ enum ULocMatchDirection { /** * Locale matching includes one-way matches such as Breton→French. (default) * - * @draft ICU 67 + * @stable ICU 67 */ ULOCMATCH_DIRECTION_WITH_ONE_WAY, /** * Locale matching limited to two-way matches including e.g. Danish↔Norwegian * but ignoring one-way matches. * - * @draft ICU 67 + * @stable ICU 67 */ ULOCMATCH_DIRECTION_ONLY_TWO_WAY }; @@ -129,8 +127,6 @@ enum ULocMatchDirection { typedef enum ULocMatchDirection ULocMatchDirection; #endif -#endif // U_FORCE_HIDE_DRAFT_API - struct UHashtable; U_NAMESPACE_BEGIN @@ -463,14 +459,13 @@ public: */ Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion); -#ifndef U_HIDE_DRAFT_API /** * Option for whether to include or ignore one-way (fallback) match data. * By default, they are included. * * @param direction the match direction to set. 
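The `ULocMatchDirection` values and `LocaleMatcher::Builder::setDirection()` promoted to stable here control whether one-way (fallback) CLDR match data is used. A hedged sketch (not from the upstream sources), assuming the pre-existing `Builder` methods `setSupportedLocalesFromListString()`, `build()` and `getBestMatch()`:

```
#include "unicode/localematcher.h"
#include "unicode/locid.h"
#include "unicode/utypes.h"

static void sketchTwoWayMatching() {
    UErrorCode status = U_ZERO_ERROR;
    icu::LocaleMatcher matcher = icu::LocaleMatcher::Builder()
            .setSupportedLocalesFromListString("fr, en-GB, en")
            .setDirection(ULOCMATCH_DIRECTION_ONLY_TWO_WAY)  // ignore one-way fallback matches
            .build(status);
    const icu::Locale *best =
            matcher.getBestMatch(icu::Locale("en-US"), status);  // expected: the "en" entry
    (void)best;
}
```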
* @return this Builder object - * @draft ICU 67 + * @stable ICU 67 */ Builder &setDirection(ULocMatchDirection direction) { if (U_SUCCESS(errorCode_)) { @@ -478,7 +473,6 @@ public: } return *this; } -#endif // U_HIDE_DRAFT_API #ifndef U_HIDE_DRAFT_API /** @@ -704,7 +698,7 @@ private: LSR *lsrs; int32_t supportedLocalesLength; // These are in preference order: 1. Default locale 2. paradigm locales 3. others. - UHashtable *supportedLsrToIndex; // Map<LSR, Integer> stores index+1 because 0 is "not found" + UHashtable *supportedLsrToIndex; // Map<LSR, Integer> // Array versions of the supportedLsrToIndex keys and values. // The distance lookup loops over the supportedLSRs and returns the index of the best match. const LSR **supportedLSRs; diff --git a/thirdparty/icu4c/common/unicode/locid.h b/thirdparty/icu4c/common/unicode/locid.h index ba858d702a..81f4685d65 100644 --- a/thirdparty/icu4c/common/unicode/locid.h +++ b/thirdparty/icu4c/common/unicode/locid.h @@ -571,15 +571,13 @@ public: */ void minimizeSubtags(UErrorCode& status); -#ifndef U_HIDE_DRAFT_API /** * Canonicalize the locale ID of this object according to CLDR. * @param status the status code - * @draft ICU 67 + * @stable ICU 67 * @see createCanonical */ void canonicalize(UErrorCode& status); -#endif // U_HIDE_DRAFT_API /** * Gets the list of keywords for the specified locale. diff --git a/thirdparty/icu4c/common/unicode/normalizer2.h b/thirdparty/icu4c/common/unicode/normalizer2.h index 5eb1d95caf..2d355250c2 100644 --- a/thirdparty/icu4c/common/unicode/normalizer2.h +++ b/thirdparty/icu4c/common/unicode/normalizer2.h @@ -225,10 +225,8 @@ public: * Normalizes a UTF-8 string and optionally records how source substrings * relate to changed and unchanged result substrings. * - * Currently implemented completely only for "compose" modes, - * such as for NFC, NFKC, and NFKC_Casefold - * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). - * Otherwise currently converts to & from UTF-16 and does not support edits. + * Implemented completely for all built-in modes except for FCD. + * The base class implementation converts to & from UTF-16 and does not support edits. * * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. * @param src Source UTF-8 string. @@ -381,11 +379,9 @@ public: * resolves to "yes" or "no" to provide a definitive result, * at the cost of doing more work in those cases. * - * This works for all normalization modes, - * but it is currently optimized for UTF-8 only for "compose" modes, - * such as for NFC, NFKC, and NFKC_Casefold - * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). - * For other modes it currently converts to UTF-16 and calls isNormalized(). + * This works for all normalization modes. + * It is optimized for UTF-8 for all built-in modes except for FCD. + * The base class implementation converts to UTF-16 and calls isNormalized(). * * @param s UTF-8 input string * @param errorCode Standard ICU error code. Its input value must @@ -543,10 +539,8 @@ public: * Normalizes a UTF-8 string and optionally records how source substrings * relate to changed and unchanged result substrings. * - * Currently implemented completely only for "compose" modes, - * such as for NFC, NFKC, and NFKC_Casefold - * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). - * Otherwise currently converts to & from UTF-16 and does not support edits. + * Implemented completely for most built-in modes except for FCD. + * The base class implementation converts to & from UTF-16 and does not support edits. 
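The `normalizeUTF8()` documentation change above reflects that the UTF-8 path now covers all built-in modes except FCD, not only the compose modes. A sketch of the UTF-8 entry point with edit recording (illustrative only), assuming the pre-existing `Normalizer2::getNFDInstance()`, `StringByteSink` and `Edits` APIs:

```
#include <string>

#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"

static void sketchNormalizeUtf8() {
    UErrorCode status = U_ZERO_ERROR;
    const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
    if (U_FAILURE(status)) { return; }

    std::string out;
    icu::StringByteSink<std::string> sink(&out);
    icu::Edits edits;
    // Per the doc change above, decompose modes like NFD now run natively on UTF-8
    // instead of pivoting through UTF-16.
    nfd->normalizeUTF8(0, icu::StringPiece(u8"K\u00E4se"), sink, &edits, status);
    // edits.hasChanges() is true here: U+00E4 decomposes to 'a' + U+0308.
}
```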
* * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. * @param src Source UTF-8 string. @@ -676,11 +670,9 @@ public: * resolves to "yes" or "no" to provide a definitive result, * at the cost of doing more work in those cases. * - * This works for all normalization modes, - * but it is currently optimized for UTF-8 only for "compose" modes, - * such as for NFC, NFKC, and NFKC_Casefold - * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). - * For other modes it currently converts to UTF-16 and calls isNormalized(). + * This works for all normalization modes. + * It is optimized for UTF-8 for all built-in modes except for FCD. + * The base class implementation converts to UTF-16 and calls isNormalized(). * * @param s UTF-8 input string * @param errorCode Standard ICU error code. Its input value must diff --git a/thirdparty/icu4c/common/unicode/platform.h b/thirdparty/icu4c/common/unicode/platform.h index 2bb2f8b318..cb3a833fef 100644 --- a/thirdparty/icu4c/common/unicode/platform.h +++ b/thirdparty/icu4c/common/unicode/platform.h @@ -880,6 +880,6 @@ namespace std { #else # define U_CALLCONV_FPTR #endif -/* @} */ +/** @} */ #endif // _PLATFORM_H diff --git a/thirdparty/icu4c/common/unicode/stringpiece.h b/thirdparty/icu4c/common/unicode/stringpiece.h index 7d7d871e1f..8c96789e73 100644 --- a/thirdparty/icu4c/common/unicode/stringpiece.h +++ b/thirdparty/icu4c/common/unicode/stringpiece.h @@ -75,12 +75,11 @@ class U_COMMON_API StringPiece : public UMemory { * @stable ICU 4.2 */ StringPiece(const char* str); -#ifndef U_HIDE_DRAFT_API #if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) /** * Constructs from a NUL-terminated const char8_t * pointer. * @param str a NUL-terminated const char8_t * pointer - * @draft ICU 67 + * @stable ICU 67 */ StringPiece(const char8_t* str) : StringPiece(reinterpret_cast<const char*>(str)) {} #endif @@ -88,10 +87,9 @@ class U_COMMON_API StringPiece : public UMemory { * Constructs an empty StringPiece. * Needed for type disambiguation from multiple other overloads. * @param p nullptr - * @draft ICU 67 + * @stable ICU 67 */ StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {} -#endif // U_HIDE_DRAFT_API /** * Constructs from a std::string. @@ -99,17 +97,15 @@ class U_COMMON_API StringPiece : public UMemory { */ StringPiece(const std::string& str) : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { } -#ifndef U_HIDE_DRAFT_API #if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN) /** * Constructs from a std::u8string. - * @draft ICU 67 + * @stable ICU 67 */ StringPiece(const std::u8string& str) : ptr_(reinterpret_cast<const char*>(str.data())), length_(static_cast<int32_t>(str.size())) { } #endif -#endif // U_HIDE_DRAFT_API /** * Constructs from some other implementation of a string piece class, from any @@ -152,18 +148,16 @@ class U_COMMON_API StringPiece : public UMemory { * @stable ICU 4.2 */ StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { } -#ifndef U_HIDE_DRAFT_API #if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) /** * Constructs from a const char8_t * pointer and a specified length. * @param str a const char8_t * pointer (need not be terminated) * @param len the length of the string; must be non-negative - * @draft ICU 67 + * @stable ICU 67 */ StringPiece(const char8_t* str, int32_t len) : StringPiece(reinterpret_cast<const char*>(str), len) {} #endif -#endif // U_HIDE_DRAFT_API /** * Substring of another StringPiece. 
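The `StringPiece` additions promoted from `@draft` to `@stable` ICU 67 around this point (the `char8_t` constructors plus `find()` and `compare()`) can be exercised as in this short sketch (not part of the upstream patch):

```
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"

static void sketchStringPiece() {
    icu::StringPiece haystack(u8"thirdparty/icu4c/common");
    icu::StringPiece needle(u8"icu4c");
    int32_t pos = haystack.find(needle, 0);   // 11: offset of "icu4c" within the path
    int32_t cmp = haystack.compare(needle);   // > 0: "thirdparty..." sorts after "icu4c"
    (void)pos; (void)cmp;
}
```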
@@ -233,13 +227,12 @@ class U_COMMON_API StringPiece : public UMemory { */ void set(const char* str); -#ifndef U_HIDE_DRAFT_API #if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN) /** * Resets the stringpiece to refer to new data. * @param xdata pointer the new string data. Need not be NUL-terminated. * @param len the length of the new data - * @draft ICU 67 + * @stable ICU 67 */ inline void set(const char8_t* xdata, int32_t len) { set(reinterpret_cast<const char*>(xdata), len); @@ -248,13 +241,12 @@ class U_COMMON_API StringPiece : public UMemory { /** * Resets the stringpiece to refer to new data. * @param str a pointer to a NUL-terminated string. - * @draft ICU 67 + * @stable ICU 67 */ inline void set(const char8_t* str) { set(reinterpret_cast<const char*>(str)); } #endif -#endif // U_HIDE_DRAFT_API /** * Removes the first n string units. @@ -286,13 +278,12 @@ class U_COMMON_API StringPiece : public UMemory { } } -#ifndef U_HIDE_DRAFT_API /** * Searches the StringPiece for the given search string (needle); * @param needle The string for which to search. * @param offset Where to start searching within this string (haystack). * @return The offset of needle in haystack, or -1 if not found. - * @draft ICU 67 + * @stable ICU 67 */ int32_t find(StringPiece needle, int32_t offset); @@ -301,10 +292,9 @@ class U_COMMON_API StringPiece : public UMemory { * similar to std::string::compare(). * @param other The string to compare to. * @return below zero if this < other; above zero if this > other; 0 if this == other. - * @draft ICU 67 + * @stable ICU 67 */ int32_t compare(StringPiece other); -#endif // U_HIDE_DRAFT_API /** * Maximum integer, used as a default value for substring methods. diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h index 37189a8598..1249b0b160 100644 --- a/thirdparty/icu4c/common/unicode/ubrk.h +++ b/thirdparty/icu4c/common/unicode/ubrk.h @@ -296,6 +296,8 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, const UChar * text, int32_t textLength, UErrorCode * status); +#ifndef U_HIDE_DEPRECATED_API + /** * Thread safe cloning operation * @param bi iterator to be cloned @@ -312,7 +314,7 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. * @return pointer to the new clone - * @stable ICU 2.0 + * @deprecated ICU 69 Use ubrk_clone() instead. */ U_CAPI UBreakIterator * U_EXPORT2 ubrk_safeClone( @@ -321,6 +323,23 @@ ubrk_safeClone( int32_t *pBufferSize, UErrorCode *status); +#endif /* U_HIDE_DEPRECATED_API */ + +#ifndef U_HIDE_DRAFT_API + +/** + * Thread safe cloning operation. 
+ * @param bi iterator to be cloned + * @param status to indicate whether the operation went on smoothly or there were errors + * @return pointer to the new clone + * @draft ICU 69 + */ +U_CAPI UBreakIterator * U_EXPORT2 +ubrk_clone(const UBreakIterator *bi, + UErrorCode *status); + +#endif // U_HIDE_DRAFT_API + #ifndef U_HIDE_DEPRECATED_API /** diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h index 58f271cfb5..5d784990f2 100644 --- a/thirdparty/icu4c/common/unicode/ucnv.h +++ b/thirdparty/icu4c/common/unicode/ucnv.h @@ -1699,10 +1699,10 @@ ucnv_countAvailable(void); /** * Gets the canonical converter name of the specified converter from a list of - * all available converters contaied in the alias file. All converters + * all available converters contained in the alias file. All converters * in this list can be opened. * - * @param n the index to a converter available on the system (in the range <TT>[0..ucnv_countAvaiable()]</TT>) + * @param n the index to a converter available on the system (in the range <TT>[0..ucnv_countAvailable()]</TT>) * @return a pointer a string (library owned), or <TT>NULL</TT> if the index is out of bounds. * @see ucnv_countAvailable * @stable ICU 2.0 diff --git a/thirdparty/icu4c/common/unicode/ucnvsel.h b/thirdparty/icu4c/common/unicode/ucnvsel.h index 5e0a71cf35..3d7d3327f7 100644 --- a/thirdparty/icu4c/common/unicode/ucnvsel.h +++ b/thirdparty/icu4c/common/unicode/ucnvsel.h @@ -45,11 +45,11 @@ * from the serialized form. */ +struct UConverterSelector; /** * @{ - * The selector data structure + * Typedef for selector data structure. */ -struct UConverterSelector; typedef struct UConverterSelector UConverterSelector; /** @} */ diff --git a/thirdparty/icu4c/common/unicode/unifilt.h b/thirdparty/icu4c/common/unicode/unifilt.h index 420e1a1905..7870b55939 100644 --- a/thirdparty/icu4c/common/unicode/unifilt.h +++ b/thirdparty/icu4c/common/unicode/unifilt.h @@ -40,8 +40,8 @@ U_NAMESPACE_BEGIN * * <code>UnicodeFilter</code> defines a protocol for selecting a * subset of the full range (U+0000 to U+10FFFF) of Unicode characters. - * Currently, filters are used in conjunction with classes like {@link - * Transliterator} to only process selected characters through a + * Currently, filters are used in conjunction with classes like + * {@link Transliterator} to only process selected characters through a * transformation. * * <p>Note: UnicodeFilter currently stubs out two pure virtual methods diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h index 50b6360f3a..8403c4026c 100644 --- a/thirdparty/icu4c/common/unicode/uniset.h +++ b/thirdparty/icu4c/common/unicode/uniset.h @@ -178,8 +178,6 @@ class RuleCharacterIterator; * Unicode property * </table> * - * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p> - * * <p><b>Formal syntax</b></p> * * \htmlonly<blockquote>\endhtmlonly @@ -601,7 +599,7 @@ public: /** * Make this object represent the range `start - end`. - * If `end > start` then this object is set to an empty range. + * If `start > end` then this object is set to an empty range. * A frozen set will not be modified. * * @param start first character in the set, inclusive @@ -1077,7 +1075,7 @@ public: /** * Adds the specified range to this set if it is not already * present. If this set already contains the specified range, - * the call leaves this set unchanged. If <code>end > start</code> + * the call leaves this set unchanged. 
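The new `ubrk_clone()` C API added in `ubrk.h` above replaces the now-deprecated `ubrk_safeClone()` and needs no caller-supplied buffer. A sketch (illustrative only), assuming the pre-existing `ubrk_setText()`, `ubrk_first()`, `ubrk_next()` and `ubrk_close()` functions:

```
#include "unicode/ubrk.h"
#include "unicode/utypes.h"

// Count boundaries on a private clone so the shared iterator stays untouched.
static int32_t countBoundaries(const UBreakIterator *shared,
                               const UChar *text, int32_t length,
                               UErrorCode *status) {
    UBreakIterator *bi = ubrk_clone(shared, status);  // replaces deprecated ubrk_safeClone()
    if (U_FAILURE(*status)) { return 0; }
    ubrk_setText(bi, text, length, status);
    int32_t n = 0;
    for (int32_t pos = ubrk_first(bi); pos != UBRK_DONE; pos = ubrk_next(bi)) {
        ++n;
    }
    ubrk_close(bi);
    return n;
}
```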
If <code>start > end</code> * then an empty range is added, leaving the set unchanged. * This is equivalent to a boolean logic OR, or a set UNION. * A frozen set will not be modified. @@ -1095,6 +1093,9 @@ public: * present. If this set already contains the specified character, * the call leaves this set unchanged. * A frozen set will not be modified. + * + * @param c the character (code point) + * @return this object, for chaining * @stable ICU 2.0 */ UnicodeSet& add(UChar32 c); @@ -1104,8 +1105,8 @@ public: * present. If this set already contains the multicharacter, * the call leaves this set unchanged. * Thus "ch" => {"ch"} - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> * A frozen set will not be modified. + * * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -1124,8 +1125,8 @@ public: public: /** - * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} - * If this set already any particular character, it has no effect on that character. + * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} + * If this set already contains any particular character, it has no effect on that character. * A frozen set will not be modified. * @param s the source string * @return this object, for chaining @@ -1135,7 +1136,6 @@ public: /** * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} - * If this set already any particular character, it has no effect on that character. * A frozen set will not be modified. * @param s the source string * @return this object, for chaining @@ -1145,7 +1145,6 @@ public: /** * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} - * If this set already any particular character, it has no effect on that character. * A frozen set will not be modified. * @param s the source string * @return this object, for chaining @@ -1155,7 +1154,6 @@ public: /** * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} - * If this set already any particular character, it has no effect on that character. * A frozen set will not be modified. * @param s the source string * @return this object, for chaining @@ -1165,7 +1163,7 @@ public: /** * Makes a set from a multicharacter string. Thus "ch" => {"ch"} - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> + * * @param s the source string * @return a newly created set containing the given string. * The caller owns the return object and is responsible for deleting it. @@ -1185,15 +1183,13 @@ public: /** * Retain only the elements in this set that are contained in the - * specified range. If <code>end > start</code> then an empty range is + * specified range. If <code>start > end</code> then an empty range is * retained, leaving the set empty. This is equivalent to * a boolean logic AND, or a set INTERSECTION. * A frozen set will not be modified. * - * @param start first character, inclusive, of range to be retained - * to this set. - * @param end last character, inclusive, of range to be retained - * to this set. + * @param start first character, inclusive, of range + * @param end last character, inclusive, of range * @stable ICU 2.0 */ virtual UnicodeSet& retain(UChar32 start, UChar32 end); @@ -1202,14 +1198,31 @@ public: /** * Retain the specified character from this set if it is present. * A frozen set will not be modified. 
+ * + * @param c the character (code point) + * @return this object, for chaining * @stable ICU 2.0 */ UnicodeSet& retain(UChar32 c); +#ifndef U_HIDE_DRAFT_API + /** + * Retains only the specified string from this set if it is present. + * Upon return this set will be empty if it did not contain s, or + * will only contain s if it did contain s. + * A frozen set will not be modified. + * + * @param s the source string + * @return this object, for chaining + * @draft ICU 69 + */ + UnicodeSet& retain(const UnicodeString &s); +#endif // U_HIDE_DRAFT_API + /** * Removes the specified range from this set if it is present. * The set will not contain the specified range once the call - * returns. If <code>end > start</code> then an empty range is + * returns. If <code>start > end</code> then an empty range is * removed, leaving the set unchanged. * A frozen set will not be modified. * @@ -1226,6 +1239,9 @@ public: * The set will not contain the specified range once the call * returns. * A frozen set will not be modified. + * + * @param c the character (code point) + * @return this object, for chaining * @stable ICU 2.0 */ UnicodeSet& remove(UChar32 c); @@ -1253,15 +1269,13 @@ public: /** * Complements the specified range in this set. Any character in * the range will be removed if it is in this set, or will be - * added if it is not in this set. If <code>end > start</code> + * added if it is not in this set. If <code>start > end</code> * then an empty range is complemented, leaving the set unchanged. * This is equivalent to a boolean logic XOR. * A frozen set will not be modified. * - * @param start first character, inclusive, of range to be removed - * from this set. - * @param end last character, inclusive, of range to be removed - * from this set. + * @param start first character, inclusive, of range + * @param end last character, inclusive, of range * @stable ICU 2.0 */ virtual UnicodeSet& complement(UChar32 start, UChar32 end); @@ -1271,16 +1285,18 @@ public: * will be removed if it is in this set, or will be added if it is * not in this set. * A frozen set will not be modified. + * + * @param c the character (code point) + * @return this object, for chaining * @stable ICU 2.0 */ UnicodeSet& complement(UChar32 c); /** * Complement the specified string in this set. - * The set will not contain the specified string once the call - * returns. - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> + * The string will be removed if it is in this set, or will be added if it is not in this set. * A frozen set will not be modified. + * * @param s the string to complement * @return this object, for chaining * @stable ICU 2.4 diff --git a/thirdparty/icu4c/common/unicode/unistr.h b/thirdparty/icu4c/common/unicode/unistr.h index 456389f265..85bd964951 100644 --- a/thirdparty/icu4c/common/unicode/unistr.h +++ b/thirdparty/icu4c/common/unicode/unistr.h @@ -44,9 +44,10 @@ struct UConverter; // unicode/ucnv.h #ifndef USTRING_H /** * \ingroup ustring_ustrlen + * @param s Pointer to sequence of UChars. + * @return Length of sequence. */ -U_CAPI int32_t U_EXPORT2 -u_strlen(const UChar *s); +U_CAPI int32_t U_EXPORT2 u_strlen(const UChar *s); #endif U_NAMESPACE_BEGIN @@ -2766,7 +2767,6 @@ public: * @param options Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE, * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. - * @param options Options bit set, see ucasemap_open(). * @return A reference to this. 
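The `UnicodeSet::retain(const UnicodeString &)` overload added above (draft ICU 69) retains a single multi-character string rather than each of its characters. A sketch (not from the upstream sources), assuming the pre-existing pattern constructor:

```
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utypes.h"

static void sketchRetainString() {
    UErrorCode status = U_ZERO_ERROR;
    icu::UnicodeSet set(icu::UnicodeString(u"[a-z{ch}]"), status);  // a..z plus the string "ch"
    if (U_FAILURE(status)) { return; }
    // New draft ICU 69 overload: keep only the string "ch".
    // Since "ch" was present, the set now contains exactly {"ch"}.
    set.retain(icu::UnicodeString(u"ch"));
}
```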
* @stable ICU 3.8 */ @@ -3614,7 +3614,7 @@ private: // turn a bogus string into an empty one void unBogus(); - // implements assigment operator, copy constructor, and fastCopyFrom() + // implements assignment operator, copy constructor, and fastCopyFrom() UnicodeString ©From(const UnicodeString &src, UBool fastCopy=false); // Copies just the fields without memory management. diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h index fe59fdd893..737f4b308e 100644 --- a/thirdparty/icu4c/common/unicode/urename.h +++ b/thirdparty/icu4c/common/unicode/urename.h @@ -482,6 +482,7 @@ #define ubiditransform_open U_ICU_ENTRY_POINT_RENAME(ubiditransform_open) #define ubiditransform_transform U_ICU_ENTRY_POINT_RENAME(ubiditransform_transform) #define ublock_getCode U_ICU_ENTRY_POINT_RENAME(ublock_getCode) +#define ubrk_clone U_ICU_ENTRY_POINT_RENAME(ubrk_clone) #define ubrk_close U_ICU_ENTRY_POINT_RENAME(ubrk_close) #define ubrk_countAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_countAvailable) #define ubrk_current U_ICU_ENTRY_POINT_RENAME(ubrk_current) @@ -534,6 +535,7 @@ #define ucal_getTimeZoneDisplayName U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneDisplayName) #define ucal_getTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneID) #define ucal_getTimeZoneIDForWindowsID U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneIDForWindowsID) +#define ucal_getTimeZoneOffsetFromLocal U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneOffsetFromLocal) #define ucal_getTimeZoneTransitionDate U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneTransitionDate) #define ucal_getType U_ICU_ENTRY_POINT_RENAME(ucal_getType) #define ucal_getWeekendTransition U_ICU_ENTRY_POINT_RENAME(ucal_getWeekendTransition) @@ -962,6 +964,7 @@ #define uhash_compareScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_compareScriptSet) #define uhash_compareUChars U_ICU_ENTRY_POINT_RENAME(uhash_compareUChars) #define uhash_compareUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_compareUnicodeString) +#define uhash_containsKey U_ICU_ENTRY_POINT_RENAME(uhash_containsKey) #define uhash_count U_ICU_ENTRY_POINT_RENAME(uhash_count) #define uhash_deleteHashtable U_ICU_ENTRY_POINT_RENAME(uhash_deleteHashtable) #define uhash_deleteScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_deleteScriptSet) @@ -970,6 +973,7 @@ #define uhash_find U_ICU_ENTRY_POINT_RENAME(uhash_find) #define uhash_get U_ICU_ENTRY_POINT_RENAME(uhash_get) #define uhash_geti U_ICU_ENTRY_POINT_RENAME(uhash_geti) +#define uhash_getiAndFound U_ICU_ENTRY_POINT_RENAME(uhash_getiAndFound) #define uhash_hashCaselessUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_hashCaselessUnicodeString) #define uhash_hashChars U_ICU_ENTRY_POINT_RENAME(uhash_hashChars) #define uhash_hashIChars U_ICU_ENTRY_POINT_RENAME(uhash_hashIChars) @@ -977,12 +981,15 @@ #define uhash_hashScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_hashScriptSet) #define uhash_hashUChars U_ICU_ENTRY_POINT_RENAME(uhash_hashUChars) #define uhash_hashUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_hashUnicodeString) +#define uhash_icontainsKey U_ICU_ENTRY_POINT_RENAME(uhash_icontainsKey) #define uhash_iget U_ICU_ENTRY_POINT_RENAME(uhash_iget) #define uhash_igeti U_ICU_ENTRY_POINT_RENAME(uhash_igeti) +#define uhash_igetiAndFound U_ICU_ENTRY_POINT_RENAME(uhash_igetiAndFound) #define uhash_init U_ICU_ENTRY_POINT_RENAME(uhash_init) #define uhash_initSize U_ICU_ENTRY_POINT_RENAME(uhash_initSize) #define uhash_iput U_ICU_ENTRY_POINT_RENAME(uhash_iput) #define uhash_iputi U_ICU_ENTRY_POINT_RENAME(uhash_iputi) +#define uhash_iputiAllowZero 
U_ICU_ENTRY_POINT_RENAME(uhash_iputiAllowZero) #define uhash_iremove U_ICU_ENTRY_POINT_RENAME(uhash_iremove) #define uhash_iremovei U_ICU_ENTRY_POINT_RENAME(uhash_iremovei) #define uhash_nextElement U_ICU_ENTRY_POINT_RENAME(uhash_nextElement) @@ -990,6 +997,7 @@ #define uhash_openSize U_ICU_ENTRY_POINT_RENAME(uhash_openSize) #define uhash_put U_ICU_ENTRY_POINT_RENAME(uhash_put) #define uhash_puti U_ICU_ENTRY_POINT_RENAME(uhash_puti) +#define uhash_putiAllowZero U_ICU_ENTRY_POINT_RENAME(uhash_putiAllowZero) #define uhash_remove U_ICU_ENTRY_POINT_RENAME(uhash_remove) #define uhash_removeAll U_ICU_ENTRY_POINT_RENAME(uhash_removeAll) #define uhash_removeElement U_ICU_ENTRY_POINT_RENAME(uhash_removeElement) @@ -1150,6 +1158,8 @@ #define ultag_isUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleKey) #define ultag_isUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleType) #define ultag_isVariantSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isVariantSubtags) +#define umeas_getPrefixBase U_ICU_ENTRY_POINT_RENAME(umeas_getPrefixBase) +#define umeas_getPrefixPower U_ICU_ENTRY_POINT_RENAME(umeas_getPrefixPower) #define umsg_applyPattern U_ICU_ENTRY_POINT_RENAME(umsg_applyPattern) #define umsg_autoQuoteApostrophe U_ICU_ENTRY_POINT_RENAME(umsg_autoQuoteApostrophe) #define umsg_clone U_ICU_ENTRY_POINT_RENAME(umsg_clone) @@ -1672,6 +1682,9 @@ #define uset_compact U_ICU_ENTRY_POINT_RENAME(uset_compact) #define uset_complement U_ICU_ENTRY_POINT_RENAME(uset_complement) #define uset_complementAll U_ICU_ENTRY_POINT_RENAME(uset_complementAll) +#define uset_complementAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_complementAllCodePoints) +#define uset_complementRange U_ICU_ENTRY_POINT_RENAME(uset_complementRange) +#define uset_complementString U_ICU_ENTRY_POINT_RENAME(uset_complementString) #define uset_contains U_ICU_ENTRY_POINT_RENAME(uset_contains) #define uset_containsAll U_ICU_ENTRY_POINT_RENAME(uset_containsAll) #define uset_containsAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_containsAllCodePoints) @@ -1695,12 +1708,15 @@ #define uset_openPatternOptions U_ICU_ENTRY_POINT_RENAME(uset_openPatternOptions) #define uset_remove U_ICU_ENTRY_POINT_RENAME(uset_remove) #define uset_removeAll U_ICU_ENTRY_POINT_RENAME(uset_removeAll) +#define uset_removeAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_removeAllCodePoints) #define uset_removeAllStrings U_ICU_ENTRY_POINT_RENAME(uset_removeAllStrings) #define uset_removeRange U_ICU_ENTRY_POINT_RENAME(uset_removeRange) #define uset_removeString U_ICU_ENTRY_POINT_RENAME(uset_removeString) #define uset_resemblesPattern U_ICU_ENTRY_POINT_RENAME(uset_resemblesPattern) #define uset_retain U_ICU_ENTRY_POINT_RENAME(uset_retain) #define uset_retainAll U_ICU_ENTRY_POINT_RENAME(uset_retainAll) +#define uset_retainAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_retainAllCodePoints) +#define uset_retainString U_ICU_ENTRY_POINT_RENAME(uset_retainString) #define uset_serialize U_ICU_ENTRY_POINT_RENAME(uset_serialize) #define uset_serializedContains U_ICU_ENTRY_POINT_RENAME(uset_serializedContains) #define uset_set U_ICU_ENTRY_POINT_RENAME(uset_set) diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h index 502ea8dc14..1d0daf9d09 100644 --- a/thirdparty/icu4c/common/unicode/uset.h +++ b/thirdparty/icu4c/common/unicode/uset.h @@ -582,8 +582,8 @@ U_CAPI void U_EXPORT2 uset_addString(USet* set, const UChar* str, int32_t strLen); /** - * Adds each of the characters in this string to the set. 
Thus "ch" => {"c", "h"} - * If this set already any particular character, it has no effect on that character. + * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} + * If this set already contains any particular character, it has no effect on that character. * A frozen set will not be modified. * @param set the object to which to add the character * @param str the source string @@ -628,6 +628,20 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end); U_CAPI void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); +#ifndef U_HIDE_DRAFT_API +/** + * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} + * A frozen set will not be modified. + * + * @param set the object to be modified + * @param str the string + * @param length the length of the string, or -1 if NUL-terminated + * @draft ICU 69 + */ +U_CAPI void U_EXPORT2 +uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); +#endif // U_HIDE_DRAFT_API + /** * Removes from this set all of its elements that are contained in the * specified set. This operation effectively modifies this @@ -650,15 +664,41 @@ uset_removeAll(USet* set, const USet* removeSet); * A frozen set will not be modified. * * @param set the object for which to retain only the specified range - * @param start first character, inclusive, of range to be retained - * to this set. - * @param end last character, inclusive, of range to be retained - * to this set. + * @param start first character, inclusive, of range + * @param end last character, inclusive, of range * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_retain(USet* set, UChar32 start, UChar32 end); +#ifndef U_HIDE_DRAFT_API +/** + * Retains only the specified string from this set if it is present. + * Upon return this set will be empty if it did not contain s, or + * will only contain s if it did contain s. + * A frozen set will not be modified. + * + * @param set the object to be modified + * @param str the string + * @param length the length of the string, or -1 if NUL-terminated + * @draft ICU 69 + */ +U_CAPI void U_EXPORT2 +uset_retainString(USet *set, const UChar *str, int32_t length); + +/** + * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} + * A frozen set will not be modified. + * + * @param set the object to be modified + * @param str the string + * @param length the length of the string, or -1 if NUL-terminated + * @draft ICU 69 + */ +U_CAPI void U_EXPORT2 +uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); +#endif // U_HIDE_DRAFT_API + /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of @@ -696,6 +736,49 @@ uset_compact(USet* set); U_CAPI void U_EXPORT2 uset_complement(USet* set); +#ifndef U_HIDE_DRAFT_API +/** + * Complements the specified range in this set. Any character in + * the range will be removed if it is in this set, or will be + * added if it is not in this set. If <code>start > end</code> + * then an empty range is complemented, leaving the set unchanged. + * This is equivalent to a boolean logic XOR. + * A frozen set will not be modified. + * + * @param set the object to be modified + * @param start first character, inclusive, of range + * @param end last character, inclusive, of range + * @draft ICU 69 + */ +U_CAPI void U_EXPORT2 +uset_complementRange(USet *set, UChar32 start, UChar32 end); + +/** + * Complements the specified string in this set. 
+ * The string will be removed if it is in this set, or will be added if it is not in this set. + * A frozen set will not be modified. + * + * @param set the object to be modified + * @param str the string + * @param length the length of the string, or -1 if NUL-terminated + * @draft ICU 69 + */ +U_CAPI void U_EXPORT2 +uset_complementString(USet *set, const UChar *str, int32_t length); + +/** + * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"} + * A frozen set will not be modified. + * + * @param set the object to be modified + * @param str the string + * @param length the length of the string, or -1 if NUL-terminated + * @draft ICU 69 + */ +U_CAPI void U_EXPORT2 +uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); +#endif // U_HIDE_DRAFT_API + /** * Complements in this set all elements contained in the specified * set. Any character in the other set will be removed if it is diff --git a/thirdparty/icu4c/common/unicode/ushape.h b/thirdparty/icu4c/common/unicode/ushape.h index fed4869abd..14371edc8f 100644 --- a/thirdparty/icu4c/common/unicode/ushape.h +++ b/thirdparty/icu4c/common/unicode/ushape.h @@ -323,7 +323,7 @@ u_shapeArabic(const UChar *source, int32_t sourceLength, #define U_SHAPE_PRESERVE_PRESENTATION 0x8000 /** Presentation form option: * Replace Arabic Presentation Forms-A and Arabic Presentationo Forms-B with - * their unshaped correspondants in range 0+06xx, before shaping. + * their unshaped correspondents in range 0+06xx, before shaping. * @stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION_NOOP 0 diff --git a/thirdparty/icu4c/common/unicode/utrace.h b/thirdparty/icu4c/common/unicode/utrace.h index 28c313c582..677486f473 100644 --- a/thirdparty/icu4c/common/unicode/utrace.h +++ b/thirdparty/icu4c/common/unicode/utrace.h @@ -173,24 +173,23 @@ typedef enum UTraceFunctionNumber { UTRACE_RES_DATA_LIMIT, #endif // U_HIDE_INTERNAL_API -#ifndef U_HIDE_DRAFT_API /** * The lowest break iterator location. - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_START=0x4000, /** * Indicates that a character instance of break iterator was created. * - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_CREATE_CHARACTER = UTRACE_UBRK_START, /** * Indicates that a word instance of break iterator was created. * - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_CREATE_WORD, @@ -200,21 +199,21 @@ typedef enum UTraceFunctionNumber { * Provides one C-style string to UTraceData: the lb value ("", * "loose", "strict", or "normal"). * - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_CREATE_LINE, /** * Indicates that a sentence instance of break iterator was created. * - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_CREATE_SENTENCE, /** * Indicates that a title instance of break iterator was created. * - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_CREATE_TITLE, @@ -224,12 +223,10 @@ typedef enum UTraceFunctionNumber { * Provides one C-style string to UTraceData: the script code of what * the break engine cover ("Hani", "Khmr", "Laoo", "Mymr", or "Thai"). * - * @draft ICU 67 + * @stable ICU 67 */ UTRACE_UBRK_CREATE_BREAK_ENGINE, -#endif // U_HIDE_DRAFT_API - #ifndef U_HIDE_INTERNAL_API /** * One more than the highest normal break iterator trace location. 
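As a quick illustration of the draft USet string operations introduced in the uset.h hunk above (`uset_removeAllCodePoints`, `uset_retainString`, `uset_retainAllCodePoints`, `uset_complementRange`, `uset_complementString`, `uset_complementAllCodePoints`), a minimal C sketch might look as follows. It is not part of the vendored sources; it assumes an ICU 69 build where `U_HIDE_DRAFT_API` is left undefined, and the sample set contents in the comments simply restate the documented semantics rather than tested output.

```
/* Minimal sketch only; assumes U_HIDE_DRAFT_API is not defined. */
#include <stdio.h>
#include <unicode/uset.h>

int main(void) {
    static const UChar ch[] = { 0x0063, 0x0068, 0 };   /* the string "ch" */
    USet *set = uset_open(0x61, 0x7A);                 /* [a-z] */
    uset_addString(set, ch, -1);                       /* add "ch" as a single element */

    uset_complementRange(set, 0x61, 0x63);             /* toggle a-c out: [d-z {ch}] */
    uset_retainString(set, ch, -1);                    /* keep only the string "ch" */

    printf("contains \"ch\": %d, items: %d\n",
           (int)uset_containsString(set, ch, -1),
           (int)uset_getItemCount(set));

    uset_close(set);
    return 0;
}
```
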
diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h index a46481a3fe..b09d4943c1 100644 --- a/thirdparty/icu4c/common/unicode/uvernum.h +++ b/thirdparty/icu4c/common/unicode/uvernum.h @@ -60,13 +60,13 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION_MAJOR_NUM 68 +#define U_ICU_VERSION_MAJOR_NUM 69 /** The current ICU minor version as an integer. * This value will change in the subsequent releases of ICU * @stable ICU 2.6 */ -#define U_ICU_VERSION_MINOR_NUM 2 +#define U_ICU_VERSION_MINOR_NUM 1 /** The current ICU patchlevel version as an integer. * This value will change in the subsequent releases of ICU @@ -86,7 +86,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.6 */ -#define U_ICU_VERSION_SUFFIX _68 +#define U_ICU_VERSION_SUFFIX _69 /** * \def U_DEF2_ICU_ENTRY_POINT_RENAME @@ -139,7 +139,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION "68.2" +#define U_ICU_VERSION "69.1" /** * The current ICU library major version number as a string, for library name suffixes. @@ -152,13 +152,13 @@ * * @stable ICU 2.6 */ -#define U_ICU_VERSION_SHORT "68" +#define U_ICU_VERSION_SHORT "69" #ifndef U_HIDE_INTERNAL_API /** Data version in ICU4C. * @internal ICU 4.4 Internal Use Only **/ -#define U_ICU_DATA_VERSION "68.2" +#define U_ICU_DATA_VERSION "69.1" #endif /* U_HIDE_INTERNAL_API */ /*=========================================================================== diff --git a/thirdparty/icu4c/common/uniset.cpp b/thirdparty/icu4c/common/uniset.cpp index b73d612f24..461e5a7197 100644 --- a/thirdparty/icu4c/common/uniset.cpp +++ b/thirdparty/icu4c/common/uniset.cpp @@ -30,24 +30,6 @@ #include "bmpset.h" #include "unisetspan.h" -// Define UChar constants using hex for EBCDIC compatibility -// Used #define to reduce private static exports and memory access time. -#define SET_OPEN ((UChar)0x005B) /*[*/ -#define SET_CLOSE ((UChar)0x005D) /*]*/ -#define HYPHEN ((UChar)0x002D) /*-*/ -#define COMPLEMENT ((UChar)0x005E) /*^*/ -#define COLON ((UChar)0x003A) /*:*/ -#define BACKSLASH ((UChar)0x005C) /*\*/ -#define INTERSECTION ((UChar)0x0026) /*&*/ -#define UPPER_U ((UChar)0x0055) /*U*/ -#define LOWER_U ((UChar)0x0075) /*u*/ -#define OPEN_BRACE ((UChar)123) /*{*/ -#define CLOSE_BRACE ((UChar)125) /*}*/ -#define UPPER_P ((UChar)0x0050) /*P*/ -#define LOWER_P ((UChar)0x0070) /*p*/ -#define UPPER_N ((UChar)78) /*N*/ -#define EQUALS ((UChar)0x003D) /*=*/ - // HIGH_VALUE > all valid values. 
110000 for codepoints #define UNICODESET_HIGH 0x0110000 @@ -444,7 +426,6 @@ UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { * @return <tt>true</tt> if this set contains the specified string */ UBool UnicodeSet::contains(const UnicodeString& s) const { - if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { return stringsContains(s); @@ -559,11 +540,9 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { if (hasStrings()) { for (i=0; i<strings->size(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); - //if (s.length() == 0) { - // // Empty strings match everything - // return TRUE; - //} - // assert(s.length() != 0); // We enforce this elsewhere + if (s.isEmpty()) { + continue; // skip the empty string + } UChar32 c = s.char32At(0); if ((c & 0xFF) == v) { return TRUE; @@ -582,9 +561,6 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, int32_t limit, UBool incremental) { if (offset == limit) { - // Strings, if any, have length != 0, so we don't worry - // about them here. If we ever allow zero-length strings - // we much check for them here. if (contains(U_ETHER)) { return incremental ? U_PARTIAL_MATCH : U_MATCH; } else { @@ -614,11 +590,9 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, for (i=0; i<strings->size(); ++i) { const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); - - //if (trial.length() == 0) { - // return U_MATCH; // null-string always matches - //} - // assert(trial.length() != 0); // We ensure this elsewhere + if (trial.isEmpty()) { + continue; // skip the empty string + } UChar c = trial.charAt(forward ? 0 : trial.length() - 1); @@ -971,12 +945,12 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { * present. If this set already contains the multicharacter, * the call leaves this set unchanged. * Thus "ch" => {"ch"} - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> + * * @param s the source string * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { - if (s.length() == 0 || isFrozen() || isBogus()) return *this; + if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { if (!stringsContains(s)) { @@ -991,8 +965,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { /** * Adds the given string, in order, to 'strings'. The given string - * must have been checked by the caller to not be empty and to not - * already be in 'strings'. + * must have been checked by the caller to not already be in 'strings'. 
*/ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { @@ -1021,16 +994,13 @@ void UnicodeSet::_add(const UnicodeString& s) { * @param string to test */ int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { - //if (s.length() < 1) { - // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); - //} - if (s.length() > 2) return -1; - if (s.length() == 1) return s.charAt(0); - - // at this point, len = 2 - UChar32 cp = s.char32At(0); - if (cp > 0xFFFF) { // is surrogate pair - return cp; + int32_t sLength = s.length(); + if (sLength == 1) return s.charAt(0); + if (sLength == 2) { + UChar32 cp = s.char32At(0); + if (cp > 0xFFFF) { // is surrogate pair + return cp; + } } return -1; } @@ -1150,6 +1120,26 @@ UnicodeSet& UnicodeSet::retain(UChar32 c) { return retain(c, c); } +UnicodeSet& UnicodeSet::retain(const UnicodeString &s) { + if (isFrozen() || isBogus()) { return *this; } + UChar32 cp = getSingleCP(s); + if (cp < 0) { + bool isIn = stringsContains(s); + // Check for getRangeCount() first to avoid somewhat-expensive size() + // when there are single code points. + if (isIn && getRangeCount() == 0 && size() == 1) { + return *this; + } + clear(); + if (isIn) { + _add(s); + } + } else { + retain(cp, cp); + } + return *this; +} + /** * Removes the specified range from this set if it is present. * The set will not contain the specified range once the call @@ -1186,7 +1176,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) { * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { - if (s.length() == 0 || isFrozen() || isBogus()) return *this; + if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { if (strings != nullptr && strings->removeElement((void*) &s)) { @@ -1252,12 +1242,12 @@ UnicodeSet& UnicodeSet::complement(void) { * Complement the specified string in this set. * The set will not contain the specified string once the call * returns. - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> + * * @param s the string to complement * @return this object, for chaining */ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { - if (s.length() == 0 || isFrozen() || isBogus()) return *this; + if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { if (stringsContains(s)) { @@ -2001,22 +1991,22 @@ escapeUnprintable) { } // Okay to let ':' pass through switch (c) { - case SET_OPEN: - case SET_CLOSE: - case HYPHEN: - case COMPLEMENT: - case INTERSECTION: - case BACKSLASH: - case OPEN_BRACE: - case CLOSE_BRACE: - case COLON: + case u'[': + case u']': + case u'-': + case u'^': + case u'&': + case u'\\': + case u'{': + case u'}': + case u':': case SymbolTable::SYMBOL_REF: - buf.append(BACKSLASH); + buf.append(u'\\'); break; default: // Escape whitespace if (PatternProps::isWhiteSpace(c)) { - buf.append(BACKSLASH); + buf.append(u'\\'); } break; } @@ -2049,7 +2039,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, backslashCount = 0; } else { result.append(c); - if (c == BACKSLASH) { + if (c == u'\\') { ++backslashCount; } else { backslashCount = 0; @@ -2082,13 +2072,13 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result, UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, UBool escapeUnprintable) const { - result.append(SET_OPEN); + result.append(u'['); // // Check against the predefined categories. 
We implicitly build // // up ALL category sets the first time toPattern() is called. // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { // if (*this == getCategorySet(cat)) { -// result.append(COLON); +// result.append(u':'); // result.append(CATEGORY_NAMES, cat*2, 2); // return result.append(CATEGORY_CLOSE); // } @@ -2104,7 +2094,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, getRangeEnd(count-1) == MAX_VALUE) { // Emit the inverse - result.append(COMPLEMENT); + result.append(u'^'); for (int32_t i = 1; i < count; ++i) { UChar32 start = getRangeEnd(i-1)+1; @@ -2112,7 +2102,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, _appendToPat(result, start, escapeUnprintable); if (start != end) { if ((start+1) != end) { - result.append(HYPHEN); + result.append(u'-'); } _appendToPat(result, end, escapeUnprintable); } @@ -2127,7 +2117,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, _appendToPat(result, start, escapeUnprintable); if (start != end) { if ((start+1) != end) { - result.append(HYPHEN); + result.append(u'-'); } _appendToPat(result, end, escapeUnprintable); } @@ -2136,14 +2126,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, if (strings != nullptr) { for (int32_t i = 0; i<strings->size(); ++i) { - result.append(OPEN_BRACE); + result.append(u'{'); _appendToPat(result, *(const UnicodeString*) strings->elementAt(i), escapeUnprintable); - result.append(CLOSE_BRACE); + result.append(u'}'); } } - return result.append(SET_CLOSE); + return result.append(u']'); } /** diff --git a/thirdparty/icu4c/common/uniset_props.cpp b/thirdparty/icu4c/common/uniset_props.cpp index 37277fcb75..8fde5abcdd 100644 --- a/thirdparty/icu4c/common/uniset_props.cpp +++ b/thirdparty/icu4c/common/uniset_props.cpp @@ -47,31 +47,6 @@ U_NAMESPACE_USE -// Define UChar constants using hex for EBCDIC compatibility -// Used #define to reduce private static exports and memory access time. -#define SET_OPEN ((UChar)0x005B) /*[*/ -#define SET_CLOSE ((UChar)0x005D) /*]*/ -#define HYPHEN ((UChar)0x002D) /*-*/ -#define COMPLEMENT ((UChar)0x005E) /*^*/ -#define COLON ((UChar)0x003A) /*:*/ -#define BACKSLASH ((UChar)0x005C) /*\*/ -#define INTERSECTION ((UChar)0x0026) /*&*/ -#define UPPER_U ((UChar)0x0055) /*U*/ -#define LOWER_U ((UChar)0x0075) /*u*/ -#define OPEN_BRACE ((UChar)123) /*{*/ -#define CLOSE_BRACE ((UChar)125) /*}*/ -#define UPPER_P ((UChar)0x0050) /*P*/ -#define LOWER_P ((UChar)0x0070) /*p*/ -#define UPPER_N ((UChar)78) /*N*/ -#define EQUALS ((UChar)0x003D) /*=*/ - -//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" -static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" -//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" -//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" -//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" -static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ - // Special property set IDs static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] static const char ASCII[] = "ASCII"; // [\u0000-\u007F] @@ -81,12 +56,6 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:] #define NAME_PROP "na" #define NAME_PROP_LENGTH 2 -/** - * Delimiter string used in patterns to close a category reference: - * ":]". Example: "[:Lu:]". 
- */ -//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ - // Cached sets ------------------------------------------------------------- *** U_CDECL_BEGIN @@ -140,27 +109,27 @@ uniset_getUnicode32Instance(UErrorCode &errorCode) { static inline UBool isPerlOpen(const UnicodeString &pattern, int32_t pos) { UChar c; - return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); + return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P'); } /*static inline UBool isPerlClose(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==CLOSE_BRACE; + return pattern.charAt(pos)==u'}'; }*/ static inline UBool isNameOpen(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; + return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N'; } static inline UBool isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; + return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':'; } /*static inline UBool isPOSIXClose(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; + return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']'; }*/ // TODO memory debugging provided inside uniset.cpp @@ -326,9 +295,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, while (mode != 2 && !chars.atEnd()) { U_ASSERT((lastItem == 0 && op == 0) || - (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || - (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || - op == INTERSECTION /*'&'*/))); + (lastItem == 1 && (op == 0 || op == u'-')) || + (lastItem == 2 && (op == 0 || op == u'-' || op == u'&'))); UChar32 c = 0; UBool literal = FALSE; @@ -356,27 +324,27 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; - if (c == 0x5B /*'['*/ && !literal) { + if (c == u'[' && !literal) { if (mode == 1) { chars.setPos(backup); // backup setMode = 1; } else { // Handle opening '[' delimiter mode = 1; - patLocal.append((UChar) 0x5B /*'['*/); + patLocal.append(u'['); chars.getPos(backup); // prepare to backup c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; - if (c == 0x5E /*'^'*/ && !literal) { + if (c == u'^' && !literal) { invert = TRUE; - patLocal.append((UChar) 0x5E /*'^'*/); + patLocal.append(u'^'); chars.getPos(backup); // prepare to backup c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; } // Fall through to handle special leading '-'; // otherwise restart loop for nested [], \p{}, etc. 
- if (c == HYPHEN /*'-'*/) { + if (c == u'-') { literal = TRUE; // Fall through to handle literal '-' below } else { @@ -418,7 +386,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, op = 0; } - if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { + if (op == u'-' || op == u'&') { patLocal.append(op); } @@ -454,10 +422,10 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, } switch (op) { - case HYPHEN: /*'-'*/ + case u'-': removeAll(*nested); break; - case INTERSECTION: /*'&'*/ + case u'&': retainAll(*nested); break; case 0: @@ -483,24 +451,24 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, if (!literal) { switch (c) { - case 0x5D /*']'*/: + case u']': if (lastItem == 1) { add(lastChar, lastChar); _appendToPat(patLocal, lastChar, FALSE); } // Treat final trailing '-' as a literal - if (op == HYPHEN /*'-'*/) { + if (op == u'-') { add(op, op); patLocal.append(op); - } else if (op == INTERSECTION /*'&'*/) { + } else if (op == u'&') { // syntaxError(chars, "Trailing '&'"); ec = U_MALFORMED_SET; return; } - patLocal.append((UChar) 0x5D /*']'*/); + patLocal.append(u']'); mode = 2; continue; - case HYPHEN /*'-'*/: + case u'-': if (op == 0) { if (lastItem != 0) { op = (UChar) c; @@ -510,8 +478,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, add(c, c); c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; - if (c == 0x5D /*']'*/ && !literal) { - patLocal.append(HYPHEN_RIGHT_BRACE, 2); + if (c == u']' && !literal) { + patLocal.append(u"-]", 2); mode = 2; continue; } @@ -520,7 +488,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, // syntaxError(chars, "'-' not after char or set"); ec = U_MALFORMED_SET; return; - case INTERSECTION /*'&'*/: + case u'&': if (lastItem == 2 && op == 0) { op = (UChar) c; continue; @@ -528,11 +496,11 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, // syntaxError(chars, "'&' not after set"); ec = U_MALFORMED_SET; return; - case 0x5E /*'^'*/: + case u'^': // syntaxError(chars, "'^' not after '['"); ec = U_MALFORMED_SET; return; - case 0x7B /*'{'*/: + case u'{': if (op != 0) { // syntaxError(chars, "Missing operand after operator"); ec = U_MALFORMED_SET; @@ -549,13 +517,13 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, while (!chars.atEnd()) { c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; - if (c == 0x7D /*'}'*/ && !literal) { + if (c == u'}' && !literal) { ok = TRUE; break; } buf.append(c); } - if (buf.length() < 1 || !ok) { + if (!ok) { // syntaxError(chars, "Invalid multicharacter string"); ec = U_MALFORMED_SET; return; @@ -565,9 +533,9 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, // we don't need to drop through to the further // processing add(buf); - patLocal.append((UChar) 0x7B /*'{'*/); + patLocal.append(u'{'); _appendToPat(patLocal, buf, FALSE); - patLocal.append((UChar) 0x7D /*'}'*/); + patLocal.append(u'}'); continue; case SymbolTable::SYMBOL_REF: // symbols nosymbols @@ -580,7 +548,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, chars.getPos(backup); c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; - UBool anchor = (c == 0x5D /*']'*/ && !literal); + UBool anchor = (c == u']' && !literal); if (symbols == 0 && !anchor) { c = SymbolTable::SYMBOL_REF; chars.setPos(backup); @@ -594,7 +562,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, add(U_ETHER); usePat = TRUE; patLocal.append((UChar) SymbolTable::SYMBOL_REF); - patLocal.append((UChar) 0x5D /*']'*/); + 
patLocal.append(u']'); mode = 2; continue; } @@ -617,7 +585,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, lastChar = c; break; case 1: - if (op == HYPHEN /*'-'*/) { + if (op == u'-') { if (lastChar >= c) { // Don't allow redundant (a-a) or empty (b-a) ranges; // these are most likely typos. @@ -1036,11 +1004,11 @@ UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, RuleCharacterIterator::Pos pos; chars.getPos(pos); UChar32 c = chars.next(iterOpts, literal, ec); - if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { + if (c == u'[' || c == u'\\') { UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, literal, ec); - result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : - (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); + result = (c == u'[') ? (d == u':') : + (d == u'N' || d == u'p' || d == u'P'); } chars.setPos(pos); return result && U_SUCCESS(ec); @@ -1071,17 +1039,17 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, posix = TRUE; pos += 2; pos = ICU_Utility::skipWhitespace(pattern, pos); - if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { + if (pos < pattern.length() && pattern.charAt(pos) == u'^') { ++pos; invert = TRUE; } } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { UChar c = pattern.charAt(pos+1); - invert = (c == UPPER_P); - isName = (c == UPPER_N); + invert = (c == u'P'); + isName = (c == u'N'); pos += 2; pos = ICU_Utility::skipWhitespace(pattern, pos); - if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { + if (pos == pattern.length() || pattern.charAt(pos++) != u'{') { // Syntax error; "\p" or "\P" not followed by "{" FAIL(ec); } @@ -1093,9 +1061,9 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, // Look for the matching close delimiter, either :] or } int32_t close; if (posix) { - close = pattern.indexOf(POSIX_CLOSE, 2, pos); + close = pattern.indexOf(u":]", 2, pos); } else { - close = pattern.indexOf(CLOSE_BRACE, pos); + close = pattern.indexOf(u'}', pos); } if (close < 0) { // Syntax error; close delimiter missing @@ -1105,7 +1073,7 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, // Look for an '=' sign. If this is present, we will parse a // medium \p{gc=Cf} or long \p{GeneralCategory=Format} // pattern. - int32_t equals = pattern.indexOf(EQUALS, pos); + int32_t equals = pattern.indexOf(u'=', pos); UnicodeString propName, valueName; if (equals >= 0 && equals < close && !isName) { // Equals seen; parse medium/long pattern diff --git a/thirdparty/icu4c/common/unisetspan.cpp b/thirdparty/icu4c/common/unisetspan.cpp index 68e44d91ee..fe0d74f5b2 100644 --- a/thirdparty/icu4c/common/unisetspan.cpp +++ b/thirdparty/icu4c/common/unisetspan.cpp @@ -231,6 +231,9 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + if (length16==0) { + continue; // skip the empty string + } UBool thisRelevant; spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); if(spanLength<length16) { // Relevant string. @@ -312,7 +315,7 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, const UChar *s16=string.getBuffer(); int32_t length16=string.length(); spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); - if(spanLength<length16) { // Relevant string. 
+ if(spanLength<length16 && length16>0) { // Relevant string. if(which&UTF16) { if(which&CONTAINED) { if(which&FWD) { @@ -362,7 +365,7 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, addToSpanNotSet(c); } } - } else { // Irrelevant string. + } else { // Irrelevant string. (Also the empty string.) if(which&UTF8) { if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. uint8_t *s8=utf8+utf8Count; @@ -653,11 +656,12 @@ int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi for(i=0; i<stringsLength; ++i) { int32_t overlap=spanLengths[i]; if(overlap==ALL_CP_CONTAINED) { - continue; // Irrelevant string. + continue; // Irrelevant string. (Also the empty string.) } const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + U_ASSERT(length>0); // Try to match this string at pos-overlap..pos. if(overlap>=LONG_SPAN) { @@ -697,6 +701,9 @@ int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + if (length16==0) { + continue; // skip the empty string + } // Try to match this string at pos-overlap..pos. if(overlap>=LONG_SPAN) { @@ -817,11 +824,12 @@ int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC for(i=0; i<stringsLength; ++i) { int32_t overlap=spanBackLengths[i]; if(overlap==ALL_CP_CONTAINED) { - continue; // Irrelevant string. + continue; // Irrelevant string. (Also the empty string.) } const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + U_ASSERT(length>0); // Try to match this string at pos-(length16-overlap)..pos-length16. if(overlap>=LONG_SPAN) { @@ -863,6 +871,9 @@ int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + if (length16==0) { + continue; // skip the empty string + } // Try to match this string at pos-(length16-overlap)..pos-length16. if(overlap>=LONG_SPAN) { @@ -1358,11 +1369,12 @@ int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { // Try to match the strings at pos. for(i=0; i<stringsLength; ++i) { if(spanLengths[i]==ALL_CP_CONTAINED) { - continue; // Irrelevant string. + continue; // Irrelevant string. (Also the empty string.) } const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + U_ASSERT(length>0); if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { return pos; // There is a set element at pos. } @@ -1401,11 +1413,12 @@ int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const // it is easier and we only need to know whether the string is irrelevant // which is the same in either array. if(spanLengths[i]==ALL_CP_CONTAINED) { - continue; // Irrelevant string. + continue; // Irrelevant string. (Also the empty string.) } const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); const UChar *s16=string.getBuffer(); int32_t length16=string.length(); + U_ASSERT(length>0); if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { return pos; // There is a set element at pos. 
} diff --git a/thirdparty/icu4c/common/uprops.h b/thirdparty/icu4c/common/uprops.h index 8bf929919f..09830bdeb9 100644 --- a/thirdparty/icu4c/common/uprops.h +++ b/thirdparty/icu4c/common/uprops.h @@ -310,55 +310,12 @@ u_isgraphPOSIX(UChar32 c); U_CFUNC UBool u_isprintPOSIX(UChar32 c); -/** Turn a bit index into a bit flag. @internal */ -#define FLAG(n) ((uint32_t)1<<(n)) - -/** Flags for general categories in the order of UCharCategory. @internal */ -#define _Cn FLAG(U_GENERAL_OTHER_TYPES) -#define _Lu FLAG(U_UPPERCASE_LETTER) -#define _Ll FLAG(U_LOWERCASE_LETTER) -#define _Lt FLAG(U_TITLECASE_LETTER) -#define _Lm FLAG(U_MODIFIER_LETTER) -/* #define _Lo FLAG(U_OTHER_LETTER) -- conflicts with MS Visual Studio 9.0 xiosbase */ -#define _Mn FLAG(U_NON_SPACING_MARK) -#define _Me FLAG(U_ENCLOSING_MARK) -#define _Mc FLAG(U_COMBINING_SPACING_MARK) -#define _Nd FLAG(U_DECIMAL_DIGIT_NUMBER) -#define _Nl FLAG(U_LETTER_NUMBER) -#define _No FLAG(U_OTHER_NUMBER) -#define _Zs FLAG(U_SPACE_SEPARATOR) -#define _Zl FLAG(U_LINE_SEPARATOR) -#define _Zp FLAG(U_PARAGRAPH_SEPARATOR) -#define _Cc FLAG(U_CONTROL_CHAR) -#define _Cf FLAG(U_FORMAT_CHAR) -#define _Co FLAG(U_PRIVATE_USE_CHAR) -#define _Cs FLAG(U_SURROGATE) -#define _Pd FLAG(U_DASH_PUNCTUATION) -#define _Ps FLAG(U_START_PUNCTUATION) -/* #define _Pe FLAG(U_END_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 xlocnum */ -/* #define _Pc FLAG(U_CONNECTOR_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */ -#define _Po FLAG(U_OTHER_PUNCTUATION) -#define _Sm FLAG(U_MATH_SYMBOL) -#define _Sc FLAG(U_CURRENCY_SYMBOL) -#define _Sk FLAG(U_MODIFIER_SYMBOL) -#define _So FLAG(U_OTHER_SYMBOL) -#define _Pi FLAG(U_INITIAL_PUNCTUATION) -/* #define _Pf FLAG(U_FINAL_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */ - /** Some code points. @internal */ enum { TAB =0x0009, LF =0x000a, FF =0x000c, CR =0x000d, - U_A =0x0041, - U_F =0x0046, - U_Z =0x005a, - U_a =0x0061, - U_f =0x0066, - U_z =0x007a, - DEL =0x007f, - NL =0x0085, NBSP =0x00a0, CGJ =0x034f, FIGURESP=0x2007, @@ -367,15 +324,6 @@ enum { ZWJ =0x200d, RLM =0x200f, NNBSP =0x202f, - WJ =0x2060, - INHSWAP =0x206a, - NOMDIG =0x206f, - U_FW_A =0xff21, - U_FW_F =0xff26, - U_FW_Z =0xff3a, - U_FW_a =0xff41, - U_FW_f =0xff46, - U_FW_z =0xff5a, ZWNBSP =0xfeff }; diff --git a/thirdparty/icu4c/common/uresbund.cpp b/thirdparty/icu4c/common/uresbund.cpp index 2ece87897d..5ea4187100 100644 --- a/thirdparty/icu4c/common/uresbund.cpp +++ b/thirdparty/icu4c/common/uresbund.cpp @@ -92,6 +92,15 @@ static UBool chopLocale(char *name) { } /** + * Called to check whether a name without '_' needs to be checked for a parent. + * Some code had assumed that locale IDs with '_' could not have a non-root parent. + * We may want a better way of doing this. 
+ */ +static UBool mayHaveParent(char *name) { + return (name[0] != 0 && uprv_strstr("nb nn",name) != nullptr); +} + +/** * Internal function */ static void entryIncrease(UResourceDataEntry *entry) { @@ -529,8 +538,8 @@ loadParentsExceptRoot(UResourceDataEntry *&t1, char name[], int32_t nameCapacity, UBool usingUSRData, char usrDataPath[], UErrorCode *status) { if (U_FAILURE(*status)) { return FALSE; } - UBool hasChopped = TRUE; - while (hasChopped && t1->fParent == NULL && !t1->fData.noFallback && + UBool checkParent = TRUE; + while (checkParent && t1->fParent == NULL && !t1->fData.noFallback && res_getResource(&t1->fData,"%%ParentIsRoot") == RES_BOGUS) { Resource parentRes = res_getResource(&t1->fData, "%%Parent"); if (parentRes != RES_BOGUS) { // An explicit parent was found. @@ -573,7 +582,7 @@ loadParentsExceptRoot(UResourceDataEntry *&t1, } } t1 = t2; - hasChopped = chopLocale(name); + checkParent = chopLocale(name) || mayHaveParent(name); } return TRUE; } @@ -692,7 +701,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID, } } } - if (hasChopped && !isRoot) { + if ((hasChopped || mayHaveParent(name)) && !isRoot) { if (!loadParentsExceptRoot(t1, name, UPRV_LENGTHOF(name), usingUSRData, usrDataPath, status)) { goto finish; } @@ -716,7 +725,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID, hasRealData = TRUE; isDefault = TRUE; // TODO: Why not if (usingUSRData) { ... } like in the non-default-locale code path? - if (hasChopped && !isRoot) { + if ((hasChopped || mayHaveParent(name)) && !isRoot) { if (!loadParentsExceptRoot(t1, name, UPRV_LENGTHOF(name), usingUSRData, usrDataPath, status)) { goto finish; } @@ -1908,6 +1917,8 @@ ures_getByKeyWithFallback(const UResourceBundle *resB, } else { break; } + } else if (res == RES_BOGUS) { + break; } } while(*myPath); /* Continue until the whole path is consumed */ } @@ -3019,7 +3030,7 @@ ures_getKeywordValues(const char *path, const char *keyword, UErrorCode *status) U_CAPI UBool U_EXPORT2 ures_equal(const UResourceBundle* res1, const UResourceBundle* res2){ if(res1==NULL || res2==NULL){ - return res1==res2; /* pointer comparision */ + return res1==res2; /* pointer comparison */ } if(res1->fKey==NULL|| res2->fKey==NULL){ return (res1->fKey==res2->fKey); diff --git a/thirdparty/icu4c/common/uresdata.cpp b/thirdparty/icu4c/common/uresdata.cpp index ae731e4544..9af081be40 100644 --- a/thirdparty/icu4c/common/uresdata.cpp +++ b/thirdparty/icu4c/common/uresdata.cpp @@ -960,14 +960,6 @@ res_findResource(const ResourceData *pResData, Resource r, char** path, const ch if(URES_IS_TABLE(type)) { *key = pathP; t2 = res_getTableItemByKey(pResData, t1, &indexR, key); - if(t2 == RES_BOGUS) { - /* if we fail to get the resource by key, maybe we got an index */ - indexR = uprv_strtol(pathP, &closeIndex, 10); - if(indexR >= 0 && *closeIndex == 0 && (*pathP != '0' || closeIndex - pathP == 1)) { - /* if we indeed have an index, try to get the item by index */ - t2 = res_getTableItemByIndex(pResData, t1, indexR, key); - } // else t2 is already RES_BOGUS - } } else if(URES_IS_ARRAY(type)) { indexR = uprv_strtol(pathP, &closeIndex, 10); if(indexR >= 0 && *closeIndex == 0) { diff --git a/thirdparty/icu4c/common/uresimp.h b/thirdparty/icu4c/common/uresimp.h index 69d82566fe..f038dedace 100644 --- a/thirdparty/icu4c/common/uresimp.h +++ b/thirdparty/icu4c/common/uresimp.h @@ -270,11 +270,13 @@ ures_getByKeyWithFallback(const UResourceBundle *resB, * function can perform fallback on the sub-resources of the 
table. * @param resB a resource * @param inKey a key associated with the requested resource + * @param len if not NULL, used to return the length of the string * @param status: fills in the outgoing error code * could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found * could be a non-failing error * e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT> - * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must delete it + * @return returns a pointer to a zero-terminated UChar array which lives in a + * memory mapped/DLL file. */ U_CAPI const UChar* U_EXPORT2 ures_getStringByKeyWithFallback(const UResourceBundle *resB, diff --git a/thirdparty/icu4c/common/uset.cpp b/thirdparty/icu4c/common/uset.cpp index eae7981d52..a7e3046dbf 100644 --- a/thirdparty/icu4c/common/uset.cpp +++ b/thirdparty/icu4c/common/uset.cpp @@ -117,6 +117,12 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen) { } U_CAPI void U_EXPORT2 +uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length) { + UnicodeString s(length==-1, str, length); + ((UnicodeSet*) set)->UnicodeSet::removeAll(s); +} + +U_CAPI void U_EXPORT2 uset_removeAll(USet* set, const USet* remove) { ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); } @@ -127,6 +133,18 @@ uset_retain(USet* set, UChar32 start, UChar32 end) { } U_CAPI void U_EXPORT2 +uset_retainString(USet *set, const UChar *str, int32_t length) { + UnicodeString s(length==-1, str, length); + ((UnicodeSet*) set)->UnicodeSet::retain(s); +} + +U_CAPI void U_EXPORT2 +uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length) { + UnicodeString s(length==-1, str, length); + ((UnicodeSet*) set)->UnicodeSet::retainAll(s); +} + +U_CAPI void U_EXPORT2 uset_retainAll(USet* set, const USet* retain) { ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); } @@ -142,6 +160,23 @@ uset_complement(USet* set) { } U_CAPI void U_EXPORT2 +uset_complementRange(USet *set, UChar32 start, UChar32 end) { + ((UnicodeSet*) set)->UnicodeSet::complement(start, end); +} + +U_CAPI void U_EXPORT2 +uset_complementString(USet *set, const UChar *str, int32_t length) { + UnicodeString s(length==-1, str, length); + ((UnicodeSet*) set)->UnicodeSet::complement(s); +} + +U_CAPI void U_EXPORT2 +uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length) { + UnicodeString s(length==-1, str, length); + ((UnicodeSet*) set)->UnicodeSet::complementAll(s); +} + +U_CAPI void U_EXPORT2 uset_complementAll(USet* set, const USet* complement) { ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); } diff --git a/thirdparty/icu4c/common/usprep.cpp b/thirdparty/icu4c/common/usprep.cpp index 8351a77370..874ffc63a8 100644 --- a/thirdparty/icu4c/common/usprep.cpp +++ b/thirdparty/icu4c/common/usprep.cpp @@ -575,7 +575,7 @@ usprep_map( const UStringPrepProfile* profile, } }else if(type==USPREP_DELETE){ - // just consume the codepoint and contine + // just consume the codepoint and continue continue; } //copy the code point into destination diff --git a/thirdparty/icu4c/common/ustr_wcs.cpp b/thirdparty/icu4c/common/ustr_wcs.cpp index e9f278e969..89d0762480 100644 --- a/thirdparty/icu4c/common/ustr_wcs.cpp +++ b/thirdparty/icu4c/common/ustr_wcs.cpp @@ -364,7 +364,7 @@ _strFromWCS( UChar *dest, } /* we have found a null so convert the - * chunk from begining of non-null char to null + * chunk from beginning of non-null char to null */ retVal = uprv_wcstombs(pCSrc,pSrc,remaining); @@ 
-387,7 +387,7 @@ _strFromWCS( UChar *dest, * null terminate it and convert wchar_ts to chars */ if(nulLen >= _STACK_BUFFER_CAPACITY){ - /* Should rarely occcur */ + /* Should rarely occur */ /* allocate new buffer buffer */ pWStack =(wchar_t*) uprv_malloc(sizeof(wchar_t) * (nulLen + 1)); if(pWStack==NULL){ diff --git a/thirdparty/icu4c/common/utext.cpp b/thirdparty/icu4c/common/utext.cpp index 763b6684fb..d79f8141bb 100644 --- a/thirdparty/icu4c/common/utext.cpp +++ b/thirdparty/icu4c/common/utext.cpp @@ -382,7 +382,7 @@ utext_previous32From(UText *ut, int64_t index) { // UChar32 cPrev; // The character preceding cCurr, which is what we will return. - // Address the chunk containg the position preceding the incoming index + // Address the chunk containing the position preceding the incoming index // A tricky edge case: // We try to test the requested native index against the chunkNativeStart to determine // whether the character preceding the one at the index is in the current chunk. @@ -894,7 +894,7 @@ struct UTF8Buf { // one for a supplementary starting in the last normal position, // and one for an entry for the buffer limit position. uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to - // correspoding offset in filled part of buf. + // corresponding offset in filled part of buf. int32_t align; }; @@ -1545,7 +1545,7 @@ utf8TextMapOffsetToNative(const UText *ut) { } // -// Map a native index to the corrsponding chunk offset +// Map a native index to the corresponding chunk offset // static int32_t U_CALLCONV utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) { diff --git a/thirdparty/icu4c/common/util.h b/thirdparty/icu4c/common/util.h index 9c3b76d9ed..b5fac383a2 100644 --- a/thirdparty/icu4c/common/util.h +++ b/thirdparty/icu4c/common/util.h @@ -13,10 +13,10 @@ #ifndef ICU_UTIL_H #define ICU_UTIL_H -#include "unicode/utypes.h" -#include "unicode/uobject.h" +#include "charstr.h" #include "unicode/unistr.h" - +#include "unicode/uobject.h" +#include "unicode/utypes.h" //-------------------------------------------------------------------- // class ICU_Utility // i18n utility functions, scoped into the class ICU_Utility. diff --git a/thirdparty/icu4c/common/utracimp.h b/thirdparty/icu4c/common/utracimp.h index f32fe1db39..945540d25a 100644 --- a/thirdparty/icu4c/common/utracimp.h +++ b/thirdparty/icu4c/common/utracimp.h @@ -193,7 +193,7 @@ UPRV_BLOCK_MACRO_BEGIN { \ * Trace statement for each exit point of a function that has a UTRACE_ENTRY() * statement, and that returns a value. * - * @param val The function's return value, int32_t or comatible type. + * @param val The function's return value, int32_t or compatible type. * * @internal */ diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp index cf19edf646..9c7e74c6d5 100644 --- a/thirdparty/icu4c/common/uvector.cpp +++ b/thirdparty/icu4c/common/uvector.cpp @@ -312,7 +312,7 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const { } else { for (i=startIndex; i<count; ++i) { /* Pointers are not always the same size as ints so to perform - * a valid comparision we need to know whether we are being + * a valid comparison we need to know whether we are being * provided an int or a pointer. */ if (hint & HINT_KEY_POINTER) { if (key.pointer == elements[i].pointer) { @@ -518,7 +518,7 @@ sortiComparator(const void * /*context */, const void *left, const void *right) } /** - * Sort the vector, assuming it constains ints. 
+ * Sort the vector, assuming it contains ints. * (A more general sort would take a comparison function, but it's * not clear whether UVector's UElementComparator or * UComparator from uprv_sortAray would be more appropriate.) diff --git a/thirdparty/icu4c/common/wintz.cpp b/thirdparty/icu4c/common/wintz.cpp index 580cedadb6..ebf31650c2 100644 --- a/thirdparty/icu4c/common/wintz.cpp +++ b/thirdparty/icu4c/common/wintz.cpp @@ -124,10 +124,26 @@ uprv_detectWindowsTimeZone() // No way to support when DST is turned off and the offset in minutes is not a multiple of 60. if (utcOffsetMins % 60 == 0) { char gmtOffsetTz[11] = {}; // "Etc/GMT+dd" is 11-char long with a terminal null. - // Note '-' before 'utcOffsetMin'. The timezone ID's sign convention - // is that a timezone ahead of UTC is Etc/GMT-<offset> and a timezone - // behind UTC is Etc/GMT+<offset>. - int ret = snprintf(gmtOffsetTz, UPRV_LENGTHOF(gmtOffsetTz), "Etc/GMT%+ld", -utcOffsetMins / 60); + // Important note on the sign convention for zones: + // + // From https://en.wikipedia.org/wiki/Tz_database#Area + // "In order to conform with the POSIX style, those zone names beginning with "Etc/GMT" have their sign reversed + // from the standard ISO 8601 convention. In the "Etc" area, zones west of GMT have a positive sign and those + // east have a negative sign in their name (e.g "Etc/GMT-14" is 14 hours ahead of GMT)." + // + // Regarding the POSIX style, from https://www.gnu.org/software/libc/manual/html_node/TZ-Variable.html + // "The offset specifies the time value you must add to the local time to get a Coordinated Universal Time value." + // + // However, the Bias value in DYNAMIC_TIME_ZONE_INFORMATION *already* follows the POSIX convention. + // + // From https://docs.microsoft.com/en-us/windows/win32/api/timezoneapi/ns-timezoneapi-dynamic_time_zone_information + // "The bias is the difference, in minutes, between Coordinated Universal Time (UTC) and + // local time. All translations between UTC and local time are based on the following formula: + // UTC = local time + bias" + // + // For example, a time zone that is 3 hours ahead of UTC (UTC+03:00) would have a Bias value of -180, and the + // corresponding time zone ID would be "Etc/GMT-3". (So there is no need to negate utcOffsetMins below.) + int ret = snprintf(gmtOffsetTz, UPRV_LENGTHOF(gmtOffsetTz), "Etc/GMT%+ld", utcOffsetMins / 60); if (ret > 0 && ret < UPRV_LENGTHOF(gmtOffsetTz)) { return uprv_strdup(gmtOffsetTz); } diff --git a/thirdparty/icu4c/icudt68l.dat b/thirdparty/icu4c/icudt69l.dat Binary files differindex 9ecea5d548..3101a49695 100644 --- a/thirdparty/icu4c/icudt68l.dat +++ b/thirdparty/icu4c/icudt69l.dat diff --git a/thirdparty/mbedtls/include/mbedtls/config.h b/thirdparty/mbedtls/include/mbedtls/config.h index e17bc7e306..610f5d1f50 100644 --- a/thirdparty/mbedtls/include/mbedtls/config.h +++ b/thirdparty/mbedtls/include/mbedtls/config.h @@ -1747,6 +1747,23 @@ //#define MBEDTLS_SSL_TRUNCATED_HMAC_COMPAT /** + * \def MBEDTLS_TEST_HOOKS + * + * Enable features for invasive testing such as introspection functions and + * hooks for fault injection. This enables additional unit tests. + * + * Merely enabling this feature should not change the behavior of the product. + * It only adds new code, and new branching points where the default behavior + * is the same as when this feature is disabled. + * However, this feature increases the attack surface: there is an added + * risk of vulnerabilities, and more gadgets that can make exploits easier. 
+ * Therefore this feature must never be enabled in production. + * + * Uncomment to enable invasive tests. + */ +//#define MBEDTLS_TEST_HOOKS + +/** * \def MBEDTLS_THREADING_ALT * * Provide your own alternate threading implementation. diff --git a/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h b/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h index 278fbbbb7a..6c099adf4d 100644 --- a/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h +++ b/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h @@ -214,6 +214,13 @@ typedef struct mbedtls_ctr_drbg_context void *p_entropy; /*!< The context for the entropy function. */ #if defined(MBEDTLS_THREADING_C) + /* Invariant: the mutex is initialized if and only if f_entropy != NULL. + * This means that the mutex is initialized during the initial seeding + * in mbedtls_ctr_drbg_seed() and freed in mbedtls_ctr_drbg_free(). + * + * Note that this invariant may change without notice. Do not rely on it + * and do not access the mutex directly in application code. + */ mbedtls_threading_mutex_t mutex; #endif } @@ -277,6 +284,15 @@ void mbedtls_ctr_drbg_init( mbedtls_ctr_drbg_context *ctx ); * device. */ #endif +#if defined(MBEDTLS_THREADING_C) +/** + * \note When Mbed TLS is built with threading support, + * after this function returns successfully, + * it is safe to call mbedtls_ctr_drbg_random() + * from multiple threads. Other operations, including + * reseeding, are not thread-safe. + */ +#endif /* MBEDTLS_THREADING_C */ /** * \param ctx The CTR_DRBG context to seed. * It must have been initialized with @@ -286,6 +302,8 @@ void mbedtls_ctr_drbg_init( mbedtls_ctr_drbg_context *ctx ); * the same context unless you call * mbedtls_ctr_drbg_free() and mbedtls_ctr_drbg_init() * again first. + * After a failed call to mbedtls_ctr_drbg_seed(), + * you must call mbedtls_ctr_drbg_free(). * \param f_entropy The entropy callback, taking as arguments the * \p p_entropy context, the buffer to fill, and the * length of the buffer. @@ -377,6 +395,11 @@ void mbedtls_ctr_drbg_set_reseed_interval( mbedtls_ctr_drbg_context *ctx, * \brief This function reseeds the CTR_DRBG context, that is * extracts data from the entropy source. * + * \note This function is not thread-safe. It is not safe + * to call this function if another thread might be + * concurrently obtaining random numbers from the same + * context or updating or reseeding the same context. + * * \param ctx The CTR_DRBG context. * \param additional Additional data to add to the state. Can be \c NULL. * \param len The length of the additional data. @@ -394,6 +417,11 @@ int mbedtls_ctr_drbg_reseed( mbedtls_ctr_drbg_context *ctx, /** * \brief This function updates the state of the CTR_DRBG context. * + * \note This function is not thread-safe. It is not safe + * to call this function if another thread might be + * concurrently obtaining random numbers from the same + * context or updating or reseeding the same context. + * * \param ctx The CTR_DRBG context. * \param additional The data to update the state with. This must not be * \c NULL unless \p add_len is \c 0. @@ -417,6 +445,11 @@ int mbedtls_ctr_drbg_update_ret( mbedtls_ctr_drbg_context *ctx, * This function automatically reseeds if the reseed counter is exceeded * or prediction resistance is enabled. * + * \note This function is not thread-safe. It is not safe + * to call this function if another thread might be + * concurrently obtaining random numbers from the same + * context or updating or reseeding the same context. + * * \param p_rng The CTR_DRBG context. 
This must be a pointer to a * #mbedtls_ctr_drbg_context structure. * \param output The buffer to fill. @@ -445,8 +478,16 @@ int mbedtls_ctr_drbg_random_with_add( void *p_rng, * * This function automatically reseeds if the reseed counter is exceeded * or prediction resistance is enabled. - * - * + */ +#if defined(MBEDTLS_THREADING_C) +/** + * \note When Mbed TLS is built with threading support, + * it is safe to call mbedtls_ctr_drbg_random() + * from multiple threads. Other operations, including + * reseeding, are not thread-safe. + */ +#endif /* MBEDTLS_THREADING_C */ +/** * \param p_rng The CTR_DRBG context. This must be a pointer to a * #mbedtls_ctr_drbg_context structure. * \param output The buffer to fill. diff --git a/thirdparty/mbedtls/include/mbedtls/entropy.h b/thirdparty/mbedtls/include/mbedtls/entropy.h index 1e1d3f56ec..1d6e9b821b 100644 --- a/thirdparty/mbedtls/include/mbedtls/entropy.h +++ b/thirdparty/mbedtls/include/mbedtls/entropy.h @@ -147,13 +147,15 @@ mbedtls_entropy_source_state; */ typedef struct mbedtls_entropy_context { - int accumulator_started; + int accumulator_started; /* 0 after init. + * 1 after the first update. + * -1 after free. */ #if defined(MBEDTLS_ENTROPY_SHA512_ACCUMULATOR) mbedtls_sha512_context accumulator; #else mbedtls_sha256_context accumulator; #endif - int source_count; + int source_count; /* Number of entries used in source. */ mbedtls_entropy_source_state source[MBEDTLS_ENTROPY_MAX_SOURCES]; #if defined(MBEDTLS_HAVEGE_C) mbedtls_havege_state havege_data; diff --git a/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h b/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h index 970c033c15..5718e187a9 100644 --- a/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h +++ b/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h @@ -128,6 +128,14 @@ typedef struct mbedtls_hmac_drbg_context void *p_entropy; /*!< context for the entropy function */ #if defined(MBEDTLS_THREADING_C) + /* Invariant: the mutex is initialized if and only if + * md_ctx->md_info != NULL. This means that the mutex is initialized + * during the initial seeding in mbedtls_hmac_drbg_seed() or + * mbedtls_hmac_drbg_seed_buf() and freed in mbedtls_ctr_drbg_free(). + * + * Note that this invariant may change without notice. Do not rely on it + * and do not access the mutex directly in application code. + */ mbedtls_threading_mutex_t mutex; #endif } mbedtls_hmac_drbg_context; @@ -177,7 +185,17 @@ void mbedtls_hmac_drbg_init( mbedtls_hmac_drbg_context *ctx ); * \note During the initial seeding, this function calls * the entropy source to obtain a nonce * whose length is half the entropy length. - * + */ +#if defined(MBEDTLS_THREADING_C) +/** + * \note When Mbed TLS is built with threading support, + * after this function returns successfully, + * it is safe to call mbedtls_hmac_drbg_random() + * from multiple threads. Other operations, including + * reseeding, are not thread-safe. + */ +#endif /* MBEDTLS_THREADING_C */ +/** * \param ctx HMAC_DRBG context to be seeded. * \param md_info MD algorithm to use for HMAC_DRBG. * \param f_entropy The entropy callback, taking as arguments the @@ -216,7 +234,17 @@ int mbedtls_hmac_drbg_seed( mbedtls_hmac_drbg_context *ctx, * * This function is meant for use in algorithms that need a pseudorandom * input such as deterministic ECDSA. - * + */ +#if defined(MBEDTLS_THREADING_C) +/** + * \note When Mbed TLS is built with threading support, + * after this function returns successfully, + * it is safe to call mbedtls_hmac_drbg_random() + * from multiple threads. 
Other operations, including + * reseeding, are not thread-safe. + */ +#endif /* MBEDTLS_THREADING_C */ +/** * \param ctx HMAC_DRBG context to be initialised. * \param md_info MD algorithm to use for HMAC_DRBG. * \param data Concatenation of the initial entropy string and @@ -279,6 +307,11 @@ void mbedtls_hmac_drbg_set_reseed_interval( mbedtls_hmac_drbg_context *ctx, /** * \brief This function updates the state of the HMAC_DRBG context. * + * \note This function is not thread-safe. It is not safe + * to call this function if another thread might be + * concurrently obtaining random numbers from the same + * context or updating or reseeding the same context. + * * \param ctx The HMAC_DRBG context. * \param additional The data to update the state with. * If this is \c NULL, there is no additional data. @@ -295,6 +328,11 @@ int mbedtls_hmac_drbg_update_ret( mbedtls_hmac_drbg_context *ctx, * \brief This function reseeds the HMAC_DRBG context, that is * extracts data from the entropy source. * + * \note This function is not thread-safe. It is not safe + * to call this function if another thread might be + * concurrently obtaining random numbers from the same + * context or updating or reseeding the same context. + * * \param ctx The HMAC_DRBG context. * \param additional Additional data to add to the state. * If this is \c NULL, there is no additional data @@ -320,6 +358,11 @@ int mbedtls_hmac_drbg_reseed( mbedtls_hmac_drbg_context *ctx, * This function automatically reseeds if the reseed counter is exceeded * or prediction resistance is enabled. * + * \note This function is not thread-safe. It is not safe + * to call this function if another thread might be + * concurrently obtaining random numbers from the same + * context or updating or reseeding the same context. + * * \param p_rng The HMAC_DRBG context. This must be a pointer to a * #mbedtls_hmac_drbg_context structure. * \param output The buffer to fill. @@ -349,7 +392,16 @@ int mbedtls_hmac_drbg_random_with_add( void *p_rng, * * This function automatically reseeds if the reseed counter is exceeded * or prediction resistance is enabled. - * + */ +#if defined(MBEDTLS_THREADING_C) +/** + * \note When Mbed TLS is built with threading support, + * it is safe to call mbedtls_ctr_drbg_random() + * from multiple threads. Other operations, including + * reseeding, are not thread-safe. + */ +#endif /* MBEDTLS_THREADING_C */ +/** * \param p_rng The HMAC_DRBG context. This must be a pointer to a * #mbedtls_hmac_drbg_context structure. * \param output The buffer to fill. 
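The CTR_DRBG/HMAC_DRBG hunks above change when the internal mutex is created (during the initial seeding rather than in `*_init()`) and document that only the `*_random()` calls are safe to use concurrently once seeding has succeeded. Below is a minimal usage sketch of that contract, assuming a build with `MBEDTLS_THREADING_C` and the pthread backend; the thread count, buffer size and personalization string are illustrative, not taken from this patch.

```c
/* Sketch only: seed once, then draw random bytes from several threads,
 * which is the one operation the notes above declare thread-safe.
 * Assumes MBEDTLS_THREADING_C with the pthread backend. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#include "mbedtls/ctr_drbg.h"
#include "mbedtls/entropy.h"

static mbedtls_entropy_context entropy;
static mbedtls_ctr_drbg_context drbg;

static void *worker(void *arg)
{
    unsigned char buf[32];
    (void)arg;
    /* Safe to call concurrently after a successful mbedtls_ctr_drbg_seed(). */
    if (mbedtls_ctr_drbg_random(&drbg, buf, sizeof buf) != 0)
        fprintf(stderr, "ctr_drbg_random failed\n");
    return NULL;
}

int main(void)
{
    pthread_t threads[4];
    const char *pers = "example"; /* illustrative personalization string */
    size_t i;

    mbedtls_entropy_init(&entropy);
    mbedtls_ctr_drbg_init(&drbg);

    /* The mutex is created here, during the initial seeding. Per the new
     * documentation, a failed seed still requires mbedtls_ctr_drbg_free(). */
    if (mbedtls_ctr_drbg_seed(&drbg, mbedtls_entropy_func, &entropy,
                              (const unsigned char *)pers, strlen(pers)) != 0)
        goto cleanup;

    for (i = 0; i < 4; i++)
        pthread_create(&threads[i], NULL, worker, NULL);
    for (i = 0; i < 4; i++)
        pthread_join(threads[i], NULL);

    /* Reseeding or updating the context here would need external locking:
     * those operations remain non-thread-safe per the notes above. */
cleanup:
    mbedtls_ctr_drbg_free(&drbg); /* frees the mutex only if seeding set f_entropy */
    mbedtls_entropy_free(&entropy);
    return 0;
}
```

This mirrors the invariant comments added to the context structs: the mutex lifetime is tied to the seeding state, so callers that never seed never pay for mutex setup or teardown.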
diff --git a/thirdparty/mbedtls/include/mbedtls/net_sockets.h b/thirdparty/mbedtls/include/mbedtls/net_sockets.h index 00fea7db19..c6e1a0270e 100644 --- a/thirdparty/mbedtls/include/mbedtls/net_sockets.h +++ b/thirdparty/mbedtls/include/mbedtls/net_sockets.h @@ -151,6 +151,7 @@ int mbedtls_net_connect( mbedtls_net_context *ctx, const char *host, const char * * \return 0 if successful, or one of: * MBEDTLS_ERR_NET_SOCKET_FAILED, + * MBEDTLS_ERR_NET_UNKNOWN_HOST, * MBEDTLS_ERR_NET_BIND_FAILED, * MBEDTLS_ERR_NET_LISTEN_FAILED * @@ -170,6 +171,8 @@ int mbedtls_net_bind( mbedtls_net_context *ctx, const char *bind_ip, const char * can be NULL if client_ip is null * * \return 0 if successful, or + * MBEDTLS_ERR_NET_SOCKET_FAILED, + * MBEDTLS_ERR_NET_BIND_FAILED, * MBEDTLS_ERR_NET_ACCEPT_FAILED, or * MBEDTLS_ERR_NET_BUFFER_TOO_SMALL if buf_size is too small, * MBEDTLS_ERR_SSL_WANT_READ if bind_fd was set to @@ -182,6 +185,10 @@ int mbedtls_net_accept( mbedtls_net_context *bind_ctx, /** * \brief Check and wait for the context to be ready for read/write * + * \note The current implementation of this function uses + * select() and returns an error if the file descriptor + * is \c FD_SETSIZE or greater. + * * \param ctx Socket to check * \param rw Bitflag composed of MBEDTLS_NET_POLL_READ and * MBEDTLS_NET_POLL_WRITE specifying the events @@ -263,16 +270,21 @@ int mbedtls_net_send( void *ctx, const unsigned char *buf, size_t len ); * 'timeout' seconds. If no error occurs, the actual amount * read is returned. * + * \note The current implementation of this function uses + * select() and returns an error if the file descriptor + * is \c FD_SETSIZE or greater. + * * \param ctx Socket * \param buf The buffer to write to * \param len Maximum length of the buffer * \param timeout Maximum number of milliseconds to wait for data * 0 means no timeout (wait forever) * - * \return the number of bytes received, - * or a non-zero error code: - * MBEDTLS_ERR_SSL_TIMEOUT if the operation timed out, + * \return The number of bytes received if successful. + * MBEDTLS_ERR_SSL_TIMEOUT if the operation timed out. * MBEDTLS_ERR_SSL_WANT_READ if interrupted by a signal. + * Another negative error code (MBEDTLS_ERR_NET_xxx) + * for other failures. * * \note This function will block (until data becomes available or * timeout is reached) even if the socket is set to diff --git a/thirdparty/mbedtls/include/mbedtls/rsa.h b/thirdparty/mbedtls/include/mbedtls/rsa.h index 188c37cf3a..b2f65334fe 100644 --- a/thirdparty/mbedtls/include/mbedtls/rsa.h +++ b/thirdparty/mbedtls/include/mbedtls/rsa.h @@ -124,7 +124,10 @@ extern "C" { */ typedef struct mbedtls_rsa_context { - int ver; /*!< Always 0.*/ + int ver; /*!< Reserved for internal purposes. + * Do not set this field in application + * code. Its meaning might change without + * notice. */ size_t len; /*!< The size of \p N in Bytes. */ mbedtls_mpi N; /*!< The public modulus. */ @@ -154,6 +157,7 @@ typedef struct mbedtls_rsa_context mask generating function used in the EME-OAEP and EMSA-PSS encodings. */ #if defined(MBEDTLS_THREADING_C) + /* Invariant: the mutex is initialized iff ver != 0. */ mbedtls_threading_mutex_t mutex; /*!< Thread-safety mutex. 
*/ #endif } diff --git a/thirdparty/mbedtls/include/mbedtls/threading.h b/thirdparty/mbedtls/include/mbedtls/threading.h index a8183a6ef4..45161ce467 100644 --- a/thirdparty/mbedtls/include/mbedtls/threading.h +++ b/thirdparty/mbedtls/include/mbedtls/threading.h @@ -73,6 +73,9 @@ extern "C" { typedef struct mbedtls_threading_mutex_t { pthread_mutex_t mutex; + /* is_valid is 0 after a failed init or a free, and nonzero after a + * successful init. This field is not considered part of the public + * API of Mbed TLS and may change without notice. */ char is_valid; } mbedtls_threading_mutex_t; #endif diff --git a/thirdparty/mbedtls/include/mbedtls/version.h b/thirdparty/mbedtls/include/mbedtls/version.h index 5f0a8f114c..bd5c730c1d 100644 --- a/thirdparty/mbedtls/include/mbedtls/version.h +++ b/thirdparty/mbedtls/include/mbedtls/version.h @@ -65,16 +65,16 @@ */ #define MBEDTLS_VERSION_MAJOR 2 #define MBEDTLS_VERSION_MINOR 16 -#define MBEDTLS_VERSION_PATCH 9 +#define MBEDTLS_VERSION_PATCH 10 /** * The single version number has the following structure: * MMNNPP00 * Major version | Minor version | Patch version */ -#define MBEDTLS_VERSION_NUMBER 0x02100900 -#define MBEDTLS_VERSION_STRING "2.16.9" -#define MBEDTLS_VERSION_STRING_FULL "mbed TLS 2.16.9" +#define MBEDTLS_VERSION_NUMBER 0x02100A00 +#define MBEDTLS_VERSION_STRING "2.16.10" +#define MBEDTLS_VERSION_STRING_FULL "mbed TLS 2.16.10" #if defined(MBEDTLS_VERSION_C) diff --git a/thirdparty/mbedtls/library/base64.c b/thirdparty/mbedtls/library/base64.c index bfafb05353..692e11e3fa 100644 --- a/thirdparty/mbedtls/library/base64.c +++ b/thirdparty/mbedtls/library/base64.c @@ -97,6 +97,99 @@ static const unsigned char base64_dec_map[128] = #define BASE64_SIZE_T_MAX ( (size_t) -1 ) /* SIZE_T_MAX is not standard */ /* + * Constant flow conditional assignment to unsigned char + */ +static void mbedtls_base64_cond_assign_uchar( unsigned char * dest, const unsigned char * const src, + unsigned char condition ) +{ + /* MSVC has a warning about unary minus on unsigned integer types, + * but this is well-defined and precisely what we want to do here. */ +#if defined(_MSC_VER) +#pragma warning( push ) +#pragma warning( disable : 4146 ) +#endif + + /* Generate bitmask from condition, mask will either be 0xFF or 0 */ + unsigned char mask = ( condition | -condition ); + mask >>= 7; + mask = -mask; + +#if defined(_MSC_VER) +#pragma warning( pop ) +#endif + + *dest = ( ( *src ) & mask ) | ( ( *dest ) & ~mask ); +} + +/* + * Constant flow conditional assignment to uint_32 + */ +static void mbedtls_base64_cond_assign_uint32( uint32_t * dest, const uint32_t src, + uint32_t condition ) +{ + /* MSVC has a warning about unary minus on unsigned integer types, + * but this is well-defined and precisely what we want to do here. */ +#if defined(_MSC_VER) +#pragma warning( push ) +#pragma warning( disable : 4146 ) +#endif + + /* Generate bitmask from condition, mask will either be 0xFFFFFFFF or 0 */ + uint32_t mask = ( condition | -condition ); + mask >>= 31; + mask = -mask; + +#if defined(_MSC_VER) +#pragma warning( pop ) +#endif + + *dest = ( src & mask ) | ( ( *dest ) & ~mask ); +} + +/* + * Constant flow check for equality + */ +static unsigned char mbedtls_base64_eq( size_t in_a, size_t in_b ) +{ + size_t difference = in_a ^ in_b; + + /* MSVC has a warning about unary minus on unsigned integer types, + * but this is well-defined and precisely what we want to do here. 
*/ +#if defined(_MSC_VER) +#pragma warning( push ) +#pragma warning( disable : 4146 ) +#endif + + difference |= -difference; + +#if defined(_MSC_VER) +#pragma warning( pop ) +#endif + + /* cope with the varying size of size_t per platform */ + difference >>= ( sizeof( difference ) * 8 - 1 ); + + return (unsigned char) ( 1 ^ difference ); +} + +/* + * Constant flow lookup into table. + */ +static unsigned char mbedtls_base64_table_lookup( const unsigned char * const table, + const size_t table_size, const size_t table_index ) +{ + size_t i; + unsigned char result = 0; + + for( i = 0; i < table_size; ++i ) + { + mbedtls_base64_cond_assign_uchar( &result, &table[i], mbedtls_base64_eq( i, table_index ) ); + } + + return result; +} + +/* * Encode a buffer into base64 format */ int mbedtls_base64_encode( unsigned char *dst, size_t dlen, size_t *olen, @@ -136,10 +229,17 @@ int mbedtls_base64_encode( unsigned char *dst, size_t dlen, size_t *olen, C2 = *src++; C3 = *src++; - *p++ = base64_enc_map[(C1 >> 2) & 0x3F]; - *p++ = base64_enc_map[(((C1 & 3) << 4) + (C2 >> 4)) & 0x3F]; - *p++ = base64_enc_map[(((C2 & 15) << 2) + (C3 >> 6)) & 0x3F]; - *p++ = base64_enc_map[C3 & 0x3F]; + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( ( C1 >> 2 ) & 0x3F ) ); + + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( ( ( ( C1 & 3 ) << 4 ) + ( C2 >> 4 ) ) & 0x3F ) ); + + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( ( ( ( C2 & 15 ) << 2 ) + ( C3 >> 6 ) ) & 0x3F ) ); + + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( C3 & 0x3F ) ); } if( i < slen ) @@ -147,11 +247,15 @@ int mbedtls_base64_encode( unsigned char *dst, size_t dlen, size_t *olen, C1 = *src++; C2 = ( ( i + 1 ) < slen ) ? 
*src++ : 0; - *p++ = base64_enc_map[(C1 >> 2) & 0x3F]; - *p++ = base64_enc_map[(((C1 & 3) << 4) + (C2 >> 4)) & 0x3F]; + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( ( C1 >> 2 ) & 0x3F ) ); + + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( ( ( ( C1 & 3 ) << 4 ) + ( C2 >> 4 ) ) & 0x3F ) ); if( ( i + 1 ) < slen ) - *p++ = base64_enc_map[((C2 & 15) << 2) & 0x3F]; + *p++ = mbedtls_base64_table_lookup( base64_enc_map, sizeof( base64_enc_map ), + ( ( ( C2 & 15 ) << 2 ) & 0x3F ) ); else *p++ = '='; *p++ = '='; @@ -172,6 +276,7 @@ int mbedtls_base64_decode( unsigned char *dst, size_t dlen, size_t *olen, size_t i, n; uint32_t j, x; unsigned char *p; + unsigned char dec_map_lookup; /* First pass: check for validity and get output length */ for( i = n = j = 0; i < slen; i++ ) @@ -202,10 +307,12 @@ int mbedtls_base64_decode( unsigned char *dst, size_t dlen, size_t *olen, if( src[i] == '=' && ++j > 2 ) return( MBEDTLS_ERR_BASE64_INVALID_CHARACTER ); - if( src[i] > 127 || base64_dec_map[src[i]] == 127 ) + dec_map_lookup = mbedtls_base64_table_lookup( base64_dec_map, sizeof( base64_dec_map ), src[i] ); + + if( src[i] > 127 || dec_map_lookup == 127 ) return( MBEDTLS_ERR_BASE64_INVALID_CHARACTER ); - if( base64_dec_map[src[i]] < 64 && j != 0 ) + if( dec_map_lookup < 64 && j != 0 ) return( MBEDTLS_ERR_BASE64_INVALID_CHARACTER ); n++; @@ -235,8 +342,10 @@ int mbedtls_base64_decode( unsigned char *dst, size_t dlen, size_t *olen, if( *src == '\r' || *src == '\n' || *src == ' ' ) continue; - j -= ( base64_dec_map[*src] == 64 ); - x = ( x << 6 ) | ( base64_dec_map[*src] & 0x3F ); + dec_map_lookup = mbedtls_base64_table_lookup( base64_dec_map, sizeof( base64_dec_map ), *src ); + + mbedtls_base64_cond_assign_uint32( &j, j - 1, mbedtls_base64_eq( dec_map_lookup, 64 ) ); + x = ( x << 6 ) | ( dec_map_lookup & 0x3F ); if( ++n == 4 ) { diff --git a/thirdparty/mbedtls/library/bignum.c b/thirdparty/mbedtls/library/bignum.c index 2feb727d89..f133f6c13c 100644 --- a/thirdparty/mbedtls/library/bignum.c +++ b/thirdparty/mbedtls/library/bignum.c @@ -1354,6 +1354,12 @@ int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi for( n = B->n; n > 0; n-- ) if( B->p[n - 1] != 0 ) break; + if( n > A->n ) + { + /* B >= (2^ciL)^n > A */ + ret = MBEDTLS_ERR_MPI_NEGATIVE_VALUE; + goto cleanup; + } carry = mpi_sub_hlp( n, X->p, B->p ); if( carry != 0 ) diff --git a/thirdparty/mbedtls/library/ctr_drbg.c b/thirdparty/mbedtls/library/ctr_drbg.c index e92008bbe8..90264e844a 100644 --- a/thirdparty/mbedtls/library/ctr_drbg.c +++ b/thirdparty/mbedtls/library/ctr_drbg.c @@ -83,10 +83,6 @@ void mbedtls_ctr_drbg_init( mbedtls_ctr_drbg_context *ctx ) memset( ctx, 0, sizeof( mbedtls_ctr_drbg_context ) ); ctx->reseed_interval = MBEDTLS_CTR_DRBG_RESEED_INTERVAL; - -#if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_init( &ctx->mutex ); -#endif } /* @@ -99,14 +95,13 @@ void mbedtls_ctr_drbg_free( mbedtls_ctr_drbg_context *ctx ) return; #if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_free( &ctx->mutex ); + /* The mutex is initialized iff f_entropy is set. 
*/ + if( ctx->f_entropy != NULL ) + mbedtls_mutex_free( &ctx->mutex ); #endif mbedtls_aes_free( &ctx->aes_ctx ); mbedtls_platform_zeroize( ctx, sizeof( mbedtls_ctr_drbg_context ) ); ctx->reseed_interval = MBEDTLS_CTR_DRBG_RESEED_INTERVAL; -#if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_init( &ctx->mutex ); -#endif } void mbedtls_ctr_drbg_set_prediction_resistance( mbedtls_ctr_drbg_context *ctx, int resistance ) @@ -422,6 +417,11 @@ int mbedtls_ctr_drbg_seed( mbedtls_ctr_drbg_context *ctx, memset( key, 0, MBEDTLS_CTR_DRBG_KEYSIZE ); + /* The mutex is initialized iff f_entropy is set. */ +#if defined(MBEDTLS_THREADING_C) + mbedtls_mutex_init( &ctx->mutex ); +#endif + mbedtls_aes_init( &ctx->aes_ctx ); ctx->f_entropy = f_entropy; diff --git a/thirdparty/mbedtls/library/ecdsa.c b/thirdparty/mbedtls/library/ecdsa.c index da8df9cde2..2456238b17 100644 --- a/thirdparty/mbedtls/library/ecdsa.c +++ b/thirdparty/mbedtls/library/ecdsa.c @@ -247,6 +247,9 @@ static void ecdsa_restart_det_free( mbedtls_ecdsa_restart_det_ctx *ctx ) #endif /* MBEDTLS_ECP_RESTARTABLE */ +#if defined(MBEDTLS_ECDSA_DETERMINISTIC) || \ + !defined(MBEDTLS_ECDSA_SIGN_ALT) || \ + !defined(MBEDTLS_ECDSA_VERIFY_ALT) /* * Derive a suitable integer for group grp from a buffer of length len * SEC1 4.1.3 step 5 aka SEC1 4.1.4 step 3 @@ -269,6 +272,7 @@ static int derive_mpi( const mbedtls_ecp_group *grp, mbedtls_mpi *x, cleanup: return( ret ); } +#endif /* ECDSA_DETERMINISTIC || !ECDSA_SIGN_ALT || !ECDSA_VERIFY_ALT */ #if !defined(MBEDTLS_ECDSA_SIGN_ALT) /* @@ -780,6 +784,8 @@ int mbedtls_ecdsa_write_signature_restartable( mbedtls_ecdsa_context *ctx, (void) md_alg; #if defined(MBEDTLS_ECDSA_SIGN_ALT) + (void) rs_ctx; + MBEDTLS_MPI_CHK( mbedtls_ecdsa_sign( &ctx->grp, &r, &s, &ctx->d, hash, hlen, f_rng, p_rng ) ); #else @@ -888,6 +894,8 @@ int mbedtls_ecdsa_read_signature_restartable( mbedtls_ecdsa_context *ctx, goto cleanup; } #if defined(MBEDTLS_ECDSA_VERIFY_ALT) + (void) rs_ctx; + if( ( ret = mbedtls_ecdsa_verify( &ctx->grp, hash, hlen, &ctx->Q, &r, &s ) ) != 0 ) goto cleanup; diff --git a/thirdparty/mbedtls/library/ecjpake.c b/thirdparty/mbedtls/library/ecjpake.c index f6e24580c7..0532a295e6 100644 --- a/thirdparty/mbedtls/library/ecjpake.c +++ b/thirdparty/mbedtls/library/ecjpake.c @@ -850,6 +850,8 @@ static const unsigned char ecjpake_test_password[] = { 0x65, 0x73, 0x74 }; +#if !defined(MBEDTLS_ECJPAKE_ALT) + static const unsigned char ecjpake_test_x1[] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, @@ -994,6 +996,8 @@ cleanup: return( ret ); } +#endif /* ! MBEDTLS_ECJPAKE_ALT */ + /* For tests we don't need a secure RNG; * use the LGC from Numerical Recipes for simplicity */ static int ecjpake_lgc( void *p, unsigned char *out, size_t len ) @@ -1089,6 +1093,12 @@ int mbedtls_ecjpake_self_test( int verbose ) if( verbose != 0 ) mbedtls_printf( "passed\n" ); +#if !defined(MBEDTLS_ECJPAKE_ALT) + /* 'reference handshake' tests can only be run against implementations + * for which we have 100% control over how the random ephemeral keys + * are generated. This is only the case for the internal mbed TLS + * implementation, so these tests are skipped in case the internal + * implementation is swapped out for an alternative one. */ if( verbose != 0 ) mbedtls_printf( " ECJPAKE test #2 (reference handshake): " ); @@ -1137,6 +1147,7 @@ int mbedtls_ecjpake_self_test( int verbose ) if( verbose != 0 ) mbedtls_printf( "passed\n" ); +#endif /* ! 
MBEDTLS_ECJPAKE_ALT */ cleanup: mbedtls_ecjpake_free( &cli ); diff --git a/thirdparty/mbedtls/library/entropy.c b/thirdparty/mbedtls/library/entropy.c index 666c55654c..c5f414a010 100644 --- a/thirdparty/mbedtls/library/entropy.c +++ b/thirdparty/mbedtls/library/entropy.c @@ -146,6 +146,11 @@ void mbedtls_entropy_init( mbedtls_entropy_context *ctx ) void mbedtls_entropy_free( mbedtls_entropy_context *ctx ) { + /* If the context was already free, don't call free() again. + * This is important for mutexes which don't allow double-free. */ + if( ctx->accumulator_started == -1 ) + return; + #if defined(MBEDTLS_HAVEGE_C) mbedtls_havege_free( &ctx->havege_data ); #endif @@ -162,7 +167,7 @@ void mbedtls_entropy_free( mbedtls_entropy_context *ctx ) #endif ctx->source_count = 0; mbedtls_platform_zeroize( ctx->source, sizeof( ctx->source ) ); - ctx->accumulator_started = 0; + ctx->accumulator_started = -1; } int mbedtls_entropy_add_source( mbedtls_entropy_context *ctx, diff --git a/thirdparty/mbedtls/library/hmac_drbg.c b/thirdparty/mbedtls/library/hmac_drbg.c index 10cbd462ba..b45d61616f 100644 --- a/thirdparty/mbedtls/library/hmac_drbg.c +++ b/thirdparty/mbedtls/library/hmac_drbg.c @@ -84,10 +84,6 @@ void mbedtls_hmac_drbg_init( mbedtls_hmac_drbg_context *ctx ) memset( ctx, 0, sizeof( mbedtls_hmac_drbg_context ) ); ctx->reseed_interval = MBEDTLS_HMAC_DRBG_RESEED_INTERVAL; - -#if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_init( &ctx->mutex ); -#endif } /* @@ -159,6 +155,10 @@ int mbedtls_hmac_drbg_seed_buf( mbedtls_hmac_drbg_context *ctx, if( ( ret = mbedtls_md_setup( &ctx->md_ctx, md_info, 1 ) ) != 0 ) return( ret ); +#if defined(MBEDTLS_THREADING_C) + mbedtls_mutex_init( &ctx->mutex ); +#endif + /* * Set initial working state. * Use the V memory location, which is currently all 0, to initialize the @@ -284,6 +284,11 @@ int mbedtls_hmac_drbg_seed( mbedtls_hmac_drbg_context *ctx, if( ( ret = mbedtls_md_setup( &ctx->md_ctx, md_info, 1 ) ) != 0 ) return( ret ); + /* The mutex is initialized iff the md context is set up. */ +#if defined(MBEDTLS_THREADING_C) + mbedtls_mutex_init( &ctx->mutex ); +#endif + md_size = mbedtls_md_get_size( md_info ); /* @@ -451,14 +456,13 @@ void mbedtls_hmac_drbg_free( mbedtls_hmac_drbg_context *ctx ) return; #if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_free( &ctx->mutex ); + /* The mutex is initialized iff the md context is set up. */ + if( ctx->md_ctx.md_info != NULL ) + mbedtls_mutex_free( &ctx->mutex ); #endif mbedtls_md_free( &ctx->md_ctx ); mbedtls_platform_zeroize( ctx, sizeof( mbedtls_hmac_drbg_context ) ); ctx->reseed_interval = MBEDTLS_HMAC_DRBG_RESEED_INTERVAL; -#if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_init( &ctx->mutex ); -#endif } #if defined(MBEDTLS_FS_IO) diff --git a/thirdparty/mbedtls/library/net_sockets.c b/thirdparty/mbedtls/library/net_sockets.c index 1130408263..671115f15f 100644 --- a/thirdparty/mbedtls/library/net_sockets.c +++ b/thirdparty/mbedtls/library/net_sockets.c @@ -496,6 +496,13 @@ int mbedtls_net_poll( mbedtls_net_context *ctx, uint32_t rw, uint32_t timeout ) if( fd < 0 ) return( MBEDTLS_ERR_NET_INVALID_CONTEXT ); + /* A limitation of select() is that it only works with file descriptors + * that are strictly less than FD_SETSIZE. This is a limitation of the + * fd_set type. Error out early, because attempting to call FD_SET on a + * large file descriptor is a buffer overflow on typical platforms. 
*/ + if( fd >= FD_SETSIZE ) + return( MBEDTLS_ERR_NET_POLL_FAILED ); + #if defined(__has_feature) #if __has_feature(memory_sanitizer) /* Ensure that memory sanitizers consider read_fds and write_fds as @@ -615,6 +622,13 @@ int mbedtls_net_recv_timeout( void *ctx, unsigned char *buf, if( fd < 0 ) return( MBEDTLS_ERR_NET_INVALID_CONTEXT ); + /* A limitation of select() is that it only works with file descriptors + * that are strictly less than FD_SETSIZE. This is a limitation of the + * fd_set type. Error out early, because attempting to call FD_SET on a + * large file descriptor is a buffer overflow on typical platforms. */ + if( fd >= FD_SETSIZE ) + return( MBEDTLS_ERR_NET_POLL_FAILED ); + FD_ZERO( &read_fds ); FD_SET( fd, &read_fds ); diff --git a/thirdparty/mbedtls/library/pkwrite.c b/thirdparty/mbedtls/library/pkwrite.c index 150626c147..a770dfb93e 100644 --- a/thirdparty/mbedtls/library/pkwrite.c +++ b/thirdparty/mbedtls/library/pkwrite.c @@ -455,7 +455,7 @@ int mbedtls_pk_write_key_der( mbedtls_pk_context *key, unsigned char *buf, size_ * publicExponent INTEGER -- e 1 + 3 + MPI_MAX + 1 * } */ -#define RSA_PUB_DER_MAX_BYTES 38 + 2 * MBEDTLS_MPI_MAX_SIZE +#define RSA_PUB_DER_MAX_BYTES ( 38 + 2 * MBEDTLS_MPI_MAX_SIZE ) /* * RSA private keys: @@ -472,10 +472,10 @@ int mbedtls_pk_write_key_der( mbedtls_pk_context *key, unsigned char *buf, size_ * otherPrimeInfos OtherPrimeInfos OPTIONAL 0 (not supported) * } */ -#define MPI_MAX_SIZE_2 MBEDTLS_MPI_MAX_SIZE / 2 + \ - MBEDTLS_MPI_MAX_SIZE % 2 -#define RSA_PRV_DER_MAX_BYTES 47 + 3 * MBEDTLS_MPI_MAX_SIZE \ - + 5 * MPI_MAX_SIZE_2 +#define MPI_MAX_SIZE_2 ( MBEDTLS_MPI_MAX_SIZE / 2 + \ + MBEDTLS_MPI_MAX_SIZE % 2 ) +#define RSA_PRV_DER_MAX_BYTES ( 47 + 3 * MBEDTLS_MPI_MAX_SIZE \ + + 5 * MPI_MAX_SIZE_2 ) #else /* MBEDTLS_RSA_C */ @@ -496,7 +496,7 @@ int mbedtls_pk_write_key_der( mbedtls_pk_context *key, unsigned char *buf, size_ * + 2 * ECP_MAX (coords) [1] * } */ -#define ECP_PUB_DER_MAX_BYTES 30 + 2 * MBEDTLS_ECP_MAX_BYTES +#define ECP_PUB_DER_MAX_BYTES ( 30 + 2 * MBEDTLS_ECP_MAX_BYTES ) /* * EC private keys: @@ -507,7 +507,7 @@ int mbedtls_pk_write_key_der( mbedtls_pk_context *key, unsigned char *buf, size_ * publicKey [1] BIT STRING OPTIONAL 1 + 2 + [1] above * } */ -#define ECP_PRV_DER_MAX_BYTES 29 + 3 * MBEDTLS_ECP_MAX_BYTES +#define ECP_PRV_DER_MAX_BYTES ( 29 + 3 * MBEDTLS_ECP_MAX_BYTES ) #else /* MBEDTLS_ECP_C */ @@ -516,10 +516,10 @@ int mbedtls_pk_write_key_der( mbedtls_pk_context *key, unsigned char *buf, size_ #endif /* MBEDTLS_ECP_C */ -#define PUB_DER_MAX_BYTES RSA_PUB_DER_MAX_BYTES > ECP_PUB_DER_MAX_BYTES ? \ - RSA_PUB_DER_MAX_BYTES : ECP_PUB_DER_MAX_BYTES -#define PRV_DER_MAX_BYTES RSA_PRV_DER_MAX_BYTES > ECP_PRV_DER_MAX_BYTES ? \ - RSA_PRV_DER_MAX_BYTES : ECP_PRV_DER_MAX_BYTES +#define PUB_DER_MAX_BYTES ( RSA_PUB_DER_MAX_BYTES > ECP_PUB_DER_MAX_BYTES ? \ + RSA_PUB_DER_MAX_BYTES : ECP_PUB_DER_MAX_BYTES ) +#define PRV_DER_MAX_BYTES ( RSA_PRV_DER_MAX_BYTES > ECP_PRV_DER_MAX_BYTES ? 
\ + RSA_PRV_DER_MAX_BYTES : ECP_PRV_DER_MAX_BYTES ) int mbedtls_pk_write_pubkey_pem( mbedtls_pk_context *key, unsigned char *buf, size_t size ) { diff --git a/thirdparty/mbedtls/library/rsa.c b/thirdparty/mbedtls/library/rsa.c index a25c633bc6..c8c23dba8c 100644 --- a/thirdparty/mbedtls/library/rsa.c +++ b/thirdparty/mbedtls/library/rsa.c @@ -520,6 +520,9 @@ void mbedtls_rsa_init( mbedtls_rsa_context *ctx, mbedtls_rsa_set_padding( ctx, padding, hash_id ); #if defined(MBEDTLS_THREADING_C) + /* Set ctx->ver to nonzero to indicate that the mutex has been + * initialized and will need to be freed. */ + ctx->ver = 1; mbedtls_mutex_init( &ctx->mutex ); #endif } @@ -567,9 +570,6 @@ int mbedtls_rsa_gen_key( mbedtls_rsa_context *ctx, RSA_VALIDATE_RET( ctx != NULL ); RSA_VALIDATE_RET( f_rng != NULL ); - if( nbits < 128 || exponent < 3 || nbits % 2 != 0 ) - return( MBEDTLS_ERR_RSA_BAD_INPUT_DATA ); - /* * If the modulus is 1024 bit long or shorter, then the security strength of * the RSA algorithm is less than or equal to 80 bits and therefore an error @@ -582,6 +582,12 @@ int mbedtls_rsa_gen_key( mbedtls_rsa_context *ctx, mbedtls_mpi_init( &G ); mbedtls_mpi_init( &L ); + if( nbits < 128 || exponent < 3 || nbits % 2 != 0 ) + { + ret = MBEDTLS_ERR_RSA_BAD_INPUT_DATA; + goto cleanup; + } + /* * find primes P and Q with Q < P so that: * 1. |P-Q| > 2^( nbits / 2 - 100 ) @@ -659,7 +665,9 @@ cleanup: if( ret != 0 ) { mbedtls_rsa_free( ctx ); - return( MBEDTLS_ERR_RSA_KEY_GEN_FAILED + ret ); + if( ( -ret & ~0x7f ) == 0 ) + ret = MBEDTLS_ERR_RSA_KEY_GEN_FAILED + ret; + return( ret ); } return( 0 ); @@ -1106,10 +1114,10 @@ cleanup: mbedtls_mpi_free( &C ); mbedtls_mpi_free( &I ); - if( ret != 0 ) + if( ret != 0 && ret >= -0x007f ) return( MBEDTLS_ERR_RSA_PRIVATE_FAILED + ret ); - return( 0 ); + return( ret ); } #if defined(MBEDTLS_PKCS1_V21) @@ -2502,7 +2510,6 @@ int mbedtls_rsa_copy( mbedtls_rsa_context *dst, const mbedtls_rsa_context *src ) RSA_VALIDATE_RET( dst != NULL ); RSA_VALIDATE_RET( src != NULL ); - dst->ver = src->ver; dst->len = src->len; MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &dst->N, &src->N ) ); @@ -2561,7 +2568,12 @@ void mbedtls_rsa_free( mbedtls_rsa_context *ctx ) #endif /* MBEDTLS_RSA_NO_CRT */ #if defined(MBEDTLS_THREADING_C) - mbedtls_mutex_free( &ctx->mutex ); + /* Free the mutex, but only if it hasn't been freed already. */ + if( ctx->ver != 0 ) + { + mbedtls_mutex_free( &ctx->mutex ); + ctx->ver = 0; + } #endif } diff --git a/thirdparty/mbedtls/library/threading.c b/thirdparty/mbedtls/library/threading.c index f4f29cff5e..0dc5488c1a 100644 --- a/thirdparty/mbedtls/library/threading.c +++ b/thirdparty/mbedtls/library/threading.c @@ -98,6 +98,12 @@ static void threading_mutex_init_pthread( mbedtls_threading_mutex_t *mutex ) if( mutex == NULL ) return; + /* A nonzero value of is_valid indicates a successfully initialized + * mutex. This is a workaround for not being able to return an error + * code for this function. The lock/unlock functions return an error + * if is_valid is nonzero. The Mbed TLS unit test code uses this field + * to distinguish more states of the mutex; see helpers.function for + * details. 
*/ mutex->is_valid = pthread_mutex_init( &mutex->mutex, NULL ) == 0; } diff --git a/thirdparty/mbedtls/library/version_features.c b/thirdparty/mbedtls/library/version_features.c index cbf38dc2c2..8c8e815e9d 100644 --- a/thirdparty/mbedtls/library/version_features.c +++ b/thirdparty/mbedtls/library/version_features.c @@ -553,6 +553,9 @@ static const char *features[] = { #if defined(MBEDTLS_SSL_TRUNCATED_HMAC_COMPAT) "MBEDTLS_SSL_TRUNCATED_HMAC_COMPAT", #endif /* MBEDTLS_SSL_TRUNCATED_HMAC_COMPAT */ +#if defined(MBEDTLS_TEST_HOOKS) + "MBEDTLS_TEST_HOOKS", +#endif /* MBEDTLS_TEST_HOOKS */ #if defined(MBEDTLS_THREADING_ALT) "MBEDTLS_THREADING_ALT", #endif /* MBEDTLS_THREADING_ALT */ diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md index 4fcd766d22..3c52415f62 100644 --- a/thirdparty/meshoptimizer/LICENSE.md +++ b/thirdparty/meshoptimizer/LICENSE.md @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016-2020 Arseny Kapoulkine +Copyright (c) 2016-2021 Arseny Kapoulkine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp index f7d88c5136..f8aad7b49c 100644 --- a/thirdparty/meshoptimizer/clusterizer.cpp +++ b/thirdparty/meshoptimizer/clusterizer.cpp @@ -2,6 +2,7 @@ #include "meshoptimizer.h" #include <assert.h> +#include <float.h> #include <math.h> #include <string.h> @@ -12,6 +13,68 @@ namespace meshopt { +// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet +const size_t kMeshletMaxVertices = 255; + +// A reasonable limit is around 2*max_vertices or less +const size_t kMeshletMaxTriangles = 512; + +struct TriangleAdjacency2 +{ + unsigned int* counts; + unsigned int* offsets; + unsigned int* data; +}; + +static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) +{ + size_t face_count = index_count / 3; + + // allocate arrays + adjacency.counts = allocator.allocate<unsigned int>(vertex_count); + adjacency.offsets = allocator.allocate<unsigned int>(vertex_count); + adjacency.data = allocator.allocate<unsigned int>(index_count); + + // fill triangle counts + memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + + adjacency.counts[indices[i]]++; + } + + // fill offset table + unsigned int offset = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + adjacency.offsets[i] = offset; + offset += adjacency.counts[i]; + } + + assert(offset == index_count); + + // fill triangle data + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + + adjacency.data[adjacency.offsets[a]++] = unsigned(i); + adjacency.data[adjacency.offsets[b]++] = unsigned(i); + adjacency.data[adjacency.offsets[c]++] = unsigned(i); + } + + // fix offsets that have been disturbed by the previous pass + for (size_t i = 0; i < vertex_count; ++i) + { + assert(adjacency.offsets[i] >= adjacency.counts[i]); + + adjacency.offsets[i] -= adjacency.counts[i]; + } +} + static void computeBoundingSphere(float result[4], const float points[][3], size_t count) { assert(count > 0); @@ -82,13 +145,310 @@ static void computeBoundingSphere(float result[4], const float points[][3], size 
result[3] = radius; } +struct Cone +{ + float px, py, pz; + float nx, ny, nz; +}; + +static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius) +{ + float cone = 1.f - spread * cone_weight; + float cone_clamped = cone < 1e-3f ? 1e-3f : cone; + + return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped; +} + +static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count) +{ + Cone result = acc; + + float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count); + + result.px *= center_scale; + result.py *= center_scale; + result.pz *= center_scale; + + float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz; + float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length); + + result.nx *= axis_scale; + result.ny *= axis_scale; + result.nz *= axis_scale; + + return result; +} + +static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + (void)vertex_count; + + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + size_t face_count = index_count / 3; + + float mesh_area = 0; + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + const float* p0 = vertex_positions + vertex_stride_float * a; + const float* p1 = vertex_positions + vertex_stride_float * b; + const float* p2 = vertex_positions + vertex_stride_float * c; + + float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; + float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; + + float normalx = p10[1] * p20[2] - p10[2] * p20[1]; + float normaly = p10[2] * p20[0] - p10[0] * p20[2]; + float normalz = p10[0] * p20[1] - p10[1] * p20[0]; + + float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); + float invarea = (area == 0.f) ? 
0.f : 1.f / area; + + triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f; + triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f; + triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f; + + triangles[i].nx = normalx * invarea; + triangles[i].ny = normaly * invarea; + triangles[i].nz = normalz * invarea; + + mesh_area += area; + } + + return mesh_area; +} + +static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles) +{ + size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3; + + // fill 4b padding with 0 + while (offset & 3) + meshlet_triangles[offset++] = 0; +} + +static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles) +{ + unsigned char& av = used[a]; + unsigned char& bv = used[b]; + unsigned char& cv = used[c]; + + bool result = false; + + unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + + if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) + { + meshlets[meshlet_offset] = meshlet; + + for (size_t j = 0; j < meshlet.vertex_count; ++j) + used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff; + + finishMeshlet(meshlet, meshlet_triangles); + + meshlet.vertex_offset += meshlet.vertex_count; + meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding + meshlet.vertex_count = 0; + meshlet.triangle_count = 0; + + result = true; + } + + if (av == 0xff) + { + av = (unsigned char)meshlet.vertex_count; + meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a; + } + + if (bv == 0xff) + { + bv = (unsigned char)meshlet.vertex_count; + meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b; + } + + if (cv == 0xff) + { + cv = (unsigned char)meshlet.vertex_count; + meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c; + } + + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv; + meshlet.triangle_count++; + + return result; +} + +struct KDNode +{ + union + { + float split; + unsigned int index; + }; + + // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point) + // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children + unsigned int axis : 2; + unsigned int children : 30; +}; + +static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot) +{ + size_t m = 0; + + // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot + for (size_t i = 0; i < count; ++i) + { + float v = points[indices[i] * stride + axis]; + + // swap(m, i) unconditionally + unsigned int t = indices[m]; + indices[m] = indices[i]; + indices[i] = t; + + // when v >= pivot, we swap i with m without advancing it, preserving invariants + m += v < pivot; + } + + return m; +} + +static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count) +{ + assert(offset + count <= node_count); + (void)node_count; + + KDNode& result = nodes[offset]; + + result.index = indices[0]; + result.axis = 3; + result.children = unsigned(count - 1); + + // all remaining 
points are stored in nodes immediately following the leaf + for (size_t i = 1; i < count; ++i) + { + KDNode& tail = nodes[offset + i]; + + tail.index = indices[i]; + tail.axis = 3; + tail.children = ~0u >> 2; // bogus value to prevent misuse + } + + return offset + count; +} + +static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size) +{ + assert(count > 0); + assert(offset < node_count); + + if (count <= leaf_size) + return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); + + float mean[3] = {}; + float vars[3] = {}; + float runc = 1, runs = 1; + + // gather statistics on the points in the subtree using Welford's algorithm + for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) + { + const float* point = points + indices[i] * stride; + + for (int k = 0; k < 3; ++k) + { + float delta = point[k] - mean[k]; + mean[k] += delta * runs; + vars[k] += delta * (point[k] - mean[k]); + } + } + + // split axis is one where the variance is largest + unsigned int axis = vars[0] >= vars[1] && vars[0] >= vars[2] ? 0 : vars[1] >= vars[2] ? 1 + : 2; + + float split = mean[axis]; + size_t middle = kdtreePartition(indices, count, points, stride, axis, split); + + // when the partition is degenerate simply consolidate the points into a single node + if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2) + return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); + + KDNode& result = nodes[offset]; + + result.split = split; + result.axis = axis; + + // left subtree is right after our node + size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size); + + // distance to the right subtree is represented explicitly + result.children = unsigned(next_offset - offset - 1); + + return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size); +} + +static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit) +{ + const KDNode& node = nodes[root]; + + if (node.axis == 3) + { + // leaf + for (unsigned int i = 0; i <= node.children; ++i) + { + unsigned int index = nodes[root + i].index; + + if (emitted_flags[index]) + continue; + + const float* point = points + index * stride; + + float distance2 = + (point[0] - position[0]) * (point[0] - position[0]) + + (point[1] - position[1]) * (point[1] - position[1]) + + (point[2] - position[2]) * (point[2] - position[2]); + float distance = sqrtf(distance2); + + if (distance < limit) + { + result = index; + limit = distance; + } + } + } + else + { + // branch; we order recursion to process the node that search position is in first + float delta = position[node.axis] - node.split; + unsigned int first = (delta <= 0) ? 
0 : node.children; + unsigned int second = first ^ node.children; + + kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit); + + // only process the other node if it can have a match based on closest distance so far + if (fabsf(delta) <= limit) + kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit); + } +} + } // namespace meshopt size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles) { + using namespace meshopt; + assert(index_count % 3 == 0); - assert(max_vertices >= 3); - assert(max_triangles >= 1); + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + + (void)kMeshletMaxVertices; + (void)kMeshletMaxTriangles; // meshlet construction is limited by max vertices and max triangles per meshlet // the worst case is that the input is an unindexed stream since this equally stresses both limits @@ -100,77 +460,226 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles; } -size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) { + using namespace meshopt; + assert(index_count % 3 == 0); - assert(max_vertices >= 3); - assert(max_triangles >= 1); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned meshopt_Allocator allocator; - meshopt_Meshlet meshlet; - memset(&meshlet, 0, sizeof(meshlet)); + TriangleAdjacency2 adjacency = {}; + buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); + + unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count); + memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int)); + + size_t face_count = index_count / 3; + + unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count); + memset(emitted_flags, 0, face_count); + + // for each triangle, precompute centroid & normal to use for scoring + Cone* triangles = allocator.allocate<Cone>(face_count); + float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); + + // assuming each meshlet is a square patch, expected radius is sqrt(expected area) + float triangle_area_avg = face_count == 0 ? 
0.f : mesh_area / float(face_count) * 0.5f; + float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f; + + // build a kd-tree for nearest neighbor lookup + unsigned int* kdindices = allocator.allocate<unsigned int>(face_count); + for (size_t i = 0; i < face_count; ++i) + kdindices[i] = unsigned(i); - assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0])); - assert(max_triangles <= sizeof(meshlet.indices) / 3); + KDNode* nodes = allocator.allocate<KDNode>(face_count * 2); + kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8); // index of the vertex in the meshlet, 0xff if the vertex isn't used unsigned char* used = allocator.allocate<unsigned char>(vertex_count); memset(used, -1, vertex_count); - size_t offset = 0; + meshopt_Meshlet meshlet = {}; + size_t meshlet_offset = 0; - for (size_t i = 0; i < index_count; i += 3) - { - unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); + Cone meshlet_cone_acc = {}; - unsigned char& av = used[a]; - unsigned char& bv = used[b]; - unsigned char& cv = used[c]; + for (;;) + { + unsigned int best_triangle = ~0u; + unsigned int best_extra = 5; + float best_score = FLT_MAX; - unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) + for (size_t i = 0; i < meshlet.vertex_count; ++i) { - destination[offset++] = meshlet; + unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; + + unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbours_size = adjacency.counts[index]; + + for (size_t j = 0; j < neighbours_size; ++j) + { + unsigned int triangle = neighbours[j]; + assert(!emitted_flags[triangle]); + + unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + + // triangles that don't add new vertices to meshlets are max. 
priority + if (extra != 0) + { + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + extra = 0; + + extra++; + } + + // since topology-based priority is always more important than the score, we can skip scoring in some cases + if (extra > best_extra) + continue; + + const Cone& tri_cone = triangles[triangle]; + + float distance2 = + (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) + + (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) + + (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz); - for (size_t j = 0; j < meshlet.vertex_count; ++j) - used[meshlet.vertices[j]] = 0xff; + float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; - memset(&meshlet, 0, sizeof(meshlet)); + float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); + + // note that topology-based priority is always more important than the score + // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost + if (extra < best_extra || score < best_score) + { + best_triangle = triangle; + best_extra = extra; + best_score = score; + } + } } - if (av == 0xff) + if (best_triangle == ~0u) { - av = meshlet.vertex_count; - meshlet.vertices[meshlet.vertex_count++] = a; + float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz}; + unsigned int index = ~0u; + float limit = FLT_MAX; + + kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit); + + best_triangle = index; } - if (bv == 0xff) + if (best_triangle == ~0u) + break; + + unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + // add meshlet to the output; when the current meshlet is full we reset the accumulated bounds + if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles)) { - bv = meshlet.vertex_count; - meshlet.vertices[meshlet.vertex_count++] = b; + meshlet_offset++; + memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc)); } - if (cv == 0xff) + live_triangles[a]--; + live_triangles[b]--; + live_triangles[c]--; + + // remove emitted triangle from adjacency data + // this makes sure that we spend less time traversing these lists on subsequent iterations + for (size_t k = 0; k < 3; ++k) { - cv = meshlet.vertex_count; - meshlet.vertices[meshlet.vertex_count++] = c; + unsigned int index = indices[best_triangle * 3 + k]; + + unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbours_size = adjacency.counts[index]; + + for (size_t i = 0; i < neighbours_size; ++i) + { + unsigned int tri = neighbours[i]; + + if (tri == best_triangle) + { + neighbours[i] = neighbours[neighbours_size - 1]; + adjacency.counts[index]--; + break; + } + } } - meshlet.indices[meshlet.triangle_count][0] = av; - meshlet.indices[meshlet.triangle_count][1] = bv; - meshlet.indices[meshlet.triangle_count][2] = cv; - meshlet.triangle_count++; + // update aggregated meshlet cone data for scoring subsequent triangles + meshlet_cone_acc.px += triangles[best_triangle].px; + meshlet_cone_acc.py += triangles[best_triangle].py; + meshlet_cone_acc.pz += triangles[best_triangle].pz; + meshlet_cone_acc.nx += 
triangles[best_triangle].nx; + meshlet_cone_acc.ny += triangles[best_triangle].ny; + meshlet_cone_acc.nz += triangles[best_triangle].nz; + + emitted_flags[best_triangle] = 1; + } + + if (meshlet.triangle_count) + { + finishMeshlet(meshlet, meshlet_triangles); + + meshlets[meshlet_offset++] = meshlet; + } + + assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + return meshlet_offset; +} + +size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + + meshopt_Allocator allocator; + + // index of the vertex in the meshlet, 0xff if the vertex isn't used + unsigned char* used = allocator.allocate<unsigned char>(vertex_count); + memset(used, -1, vertex_count); + + meshopt_Meshlet meshlet = {}; + size_t meshlet_offset = 0; + + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + // appends triangle to the meshlet and writes previous meshlet to the output if full + meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles); } if (meshlet.triangle_count) - destination[offset++] = meshlet; + { + finishMeshlet(meshlet, meshlet_triangles); - assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + meshlets[meshlet_offset++] = meshlet; + } - return offset; + assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + return meshlet_offset; } meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) @@ -178,18 +687,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t using namespace meshopt; assert(index_count % 3 == 0); + assert(index_count / 3 <= kMeshletMaxTriangles); assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); - assert(index_count / 3 <= 256); - (void)vertex_count; size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // compute triangle normals and gather triangle corners - float normals[256][3]; - float corners[256][3][3]; + float normals[kMeshletMaxTriangles][3]; + float corners[kMeshletMaxTriangles][3][3]; size_t triangles = 0; for (size_t i = 0; i < index_count; i += 3) @@ -327,25 +835,23 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t return bounds; } -meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { + using namespace meshopt; + + 
assert(triangle_count <= kMeshletMaxTriangles); assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); - unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])]; + unsigned int indices[kMeshletMaxTriangles * 3]; - for (size_t i = 0; i < meshlet->triangle_count; ++i) + for (size_t i = 0; i < triangle_count * 3; ++i) { - unsigned int a = meshlet->vertices[meshlet->indices[i][0]]; - unsigned int b = meshlet->vertices[meshlet->indices[i][1]]; - unsigned int c = meshlet->vertices[meshlet->indices[i][2]]; - - assert(a < vertex_count && b < vertex_count && c < vertex_count); + unsigned int index = meshlet_vertices[meshlet_triangles[i]]; + assert(index < vertex_count); - indices[i * 3 + 0] = a; - indices[i * 3 + 1] = b; - indices[i * 3 + 2] = c; + indices[i] = index; } - return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); + return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); } diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp index aa4a30efa4..f60db0dc4f 100644 --- a/thirdparty/meshoptimizer/indexgenerator.cpp +++ b/thirdparty/meshoptimizer/indexgenerator.cpp @@ -4,6 +4,8 @@ #include <assert.h> #include <string.h> +// This work is based on: +// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010 namespace meshopt { @@ -83,10 +85,49 @@ struct VertexStreamHasher } }; +struct EdgeHasher +{ + const unsigned int* remap; + + size_t hash(unsigned long long edge) const + { + unsigned int e0 = unsigned(edge >> 32); + unsigned int e1 = unsigned(edge); + + unsigned int h1 = remap[e0]; + unsigned int h2 = remap[e1]; + + const unsigned int m = 0x5bd1e995; + + // MurmurHash64B finalizer + h1 ^= h2 >> 18; + h1 *= m; + h2 ^= h1 >> 22; + h2 *= m; + h1 ^= h2 >> 17; + h1 *= m; + h2 ^= h1 >> 19; + h2 *= m; + + return h2; + } + + bool equal(unsigned long long lhs, unsigned long long rhs) const + { + unsigned int l0 = unsigned(lhs >> 32); + unsigned int l1 = unsigned(lhs); + + unsigned int r0 = unsigned(rhs >> 32); + unsigned int r1 = unsigned(rhs); + + return remap[l0] == remap[r0] && remap[l1] == remap[r1]; + } +}; + static size_t hashBuckets(size_t count) { size_t buckets = 1; - while (buckets < count) + while (buckets < count + count / 4) buckets *= 2; return buckets; @@ -119,6 +160,26 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c return 0; } +static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator) +{ + VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride}; + + size_t vertex_table_size = hashBuckets(vertex_count); + unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size); + memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int index = unsigned(i); + unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u); + + if (*entry == ~0u) + *entry = index; + + remap[index] = *entry; + } +} + } // namespace meshopt size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const 
void* vertices, size_t vertex_count, size_t vertex_size) @@ -345,3 +406,146 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns destination[i] = remap[index]; } } + +void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + static const int next[4] = {1, 2, 0, 1}; + + // build position remap: for each vertex, which other (canonical) vertex does it map to? + unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); + buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator); + + // build edge set; this stores all triangle edges but we can look these up by any other wedge + EdgeHasher edge_hasher = {remap}; + + size_t edge_table_size = hashBuckets(index_count); + unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size); + unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size); + + memset(edge_table, -1, edge_table_size * sizeof(unsigned long long)); + memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; i += 3) + { + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + unsigned int i2 = indices[i + next[e + 1]]; + assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count); + + unsigned long long edge = ((unsigned long long)i0 << 32) | i1; + unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + if (*entry == ~0ull) + { + *entry = edge; + + // store vertex opposite to the edge + edge_vertex_table[entry - edge_table] = i2; + } + } + } + + // build resulting index buffer: 6 indices for each input triangle + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int patch[6]; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + assert(i0 < vertex_count && i1 < vertex_count); + + // note: this refers to the opposite edge! + unsigned long long edge = ((unsigned long long)i1 << 32) | i0; + unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + patch[e * 2 + 0] = i0; + patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table]; + } + + memcpy(destination + i * 2, patch, sizeof(patch)); + } +} + +void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + static const int next[3] = {1, 2, 0}; + + // build position remap: for each vertex, which other (canonical) vertex does it map to? 
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); + buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator); + + // build edge set; this stores all triangle edges but we can look these up by any other wedge + EdgeHasher edge_hasher = {remap}; + + size_t edge_table_size = hashBuckets(index_count); + unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size); + memset(edge_table, -1, edge_table_size * sizeof(unsigned long long)); + + for (size_t i = 0; i < index_count; i += 3) + { + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + assert(i0 < vertex_count && i1 < vertex_count); + + unsigned long long edge = ((unsigned long long)i0 << 32) | i1; + unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + if (*entry == ~0ull) + *entry = edge; + } + } + + // build resulting index buffer: 12 indices for each input triangle + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int patch[12]; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + assert(i0 < vertex_count && i1 < vertex_count); + + // note: this refers to the opposite edge! + unsigned long long edge = ((unsigned long long)i1 << 32) | i0; + unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + // use the same edge if opposite edge doesn't exist (border) + oppe = (oppe == ~0ull) ? edge : oppe; + + // triangle index (0, 1, 2) + patch[e] = i0; + + // opposite edge (3, 4; 5, 6; 7, 8) + patch[3 + e * 2 + 0] = unsigned(oppe); + patch[3 + e * 2 + 1] = unsigned(oppe >> 32); + + // dominant vertex (9, 10, 11) + patch[9 + e] = remap[i0]; + } + + memcpy(destination + i * 4, patch, sizeof(patch)); + } +} diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h index 1714000384..e44b99ce52 100644 --- a/thirdparty/meshoptimizer/meshoptimizer.h +++ b/thirdparty/meshoptimizer/meshoptimizer.h @@ -1,7 +1,7 @@ /** - * meshoptimizer - version 0.15 + * meshoptimizer - version 0.16 * - * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Copyright (C) 2016-2021, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer * * This library is distributed under the MIT License. See notice at the end of this file. @@ -12,7 +12,7 @@ #include <stddef.h> /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 150 /* 0.15 */ +#define MESHOPTIMIZER_VERSION 160 /* 0.16 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API @@ -98,6 +98,35 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); /** + * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology + * Each triangle is converted into a 6-vertex patch with the following layout: + * - 0, 2, 4: original triangle vertices + * - 1, 3, 5: vertices adjacent to edges 02, 24 and 40 + * The resulting patch can be rendered with geometry shaders using e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY. 
+ * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering. + * + * destination must contain enough space for the resulting index buffer (index_count*2 elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Generate index buffer that can be used for PN-AEN tessellation with crack-free displacement + * Each triangle is converted into a 12-vertex patch with the following layout: + * - 0, 1, 2: original triangle vertices + * - 3, 4: opposing edge for edge 0, 1 + * - 5, 6: opposing edge for edge 1, 2 + * - 7, 8: opposing edge for edge 2, 0 + * - 9, 10, 11: dominant vertices for corners 0, 1, 2 + * The resulting patch can be rendered with hardware tessellation using PN-AEN and displacement mapping. + * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details. + * + * destination must contain enough space for the resulting index buffer (index_count*4 elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** * Vertex transform cache optimizer * Reorders indices to reduce the number of GPU vertex shader invocations * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. @@ -270,6 +299,11 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t ver MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); /** + * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex) + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count); + +/** * Experimental: Mesh simplifier (sloppy) * Reduces the number of triangles in the mesh, sacrificing mesh apperance for simplification performance * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. 
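Not part of the upstream diff: a minimal usage sketch for the two index-buffer generators declared above, assuming an indexed triangle mesh with tightly packed float3 positions (12-byte stride); destination sizes follow the `index_count*2` and `index_count*4` requirements stated in the doc comments.

```
// Illustrative only -- not from the meshoptimizer sources. Assumes `indices` and
// `positions` describe an indexed triangle mesh with tightly packed float3 positions.
#include <vector>
#include "meshoptimizer.h"

void build_gs_and_tessellation_buffers(const std::vector<unsigned int>& indices,
                                       const std::vector<float>& positions)
{
    size_t index_count = indices.size();        // must be a multiple of 3
    size_t vertex_count = positions.size() / 3; // 3 floats per vertex
    size_t position_stride = 3 * sizeof(float); // stride in bytes, positions tightly packed

    // Triangle-adjacency topology: each input triangle becomes a 6-vertex patch,
    // so the destination needs index_count * 2 entries.
    std::vector<unsigned int> adjacency(index_count * 2);
    meshopt_generateAdjacencyIndexBuffer(adjacency.data(), indices.data(), index_count,
                                         positions.data(), vertex_count, position_stride);

    // PN-AEN tessellation: each input triangle becomes a 12-vertex patch,
    // so the destination needs index_count * 4 entries.
    std::vector<unsigned int> patches(index_count * 4);
    meshopt_generateTessellationIndexBuffer(patches.data(), indices.data(), index_count,
                                            positions.data(), vertex_count, position_stride);
}
```

The same calls are also available through the templated overloads added further down in the header for meshes that use 16-bit index buffers.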
@@ -373,22 +407,31 @@ MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetc struct meshopt_Meshlet { - unsigned int vertices[64]; - unsigned char indices[126][3]; - unsigned char triangle_count; - unsigned char vertex_count; + /* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */ + unsigned int vertex_offset; + unsigned int triangle_offset; + + /* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */ + unsigned int vertex_count; + unsigned int triangle_count; }; /** * Experimental: Meshlet builder * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers. - * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first. + * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters. + * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first. * - * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound - * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126) + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound + * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices + * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512) + * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); struct meshopt_Bounds @@ -426,10 +469,10 @@ struct meshopt_Bounds * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable. 
* * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer - * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size) + * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size) */ MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); -MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Experimental: Spatial sorter @@ -513,6 +556,10 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, template <typename T> inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count); template <typename T> +inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +template <typename T> +inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +template <typename T> inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count); template <typename T> inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count); @@ -547,7 +594,9 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size template <typename T> inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); template <typename T> -inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); +inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); +template <typename T> +inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); template <typename T> inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template <typename T> @@ -762,6 +811,24 @@ inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indi } template <typename T> +inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* 
vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count * 2); + + meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); +} + +template <typename T> +inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count * 4); + + meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); +} + +template <typename T> inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count) { meshopt_IndexAdapter<T> in(0, indices, index_count); @@ -908,11 +975,19 @@ inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices } template <typename T> -inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight); +} + +template <typename T> +inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) { meshopt_IndexAdapter<T> in(0, indices, index_count); - return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles); + return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles); } template <typename T> @@ -934,7 +1009,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_ #endif /** - * Copyright (c) 2016-2020 Arseny Kapoulkine + * Copyright (c) 2016-2021 Arseny Kapoulkine * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation diff --git a/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch b/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch new file mode 100644 index 0000000000..cf648b0da3 --- /dev/null +++ b/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch @@ -0,0 +1,262 @@ +diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h +index fe8d349731..e44b99ce52 100644 +--- a/thirdparty/meshoptimizer/meshoptimizer.h ++++ b/thirdparty/meshoptimizer/meshoptimizer.h +@@ -298,6 +298,11 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t ver + */ + MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t 
index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); + ++/** ++ * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex) ++ */ ++MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count); ++ + /** + * Experimental: Mesh simplifier (sloppy) + * Reduces the number of triangles in the mesh, sacrificing mesh apperance for simplification performance +diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp +index b2cb589462..059cabb055 100644 +--- a/thirdparty/meshoptimizer/simplifier.cpp ++++ b/thirdparty/meshoptimizer/simplifier.cpp +@@ -20,6 +20,8 @@ + #define TRACESTATS(i) (void)0 + #endif + ++#define ATTRIBUTES 8 ++ + // This work is based on: + // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997 + // Michael Garland. Quadric-based polygonal surface simplification. 1999 +@@ -358,6 +360,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned + struct Vector3 + { + float x, y, z; ++ ++#if ATTRIBUTES ++ float a[ATTRIBUTES]; ++#endif + }; + + static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +@@ -414,6 +420,13 @@ struct Quadric + float a10, a20, a21; + float b0, b1, b2, c; + float w; ++ ++#if ATTRIBUTES ++ float gx[ATTRIBUTES]; ++ float gy[ATTRIBUTES]; ++ float gz[ATTRIBUTES]; ++ float gw[ATTRIBUTES]; ++#endif + }; + + struct Collapse +@@ -456,6 +469,16 @@ static void quadricAdd(Quadric& Q, const Quadric& R) + Q.b2 += R.b2; + Q.c += R.c; + Q.w += R.w; ++ ++#if ATTRIBUTES ++ for (int k = 0; k < ATTRIBUTES; ++k) ++ { ++ Q.gx[k] += R.gx[k]; ++ Q.gy[k] += R.gy[k]; ++ Q.gz[k] += R.gz[k]; ++ Q.gw[k] += R.gw[k]; ++ } ++#endif + } + + static float quadricError(const Quadric& Q, const Vector3& v) +@@ -481,6 +504,17 @@ static float quadricError(const Quadric& Q, const Vector3& v) + r += ry * v.y; + r += rz * v.z; + ++#if ATTRIBUTES ++ // see quadricUpdateAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr ++ for (int k = 0; k < ATTRIBUTES; ++k) ++ { ++ float a = v.a[k]; ++ ++ r += a * a * Q.w; ++ r -= 2 * a * (v.x * Q.gx[k] + v.y * Q.gy[k] + v.z * Q.gz[k] + Q.gw[k]); ++ } ++#endif ++ + float s = Q.w == 0.f ? 
0.f : 1.f / Q.w; + + return fabsf(r) * s; +@@ -504,6 +538,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo + Q.b2 = c * dw; + Q.c = d * dw; + Q.w = w; ++ ++#if ATTRIBUTES ++ memset(Q.gx, 0, sizeof(Q.gx)); ++ memset(Q.gy, 0, sizeof(Q.gy)); ++ memset(Q.gz, 0, sizeof(Q.gz)); ++ memset(Q.gw, 0, sizeof(Q.gw)); ++#endif + } + + static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) +@@ -556,6 +597,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3 + quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight); + } + ++#if ATTRIBUTES ++static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float w) ++{ ++ // for each attribute we want to encode the following function into the quadric: ++ // (eval(pos) - attr)^2 ++ // where eval(pos) interpolates attribute across the triangle like so: ++ // eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw ++ // where gx/gy/gz/gw are gradients ++ Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; ++ Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; ++ ++ // we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows: ++ // v = (d11 * d20 - d01 * d21) / denom ++ // w = (d00 * d21 - d01 * d20) / denom ++ // u = 1 - v - w ++ // here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj) ++ const Vector3& v0 = p10; ++ const Vector3& v1 = p20; ++ float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z; ++ float d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; ++ float d11 = v1.x * v1.x + v1.y * v1.y + v1.z * v1.z; ++ float denom = d00 * d11 - d01 * d01; ++ float denomr = denom == 0 ? 
0.f : 1.f / denom; ++ ++ // precompute gradient factors ++ // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes ++ float gx1 = (d11 * v0.x - d01 * v1.x) * denomr; ++ float gx2 = (d00 * v1.x - d01 * v0.x) * denomr; ++ float gy1 = (d11 * v0.y - d01 * v1.y) * denomr; ++ float gy2 = (d00 * v1.y - d01 * v0.y) * denomr; ++ float gz1 = (d11 * v0.z - d01 * v1.z) * denomr; ++ float gz2 = (d00 * v1.z - d01 * v0.z) * denomr; ++ ++ for (int k = 0; k < ATTRIBUTES; ++k) ++ { ++ float a0 = p0.a[k], a1 = p1.a[k], a2 = p2.a[k]; ++ ++ // compute gradient of eval(pos) for x/y/z/w ++ // the formulas below are obtained by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w ++ float gx = gx1 * (a1 - a0) + gx2 * (a2 - a0); ++ float gy = gy1 * (a1 - a0) + gy2 * (a2 - a0); ++ float gz = gz1 * (a1 - a0) + gz2 * (a2 - a0); ++ float gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz; ++ ++ // quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K ++ // since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields ++ Q.a00 += w * (gx * gx); ++ Q.a11 += w * (gy * gy); ++ Q.a22 += w * (gz * gz); ++ ++ Q.a10 += w * (gy * gx); ++ Q.a20 += w * (gz * gx); ++ Q.a21 += w * (gz * gy); ++ ++ Q.b0 += w * (gx * gw); ++ Q.b1 += w * (gy * gw); ++ Q.b2 += w * (gz * gw); ++ ++ Q.c += w * (gw * gw); ++ ++ // the only remaining sum components are ones that depend on attr; these will be addded during error evaluation, see quadricError ++ Q.gx[k] = w * gx; ++ Q.gy[k] = w * gy; ++ Q.gz[k] = w * gz; ++ Q.gw[k] = w * gw; ++ ++#if TRACE > 2 ++ printf("attr%d: %e %e %e\n", ++ k, ++ (gx * p0.x + gy * p0.y + gz * p0.z + gw - a0), ++ (gx * p1.x + gy * p1.y + gz * p1.z + gw - a1), ++ (gx * p2.x + gy * p2.y + gz * p2.z + gw - a2) ++ ); ++#endif ++ } ++} ++#endif ++ + static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) + { + for (size_t i = 0; i < index_count; i += 3) +@@ -567,6 +686,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic + Quadric Q; + quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f); + ++#if ATTRIBUTES ++ quadricUpdateAttributes(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], Q.w); ++#endif + quadricAdd(vertex_quadrics[remap[i0]], Q); + quadricAdd(vertex_quadrics[remap[i1]], Q); + quadricAdd(vertex_quadrics[remap[i2]], Q); +@@ -1259,13 +1381,19 @@ unsigned int* meshopt_simplifyDebugLoopBack = 0; + #endif + + size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error) ++{ ++ return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, out_result_error, 0, 0, 0); ++} ++ ++size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* out_result_error, const float* attributes, const float* attribute_weights, size_t 
attribute_count) + { + using namespace meshopt; + + assert(index_count % 3 == 0); +- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); +- assert(vertex_positions_stride % sizeof(float) == 0); ++ assert(vertex_stride > 0 && vertex_stride <= 256); ++ assert(vertex_stride % sizeof(float) == 0); + assert(target_index_count <= index_count); ++ assert(attribute_count <= ATTRIBUTES); + + meshopt_Allocator allocator; + +@@ -1279,7 +1407,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + // build position remap that maps each vertex to the one with identical position + unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); + unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count); +- buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator); ++ buildPositionRemap(remap, wedge, vertex_data, vertex_count, vertex_stride, allocator); + + // classify vertices; vertex kind determines collapse rules, see kCanCollapse + unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count); +@@ -1303,7 +1431,21 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + #endif + + Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count); +- rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride); ++ rescalePositions(vertex_positions, vertex_data, vertex_count, vertex_stride); ++ ++#if ATTRIBUTES ++ for (size_t i = 0; i < vertex_count; ++i) ++ { ++ memset(vertex_positions[i].a, 0, sizeof(vertex_positions[i].a)); ++ ++ for (size_t k = 0; k < attribute_count; ++k) ++ { ++ float a = attributes[i * attribute_count + k]; ++ ++ vertex_positions[i].a[k] = a * attribute_weights[k]; ++ } ++ } ++#endif + + Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count); + memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric)); +@@ -1395,7 +1537,9 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + + // result_error is quadratic; we need to remap it back to linear + if (out_result_error) ++ { + *out_result_error = sqrtf(result_error); ++ } + + return result_count; + } diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp index 942db14461..0f10ebef4b 100644 --- a/thirdparty/meshoptimizer/simplifier.cpp +++ b/thirdparty/meshoptimizer/simplifier.cpp @@ -20,6 +20,8 @@ #define TRACESTATS(i) (void)0 #endif +#define ATTRIBUTES 8 + // This work is based on: // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997 // Michael Garland. Quadric-based polygonal surface simplification. 
1999 @@ -118,8 +120,13 @@ struct PositionHasher { const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); + // scramble bits to make sure that integer coordinates have entropy in lower bits + unsigned int x = key[0] ^ (key[0] >> 17); + unsigned int y = key[1] ^ (key[1] >> 17); + unsigned int z = key[2] ^ (key[2] >> 17); + // Optimized Spatial Hashing for Collision Detection of Deformable Objects - return (key[0] * 73856093) ^ (key[1] * 19349663) ^ (key[2] * 83492791); + return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791); } bool equal(unsigned int lhs, unsigned int rhs) const @@ -131,7 +138,7 @@ struct PositionHasher static size_t hashBuckets2(size_t count) { size_t buckets = 1; - while (buckets < count) + while (buckets < count + count / 4) buckets *= 2; return buckets; @@ -358,6 +365,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned struct Vector3 { float x, y, z; + +#if ATTRIBUTES + float a[ATTRIBUTES]; +#endif }; static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) @@ -414,6 +425,13 @@ struct Quadric float a10, a20, a21; float b0, b1, b2, c; float w; + +#if ATTRIBUTES + float gx[ATTRIBUTES]; + float gy[ATTRIBUTES]; + float gz[ATTRIBUTES]; + float gw[ATTRIBUTES]; +#endif }; struct Collapse @@ -456,6 +474,16 @@ static void quadricAdd(Quadric& Q, const Quadric& R) Q.b2 += R.b2; Q.c += R.c; Q.w += R.w; + +#if ATTRIBUTES + for (int k = 0; k < ATTRIBUTES; ++k) + { + Q.gx[k] += R.gx[k]; + Q.gy[k] += R.gy[k]; + Q.gz[k] += R.gz[k]; + Q.gw[k] += R.gw[k]; + } +#endif } static float quadricError(const Quadric& Q, const Vector3& v) @@ -481,6 +509,17 @@ static float quadricError(const Quadric& Q, const Vector3& v) r += ry * v.y; r += rz * v.z; +#if ATTRIBUTES + // see quadricUpdateAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr + for (int k = 0; k < ATTRIBUTES; ++k) + { + float a = v.a[k]; + + r += a * a * Q.w; + r -= 2 * a * (v.x * Q.gx[k] + v.y * Q.gy[k] + v.z * Q.gz[k] + Q.gw[k]); + } +#endif + float s = Q.w == 0.f ? 
0.f : 1.f / Q.w; return fabsf(r) * s; @@ -504,6 +543,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo Q.b2 = c * dw; Q.c = d * dw; Q.w = w; + +#if ATTRIBUTES + memset(Q.gx, 0, sizeof(Q.gx)); + memset(Q.gy, 0, sizeof(Q.gy)); + memset(Q.gz, 0, sizeof(Q.gz)); + memset(Q.gw, 0, sizeof(Q.gw)); +#endif } static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) @@ -556,6 +602,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3 quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight); } +#if ATTRIBUTES +static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float w) +{ + // for each attribute we want to encode the following function into the quadric: + // (eval(pos) - attr)^2 + // where eval(pos) interpolates attribute across the triangle like so: + // eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw + // where gx/gy/gz/gw are gradients + Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; + Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; + + // we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows: + // v = (d11 * d20 - d01 * d21) / denom + // w = (d00 * d21 - d01 * d20) / denom + // u = 1 - v - w + // here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj) + const Vector3& v0 = p10; + const Vector3& v1 = p20; + float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z; + float d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; + float d11 = v1.x * v1.x + v1.y * v1.y + v1.z * v1.z; + float denom = d00 * d11 - d01 * d01; + float denomr = denom == 0 ? 0.f : 1.f / denom; + + // precompute gradient factors + // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes + float gx1 = (d11 * v0.x - d01 * v1.x) * denomr; + float gx2 = (d00 * v1.x - d01 * v0.x) * denomr; + float gy1 = (d11 * v0.y - d01 * v1.y) * denomr; + float gy2 = (d00 * v1.y - d01 * v0.y) * denomr; + float gz1 = (d11 * v0.z - d01 * v1.z) * denomr; + float gz2 = (d00 * v1.z - d01 * v0.z) * denomr; + + for (int k = 0; k < ATTRIBUTES; ++k) + { + float a0 = p0.a[k], a1 = p1.a[k], a2 = p2.a[k]; + + // compute gradient of eval(pos) for x/y/z/w + // the formulas below are obtained by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w + float gx = gx1 * (a1 - a0) + gx2 * (a2 - a0); + float gy = gy1 * (a1 - a0) + gy2 * (a2 - a0); + float gz = gz1 * (a1 - a0) + gz2 * (a2 - a0); + float gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz; + + // quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K + // since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields + Q.a00 += w * (gx * gx); + Q.a11 += w * (gy * gy); + Q.a22 += w * (gz * gz); + + Q.a10 += w * (gy * gx); + Q.a20 += w * (gz * gx); + Q.a21 += w * (gz * gy); + + Q.b0 += w * (gx * gw); + Q.b1 += w * (gy * gw); + Q.b2 += w * (gz * gw); + + Q.c += w * (gw * gw); + + // the only remaining sum components are ones that depend on attr; these will be addded during error evaluation, see quadricError + Q.gx[k] = w * gx; + Q.gy[k] = w * gy; + Q.gz[k] = w * gz; + Q.gw[k] = w * gw; + +#if TRACE > 2 + printf("attr%d: %e %e %e\n", + k, + (gx * p0.x + gy * p0.y + gz * p0.z + gw - a0), + (gx * p1.x + gy 
* p1.y + gz * p1.z + gw - a1), + (gx * p2.x + gy * p2.y + gz * p2.z + gw - a2) + ); +#endif + } +} +#endif + static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) { for (size_t i = 0; i < index_count; i += 3) @@ -567,6 +691,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic Quadric Q; quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f); +#if ATTRIBUTES + quadricUpdateAttributes(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], Q.w); +#endif quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); quadricAdd(vertex_quadrics[remap[i2]], Q); @@ -1038,7 +1165,7 @@ struct IdHasher struct TriangleHasher { - unsigned int* indices; + const unsigned int* indices; size_t hash(unsigned int i) const { @@ -1253,19 +1380,26 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float } // namespace meshopt #ifndef NDEBUG -unsigned char* meshopt_simplifyDebugKind = 0; -unsigned int* meshopt_simplifyDebugLoop = 0; -unsigned int* meshopt_simplifyDebugLoopBack = 0; +// Note: this is only exposed for debug visualization purposes; do *not* use these in debug builds +MESHOPTIMIZER_API unsigned char* meshopt_simplifyDebugKind = 0; +MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = 0; +MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0; #endif size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error) { + return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, out_result_error, 0, 0, 0); +} + +size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count) +{ using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); + assert(vertex_stride > 0 && vertex_stride <= 256); + assert(vertex_stride % sizeof(float) == 0); assert(target_index_count <= index_count); + assert(attribute_count <= ATTRIBUTES); meshopt_Allocator allocator; @@ -1279,7 +1413,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, // build position remap that maps each vertex to the one with identical position unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count); - buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator); + buildPositionRemap(remap, wedge, vertex_data, vertex_count, vertex_stride, allocator); // classify vertices; vertex kind determines collapse rules, see kCanCollapse unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count); @@ -1303,7 +1437,21 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, #endif Vector3* vertex_positions = 
allocator.allocate<Vector3>(vertex_count); - rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride); + rescalePositions(vertex_positions, vertex_data, vertex_count, vertex_stride); + +#if ATTRIBUTES + for (size_t i = 0; i < vertex_count; ++i) + { + memset(vertex_positions[i].a, 0, sizeof(vertex_positions[i].a)); + + for (size_t k = 0; k < attribute_count; ++k) + { + float a = attributes[i * attribute_count + k]; + + vertex_positions[i].a[k] = a * attribute_weights[k]; + } + } +#endif Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count); memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric)); @@ -1395,7 +1543,9 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, // result_error is quadratic; we need to remap it back to linear if (out_result_error) + { *out_result_error = sqrtf(result_error); + } return result_count; } diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp index 2cbfaac367..5f3ec204ab 100644 --- a/thirdparty/meshoptimizer/vertexcodec.cpp +++ b/thirdparty/meshoptimizer/vertexcodec.cpp @@ -710,18 +710,12 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) SIMD_TARGET static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { - v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3); - - uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull; - uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull; + // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 + const uint64_t magic = 0x000103070f1f3f80ull; // TODO: This can use v8x16_bitmask in the future - uint64_t mask_2 = mask_1a | mask_1b; - uint64_t mask_4 = mask_2 | (mask_2 >> 16); - uint64_t mask_8 = mask_4 | (mask_4 >> 8); - - mask0 = uint8_t(mask_8); - mask1 = uint8_t(mask_8 >> 32); + mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56); + mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56); } SIMD_TARGET diff --git a/thirdparty/miniupnpc/LICENSE b/thirdparty/miniupnpc/LICENSE index 1460310752..6ddd381baa 100644 --- a/thirdparty/miniupnpc/LICENSE +++ b/thirdparty/miniupnpc/LICENSE @@ -1,5 +1,5 @@ -MiniUPnP Project -Copyright (c) 2005-2019, Thomas BERNARD +MiniUPnPc +Copyright (c) 2005-2020, Thomas BERNARD All rights reserved. Redistribution and use in source and binary forms, with or without @@ -24,3 +24,4 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/thirdparty/miniupnpc/miniupnpc/addr_is_reserved.c b/thirdparty/miniupnpc/miniupnpc/addr_is_reserved.c new file mode 100644 index 0000000000..7e586d7da2 --- /dev/null +++ b/thirdparty/miniupnpc/miniupnpc/addr_is_reserved.c @@ -0,0 +1,79 @@ +/* $Id: addr_is_reserved.c,v 1.4 2021/03/02 23:40:32 nanard Exp $ */ +/* vim: tabstop=4 shiftwidth=4 noexpandtab + * Project : miniupnp + * Web : http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ + * Author : Thomas BERNARD + * copyright (c) 2005-2021 Thomas Bernard + * This software is subjet to the conditions detailed in the + * provided LICENSE file. 
*/ +#ifdef _WIN32 +/* Win32 Specific includes and defines */ +#include <winsock2.h> +#include <ws2tcpip.h> +#if !defined(_MSC_VER) +#include <stdint.h> +#else /* !defined(_MSC_VER) */ +typedef unsigned long uint32_t; +#endif /* !defined(_MSC_VER) */ +#else /* _WIN32 */ +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#endif /* _WIN32 */ + +/* List of IP address blocks which are private / reserved and therefore not suitable for public external IP addresses */ +#define IP(a, b, c, d) (((a) << 24) + ((b) << 16) + ((c) << 8) + (d)) +#define MSK(m) (32-(m)) +static const struct { uint32_t address; uint32_t rmask; } reserved[] = { + { IP( 0, 0, 0, 0), MSK( 8) }, /* RFC1122 "This host on this network" */ + { IP( 10, 0, 0, 0), MSK( 8) }, /* RFC1918 Private-Use */ + { IP(100, 64, 0, 0), MSK(10) }, /* RFC6598 Shared Address Space */ + { IP(127, 0, 0, 0), MSK( 8) }, /* RFC1122 Loopback */ + { IP(169, 254, 0, 0), MSK(16) }, /* RFC3927 Link-Local */ + { IP(172, 16, 0, 0), MSK(12) }, /* RFC1918 Private-Use */ + { IP(192, 0, 0, 0), MSK(24) }, /* RFC6890 IETF Protocol Assignments */ + { IP(192, 0, 2, 0), MSK(24) }, /* RFC5737 Documentation (TEST-NET-1) */ + { IP(192, 31, 196, 0), MSK(24) }, /* RFC7535 AS112-v4 */ + { IP(192, 52, 193, 0), MSK(24) }, /* RFC7450 AMT */ + { IP(192, 88, 99, 0), MSK(24) }, /* RFC7526 6to4 Relay Anycast */ + { IP(192, 168, 0, 0), MSK(16) }, /* RFC1918 Private-Use */ + { IP(192, 175, 48, 0), MSK(24) }, /* RFC7534 Direct Delegation AS112 Service */ + { IP(198, 18, 0, 0), MSK(15) }, /* RFC2544 Benchmarking */ + { IP(198, 51, 100, 0), MSK(24) }, /* RFC5737 Documentation (TEST-NET-2) */ + { IP(203, 0, 113, 0), MSK(24) }, /* RFC5737 Documentation (TEST-NET-3) */ + { IP(224, 0, 0, 0), MSK( 4) }, /* RFC1112 Multicast */ + { IP(240, 0, 0, 0), MSK( 4) }, /* RFC1112 Reserved for Future Use + RFC919 Limited Broadcast */ +}; +#undef IP +#undef MSK + +/** + * @return 1 or 0 + */ +int addr_is_reserved(const char * addr_str) +{ + uint32_t addr_n, address; + size_t i; + +#if defined(_WIN32) && (!defined(_WIN32_WINNT_VISTA) || (_WIN32_WINNT < _WIN32_WINNT_VISTA)) + addr_n = inet_addr(addr_str); + if (addr_n == INADDR_NONE) + return 1; +#else + /* was : addr_n = inet_addr(addr_str); */ + if (inet_pton(AF_INET, addr_str, &addr_n) <= 0) { + /* error */ + return 1; + } +#endif + + address = ntohl(addr_n); + + for (i = 0; i < sizeof(reserved)/sizeof(reserved[0]); ++i) { + if ((address >> reserved[i].rmask) == (reserved[i].address >> reserved[i].rmask)) + return 1; + } + + return 0; +} diff --git a/thirdparty/miniupnpc/miniupnpc/addr_is_reserved.h b/thirdparty/miniupnpc/miniupnpc/addr_is_reserved.h new file mode 100644 index 0000000000..f8b5d66a09 --- /dev/null +++ b/thirdparty/miniupnpc/miniupnpc/addr_is_reserved.h @@ -0,0 +1,14 @@ +/* $Id: $ */ +/* vim: tabstop=4 shiftwidth=4 noexpandtab + * Project: miniupnp + * http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ + * Author: Thomas Bernard + * Copyright (c) 2005-2020 Thomas Bernard + * This software is subjects to the conditions detailed + * in the LICENCE file provided within this distribution */ +#ifndef ADDR_IS_RESERVED_H_INCLUDED +#define ADDR_IS_RESERVED_H_INCLUDED + +int addr_is_reserved(const char * addr_str); + +#endif /* ADDR_IS_RESERVED_H_INCLUDED */ diff --git a/thirdparty/miniupnpc/miniupnpc/connecthostport.c b/thirdparty/miniupnpc/miniupnpc/connecthostport.c index f3982e1a77..79f832b8db 100644 --- a/thirdparty/miniupnpc/miniupnpc/connecthostport.c +++ 
b/thirdparty/miniupnpc/miniupnpc/connecthostport.c @@ -1,8 +1,8 @@ -/* $Id: connecthostport.c,v 1.22 2019/10/13 17:22:08 nanard Exp $ */ +/* $Id: connecthostport.c,v 1.24 2020/11/09 19:26:53 nanard Exp $ */ /* vim: tabstop=4 shiftwidth=4 noexpandtab * Project : miniupnp * Author : Thomas Bernard - * Copyright (c) 2010-2019 Thomas Bernard + * Copyright (c) 2010-2020 Thomas Bernard * This software is subject to the conditions detailed in the * LICENCE file provided in this distribution. */ @@ -19,7 +19,7 @@ #include <ws2tcpip.h> #include <io.h> #define MAXHOSTNAMELEN 64 -#define snprintf _snprintf +#include "win32_snprintf.h" #define herror #define socklen_t int #else /* #ifdef _WIN32 */ diff --git a/thirdparty/miniupnpc/miniupnpc/listdevices.c b/thirdparty/miniupnpc/miniupnpc/listdevices.c deleted file mode 100644 index bd9ba57efc..0000000000 --- a/thirdparty/miniupnpc/miniupnpc/listdevices.c +++ /dev/null @@ -1,197 +0,0 @@ -/* $Id: listdevices.c,v 1.6 2015/07/23 20:40:08 nanard Exp $ */ -/* Project : miniupnp - * Author : Thomas Bernard - * Copyright (c) 2013-2015 Thomas Bernard - * This software is subject to the conditions detailed in the - * LICENCE file provided in this distribution. */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#ifdef _WIN32 -#include <winsock2.h> -#endif /* _WIN32 */ -#include "miniupnpc.h" - -struct upnp_dev_list { - struct upnp_dev_list * next; - char * descURL; - struct UPNPDev * * array; - size_t count; - size_t allocated_count; -}; - -#define ADD_DEVICE_COUNT_STEP 16 - -void add_device(struct upnp_dev_list * * list_head, struct UPNPDev * dev) -{ - struct upnp_dev_list * elt; - size_t i; - - if(dev == NULL) - return; - for(elt = *list_head; elt != NULL; elt = elt->next) { - if(strcmp(elt->descURL, dev->descURL) == 0) { - for(i = 0; i < elt->count; i++) { - if (strcmp(elt->array[i]->st, dev->st) == 0 && strcmp(elt->array[i]->usn, dev->usn) == 0) { - return; /* already found */ - } - } - if(elt->count >= elt->allocated_count) { - struct UPNPDev * * tmp; - elt->allocated_count += ADD_DEVICE_COUNT_STEP; - tmp = realloc(elt->array, elt->allocated_count * sizeof(struct UPNPDev *)); - if(tmp == NULL) { - fprintf(stderr, "Failed to realloc(%p, %lu)\n", elt->array, (unsigned long)(elt->allocated_count * sizeof(struct UPNPDev *))); - return; - } - elt->array = tmp; - } - elt->array[elt->count++] = dev; - return; - } - } - elt = malloc(sizeof(struct upnp_dev_list)); - if(elt == NULL) { - fprintf(stderr, "Failed to malloc(%lu)\n", (unsigned long)sizeof(struct upnp_dev_list)); - return; - } - elt->next = *list_head; - elt->descURL = strdup(dev->descURL); - if(elt->descURL == NULL) { - fprintf(stderr, "Failed to strdup(%s)\n", dev->descURL); - free(elt); - return; - } - elt->allocated_count = ADD_DEVICE_COUNT_STEP; - elt->array = malloc(ADD_DEVICE_COUNT_STEP * sizeof(struct UPNPDev *)); - if(elt->array == NULL) { - fprintf(stderr, "Failed to malloc(%lu)\n", (unsigned long)(ADD_DEVICE_COUNT_STEP * sizeof(struct UPNPDev *))); - free(elt->descURL); - free(elt); - return; - } - elt->array[0] = dev; - elt->count = 1; - *list_head = elt; -} - -void free_device(struct upnp_dev_list * elt) -{ - free(elt->descURL); - free(elt->array); - free(elt); -} - -int main(int argc, char * * argv) -{ - const char * searched_device = NULL; - const char * * searched_devices = NULL; - const char * multicastif = 0; - const char * minissdpdpath = 0; - int ipv6 = 0; - unsigned char ttl = 2; - int error = 0; - struct UPNPDev * devlist = 0; - struct UPNPDev * dev; - struct 
upnp_dev_list * sorted_list = NULL; - struct upnp_dev_list * dev_array; - int i; - -#ifdef _WIN32 - WSADATA wsaData; - int nResult = WSAStartup(MAKEWORD(2,2), &wsaData); - if(nResult != NO_ERROR) - { - fprintf(stderr, "WSAStartup() failed.\n"); - return -1; - } -#endif - - for(i = 1; i < argc; i++) { - if(strcmp(argv[i], "-6") == 0) - ipv6 = 1; - else if(strcmp(argv[i], "-d") == 0) { - if(++i >= argc) { - fprintf(stderr, "%s option needs one argument\n", "-d"); - return 1; - } - searched_device = argv[i]; - } else if(strcmp(argv[i], "-t") == 0) { - if(++i >= argc) { - fprintf(stderr, "%s option needs one argument\n", "-t"); - return 1; - } - ttl = (unsigned char)atoi(argv[i]); - } else if(strcmp(argv[i], "-l") == 0) { - if(++i >= argc) { - fprintf(stderr, "-l option needs at least one argument\n"); - return 1; - } - searched_devices = (const char * *)(argv + i); - break; - } else if(strcmp(argv[i], "-m") == 0) { - if(++i >= argc) { - fprintf(stderr, "-m option needs one argument\n"); - return 1; - } - multicastif = argv[i]; - } else { - printf("usage : %s [options] [-l <device1> <device2> ...]\n", argv[0]); - printf("options :\n"); - printf(" -6 : use IPv6\n"); - printf(" -m address/ifname : network interface to use for multicast\n"); - printf(" -d <device string> : search only for this type of device\n"); - printf(" -l <device1> <device2> ... : search only for theses types of device\n"); - printf(" -t ttl : set multicast TTL. Default value is 2.\n"); - printf(" -h : this help\n"); - return 1; - } - } - - if(searched_device) { - printf("searching UPnP device type %s\n", searched_device); - devlist = upnpDiscoverDevice(searched_device, - 2000, multicastif, minissdpdpath, - 0/*localport*/, ipv6, ttl, &error); - } else if(searched_devices) { - printf("searching UPnP device types :\n"); - for(i = 0; searched_devices[i]; i++) - printf("\t%s\n", searched_devices[i]); - devlist = upnpDiscoverDevices(searched_devices, - 2000, multicastif, minissdpdpath, - 0/*localport*/, ipv6, ttl, &error, 1); - } else { - printf("searching all UPnP devices\n"); - devlist = upnpDiscoverAll(2000, multicastif, minissdpdpath, - 0/*localport*/, ipv6, ttl, &error); - } - if(devlist) { - for(dev = devlist, i = 1; dev != NULL; dev = dev->pNext, i++) { - printf("%3d: %-48s\n", i, dev->st); - printf(" %s\n", dev->descURL); - printf(" %s\n", dev->usn); - add_device(&sorted_list, dev); - } - putchar('\n'); - for (dev_array = sorted_list; dev_array != NULL ; dev_array = dev_array->next) { - printf("%s :\n", dev_array->descURL); - for(i = 0; (unsigned)i < dev_array->count; i++) { - printf("%2d: %s\n", i+1, dev_array->array[i]->st); - printf(" %s\n", dev_array->array[i]->usn); - } - putchar('\n'); - } - freeUPNPDevlist(devlist); - while(sorted_list != NULL) { - dev_array = sorted_list; - sorted_list = sorted_list->next; - free_device(dev_array); - } - } else { - printf("no device found.\n"); - } - - return 0; -} - diff --git a/thirdparty/miniupnpc/miniupnpc/minisoap.c b/thirdparty/miniupnpc/miniupnpc/minisoap.c index f92b36ce89..78606672d5 100644 --- a/thirdparty/miniupnpc/miniupnpc/minisoap.c +++ b/thirdparty/miniupnpc/miniupnpc/minisoap.c @@ -1,8 +1,8 @@ -/* $Id: minisoap.c,v 1.25 2017/04/21 10:03:24 nanard Exp $ */ +/* $Id: minisoap.c,v 1.30 2020/11/09 19:27:42 nanard Exp $ */ /* vim: tabstop=4 shiftwidth=4 noexpandtab * Project : miniupnp * Author : Thomas Bernard - * Copyright (c) 2005-2018 Thomas Bernard + * Copyright (c) 2005-2020 Thomas Bernard * This software is subject to the conditions detailed in the * LICENCE file 
provided in this distribution. * @@ -13,7 +13,7 @@ #ifdef _WIN32 #include <io.h> #include <winsock2.h> -#define snprintf _snprintf +#include "win32_snprintf.h" #else #include <unistd.h> #include <sys/types.h> diff --git a/thirdparty/miniupnpc/miniupnpc/minissdpc.c b/thirdparty/miniupnpc/miniupnpc/minissdpc.c index 36244dedec..5d3a0fd049 100644 --- a/thirdparty/miniupnpc/miniupnpc/minissdpc.c +++ b/thirdparty/miniupnpc/miniupnpc/minissdpc.c @@ -1,15 +1,15 @@ -/* $Id: minissdpc.c,v 1.40 2019/04/23 12:12:55 nanard Exp $ */ +/* $Id: minissdpc.c,v 1.47 2021/03/02 23:38:30 nanard Exp $ */ /* vim: tabstop=4 shiftwidth=4 noexpandtab * Project : miniupnp - * Web : http://miniupnp.free.fr/ + * Web : http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ * Author : Thomas BERNARD - * copyright (c) 2005-2019 Thomas Bernard + * copyright (c) 2005-2021 Thomas Bernard * This software is subjet to the conditions detailed in the * provided LICENCE file. */ -/*#include <syslog.h>*/ #include <stdio.h> #include <string.h> #include <stdlib.h> +#include <time.h> #include <sys/types.h> #if defined (__NetBSD__) #include <net/if.h> @@ -20,7 +20,7 @@ #include <ws2tcpip.h> #include <io.h> #include <iphlpapi.h> -#define snprintf _snprintf +#include "win32_snprintf.h" #if !defined(_MSC_VER) #include <stdint.h> #else /* !defined(_MSC_VER) */ @@ -33,6 +33,12 @@ typedef unsigned short uint16_t; #define strncasecmp memicmp #endif /* defined(_MSC_VER) && (_MSC_VER >= 1400) */ #endif /* #ifndef strncasecmp */ +#if defined(WINAPI_FAMILY) && defined(WINAPI_FAMILY_PARTITION) +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP +#define in6addr_any in6addr_any_init +static const IN6_ADDR in6addr_any_init = {0}; +#endif +#endif #endif /* _WIN32 */ #if defined(__amigaos__) || defined(__amigaos4__) #include <sys/socket.h> @@ -66,7 +72,7 @@ struct sockaddr_un { #define HAS_IP_MREQN #endif -#if !defined(HAS_IP_MREQN) && !defined(_WIN32) +#ifndef _WIN32 #include <sys/ioctl.h> #if defined(__sun) || defined(__HAIKU__) #include <sys/sockio.h> @@ -445,6 +451,36 @@ parseMSEARCHReply(const char * reply, int size, } } +#if defined(CLOCK_MONOTONIC_FAST) +#define UPNP_CLOCKID CLOCK_MONOTONIC_FAST +#elif defined(CLOCK_MONOTONIC) +#define UPNP_CLOCKID CLOCK_MONOTONIC +#endif + +static int upnp_gettimeofday(struct timeval * tv) +{ +#if defined(_WIN32) +#if defined(_WIN32_WINNT_VISTA) && (_WIN32_WINNT >= _WIN32_WINNT_VISTA) + ULONGLONG ts = GetTickCount64(); +#else + DWORD ts = GetTickCount(); +#endif + tv->tv_sec = (long)(ts / 1000); + tv->tv_usec = (ts % 1000) * 1000; + return 0; /* success */ +#elif defined(CLOCK_MONOTONIC_FAST) || defined(CLOCK_MONOTONIC) + struct timespec ts; + int ret_code = clock_gettime(UPNP_CLOCKID, &ts); + if (ret_code == 0) + { + tv->tv_sec = ts.tv_sec; + tv->tv_usec = ts.tv_nsec / 1000; + } + return ret_code; +#else + return gettimeofday(tv, NULL); +#endif +} /* port upnp discover : SSDP protocol */ #define SSDP_PORT 1900 #define XSTR(s) STR(s) @@ -540,12 +576,17 @@ ssdpDiscoverDevices(const char * const deviceTypes[], * in order to give this ip to setsockopt(sudp, IPPROTO_IP, IP_MULTICAST_IF) */ if(!ipv6) { DWORD ifbestidx; +#if _WIN32_WINNT >= 0x0600 // _WIN32_WINNT_VISTA + // While we don't need IPv6 support, the IPv4 only funciton is not available in UWP apps. 
SOCKADDR_IN destAddr; memset(&destAddr, 0, sizeof(destAddr)); destAddr.sin_family = AF_INET; destAddr.sin_addr.s_addr = inet_addr("223.255.255.255"); destAddr.sin_port = 0; if (GetBestInterfaceEx((struct sockaddr *)&destAddr, &ifbestidx) == NO_ERROR) { +#else + if (GetBestInterface(inet_addr("223.255.255.255"), &ifbestidx) == NO_ERROR) { +#endif DWORD dwRetVal = NO_ERROR; PIP_ADAPTER_ADDRESSES pAddresses = NULL; ULONG outBufLen = 15360; @@ -672,6 +713,13 @@ ssdpDiscoverDevices(const char * const deviceTypes[], * MS Windows Vista and MS Windows Server 2008. * http://msdn.microsoft.com/en-us/library/bb408409%28v=vs.85%29.aspx */ unsigned int ifindex = if_nametoindex(multicastif); /* eth0, etc. */ + if(ifindex == 0) + { + if(error) + *error = MINISSDPC_INVALID_INPUT; + fprintf(stderr, "Invalid multicast interface name %s\n", multicastif); + goto error; + } if(setsockopt(sudp, IPPROTO_IPV6, IPV6_MULTICAST_IF, &ifindex, sizeof(ifindex)) < 0) { PRINT_SOCKET_ERROR("setsockopt IPV6_MULTICAST_IF"); @@ -683,7 +731,18 @@ ssdpDiscoverDevices(const char * const deviceTypes[], #endif } else { struct in_addr mc_if; - mc_if.s_addr = inet_addr(multicastif); /* ex: 192.168.x.x */ +#if defined(_WIN32) +#if defined(_WIN32_WINNT_VISTA) && (_WIN32_WINNT >= _WIN32_WINNT_VISTA) + InetPtonA(AF_INET, multicastif, &mc_if); +#else + mc_if.s_addr = inet_addr(multicastif); /* old Windows SDK do not support InetPtoA() */ +#endif +#else + /* was : mc_if.s_addr = inet_addr(multicastif); */ /* ex: 192.168.x.x */ + if (inet_pton(AF_INET, multicastif, &mc_if.s_addr) <= 0) { + mc_if.s_addr = INADDR_NONE; + } +#endif if(mc_if.s_addr != INADDR_NONE) { ((struct sockaddr_in *)&sockudp_r)->sin_addr.s_addr = mc_if.s_addr; @@ -692,16 +751,11 @@ ssdpDiscoverDevices(const char * const deviceTypes[], PRINT_SOCKET_ERROR("setsockopt IP_MULTICAST_IF"); } } else { -#ifdef HAS_IP_MREQN /* was not an ip address, try with an interface name */ +#ifndef _WIN32 +#ifdef HAS_IP_MREQN struct ip_mreqn reqn; /* only defined with -D_BSD_SOURCE or -D_GNU_SOURCE */ - memset(&reqn, 0, sizeof(struct ip_mreqn)); - reqn.imr_ifindex = if_nametoindex(multicastif); - if(setsockopt(sudp, IPPROTO_IP, IP_MULTICAST_IF, (const char *)&reqn, sizeof(reqn)) < 0) - { - PRINT_SOCKET_ERROR("setsockopt IP_MULTICAST_IF"); - } -#elif !defined(_WIN32) +#endif struct ifreq ifr; int ifrlen = sizeof(ifr); strncpy(ifr.ifr_name, multicastif, IFNAMSIZ); @@ -709,12 +763,30 @@ ssdpDiscoverDevices(const char * const deviceTypes[], if(ioctl(sudp, SIOCGIFADDR, &ifr, &ifrlen) < 0) { PRINT_SOCKET_ERROR("ioctl(...SIOCGIFADDR...)"); + goto error; } mc_if.s_addr = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; +#ifdef HAS_IP_MREQN + memset(&reqn, 0, sizeof(struct ip_mreqn)); + reqn.imr_address.s_addr = mc_if.s_addr; + reqn.imr_ifindex = if_nametoindex(multicastif); + if(reqn.imr_ifindex == 0) + { + if(error) + *error = MINISSDPC_INVALID_INPUT; + fprintf(stderr, "Invalid multicast ip address / interface name %s\n", multicastif); + goto error; + } + if(setsockopt(sudp, IPPROTO_IP, IP_MULTICAST_IF, (const char *)&reqn, sizeof(reqn)) < 0) + { + PRINT_SOCKET_ERROR("setsockopt IP_MULTICAST_IF"); + } +#else if(setsockopt(sudp, IPPROTO_IP, IP_MULTICAST_IF, (const char *)&mc_if, sizeof(mc_if)) < 0) { PRINT_SOCKET_ERROR("setsockopt IP_MULTICAST_IF"); } +#endif #else /* _WIN32 */ #ifdef DEBUG printf("Setting of multicast interface not supported with interface name.\n"); @@ -838,73 +910,84 @@ ssdpDiscoverDevices(const char * const deviceTypes[], /* Waiting for SSDP REPLY packet to M-SEARCH * 
if searchalltypes is set, enter the loop only * when the last deviceType is reached */ - if((sentok && !searchalltypes) || !deviceTypes[deviceIndex + 1]) do { - n = receivedata(sudp, bufr, sizeof(bufr), delay, &scope_id); - if (n < 0) { - /* error */ - if(error) - *error = MINISSDPC_SOCKET_ERROR; - goto error; - } else if (n == 0) { - /* no data or Time Out */ -#ifdef DEBUG - printf("NODATA or TIMEOUT\n"); -#endif /* DEBUG */ - if (devlist && !searchalltypes) { - /* found some devices, stop now*/ + if((sentok && !searchalltypes) || !deviceTypes[deviceIndex + 1]) { + struct timeval start = {0, 0}, current = {0, 0}; + upnp_gettimeofday(&start); + do { + n = receivedata(sudp, bufr, sizeof(bufr), delay, &scope_id); + if (n < 0) { + /* error */ if(error) - *error = MINISSDPC_SUCCESS; + *error = MINISSDPC_SOCKET_ERROR; goto error; - } - } else { - const char * descURL=NULL; - int urlsize=0; - const char * st=NULL; - int stsize=0; - const char * usn=NULL; - int usnsize=0; - parseMSEARCHReply(bufr, n, &descURL, &urlsize, &st, &stsize, &usn, &usnsize); - if(st&&descURL) { + } else if (n == 0) { + /* no data or Time Out */ #ifdef DEBUG - printf("M-SEARCH Reply:\n ST: %.*s\n USN: %.*s\n Location: %.*s\n", - stsize, st, usnsize, (usn?usn:""), urlsize, descURL); + printf("NODATA or TIMEOUT\n"); #endif /* DEBUG */ - for(tmp=devlist; tmp; tmp = tmp->pNext) { - if(strncmp(tmp->descURL, descURL, urlsize) == 0 && - tmp->descURL[urlsize] == '\0' && - strncmp(tmp->st, st, stsize) == 0 && - tmp->st[stsize] == '\0' && - (usnsize == 0 || strncmp(tmp->usn, usn, usnsize) == 0) && - tmp->usn[usnsize] == '\0') - break; - } - /* at the exit of the loop above, tmp is null if - * no duplicate device was found */ - if(tmp) - continue; - tmp = (struct UPNPDev *)malloc(sizeof(struct UPNPDev)+urlsize+stsize+usnsize); - if(!tmp) { - /* memory allocation error */ + if (devlist && !searchalltypes) { + /* found some devices, stop now*/ if(error) - *error = MINISSDPC_MEMORY_ERROR; + *error = MINISSDPC_SUCCESS; goto error; } - tmp->pNext = devlist; - tmp->descURL = tmp->buffer; - tmp->st = tmp->buffer + 1 + urlsize; - tmp->usn = tmp->st + 1 + stsize; - memcpy(tmp->buffer, descURL, urlsize); - tmp->buffer[urlsize] = '\0'; - memcpy(tmp->st, st, stsize); - tmp->buffer[urlsize+1+stsize] = '\0'; - if(usn != NULL) - memcpy(tmp->usn, usn, usnsize); - tmp->buffer[urlsize+1+stsize+1+usnsize] = '\0'; - tmp->scope_id = scope_id; - devlist = tmp; + } else { + const char * descURL=NULL; + int urlsize=0; + const char * st=NULL; + int stsize=0; + const char * usn=NULL; + int usnsize=0; + parseMSEARCHReply(bufr, n, &descURL, &urlsize, &st, &stsize, &usn, &usnsize); + if(st&&descURL) { +#ifdef DEBUG + printf("M-SEARCH Reply:\n ST: %.*s\n USN: %.*s\n Location: %.*s\n", + stsize, st, usnsize, (usn?usn:""), urlsize, descURL); +#endif /* DEBUG */ + for(tmp=devlist; tmp; tmp = tmp->pNext) { + if(strncmp(tmp->descURL, descURL, urlsize) == 0 && + tmp->descURL[urlsize] == '\0' && + strncmp(tmp->st, st, stsize) == 0 && + tmp->st[stsize] == '\0' && + (usnsize == 0 || strncmp(tmp->usn, usn, usnsize) == 0) && + tmp->usn[usnsize] == '\0') + break; + } + /* at the exit of the loop above, tmp is null if + * no duplicate device was found */ + if(tmp) + continue; + tmp = (struct UPNPDev *)malloc(sizeof(struct UPNPDev)+urlsize+stsize+usnsize+3); + if(!tmp) { + /* memory allocation error */ + if(error) + *error = MINISSDPC_MEMORY_ERROR; + goto error; + } + tmp->pNext = devlist; + tmp->descURL = tmp->buffer; + tmp->st = tmp->buffer + 1 + urlsize; + tmp->usn = 
tmp->st + 1 + stsize; + memcpy(tmp->buffer, descURL, urlsize); + tmp->buffer[urlsize] = '\0'; + memcpy(tmp->st, st, stsize); + tmp->buffer[urlsize+1+stsize] = '\0'; + if(usn != NULL) + memcpy(tmp->usn, usn, usnsize); + tmp->buffer[urlsize+1+stsize+1+usnsize] = '\0'; + tmp->scope_id = scope_id; + devlist = tmp; + } + if (upnp_gettimeofday(&current) >= 0) { + /* exit the loop if delay is reached */ + long interval = (current.tv_sec - start.tv_sec) * 1000; + interval += (current.tv_usec - start.tv_usec) / 1000; + if (interval > (long)delay) + break; + } } - } - } while(n > 0); + } while(n > 0); + } if(ipv6) { /* switch linklocal flag */ if(linklocal) { @@ -919,4 +1002,3 @@ error: closesocket(sudp); return devlist; } - diff --git a/thirdparty/miniupnpc/miniupnpc/miniupnpc.c b/thirdparty/miniupnpc/miniupnpc/miniupnpc.c index 95ab6cf56b..696af93237 100644 --- a/thirdparty/miniupnpc/miniupnpc/miniupnpc.c +++ b/thirdparty/miniupnpc/miniupnpc/miniupnpc.c @@ -1,9 +1,9 @@ -/* $Id: miniupnpc.c,v 1.154 2019/04/23 12:12:13 nanard Exp $ */ +/* $Id: miniupnpc.c,v 1.159 2021/03/02 23:36:32 nanard Exp $ */ /* vim: tabstop=4 shiftwidth=4 noexpandtab * Project : miniupnp - * Web : http://miniupnp.free.fr/ + * Web : http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ * Author : Thomas BERNARD - * copyright (c) 2005-2019 Thomas Bernard + * copyright (c) 2005-2021 Thomas Bernard * This software is subjet to the conditions detailed in the * provided LICENSE file. */ #include <stdlib.h> @@ -15,7 +15,7 @@ #include <ws2tcpip.h> #include <io.h> #include <iphlpapi.h> -#define snprintf _snprintf +#include "win32_snprintf.h" #define strdup _strdup #ifndef strncasecmp #if defined(_MSC_VER) && (_MSC_VER >= 1400) @@ -61,6 +61,7 @@ #include "minixml.h" #include "upnpcommands.h" #include "connecthostport.h" +#include "addr_is_reserved.h" /* compare the beginning of a string with a constant string */ #define COMPARE(str, cstr) (0==strncmp(str, cstr, sizeof(cstr) - 1)) @@ -73,24 +74,6 @@ #define SERVICEPREFIX "u" #define SERVICEPREFIX2 'u' -/* check if an ip address is a private (LAN) address - * see https://tools.ietf.org/html/rfc1918 */ -static int is_rfc1918addr(const char * addr) -{ - /* 192.168.0.0 - 192.168.255.255 (192.168/16 prefix) */ - if(COMPARE(addr, "192.168.")) - return 1; - /* 10.0.0.0 - 10.255.255.255 (10/8 prefix) */ - if(COMPARE(addr, "10.")) - return 1; - /* 172.16.0.0 - 172.31.255.255 (172.16/12 prefix) */ - if(COMPARE(addr, "172.")) { - if((atoi(addr + 4) | 0x0f) == 0x1f) - return 1; - } - return 0; -} - /* root description parsing */ MINIUPNP_LIBSPEC void parserootdesc(const char * buffer, int bufsize, struct IGDdatas * data) { @@ -337,6 +320,8 @@ upnpDiscoverDevices(const char * const deviceTypes[], return devlist; } } +#else /* !defined(_WIN32) && !defined(__amigaos__) && !defined(__amigaos4__) */ + (void)minissdpdsock; /* unused */ #endif /* !defined(_WIN32) && !defined(__amigaos__) && !defined(__amigaos4__) */ /* direct discovery if minissdpd responses are not sufficient */ @@ -643,8 +628,7 @@ UPNP_GetValidIGD(struct UPNPDev * devlist, /* checks that status is connected AND there is a external IP address assigned */ if(is_connected && (UPNP_GetExternalIPAddress(urls->controlURL, data->first.servicetype, extIpAddr) == 0)) { - if(!is_rfc1918addr(extIpAddr) && (extIpAddr[0] != '\0') - && (0 != strcmp(extIpAddr, "0.0.0.0"))) + if(!addr_is_reserved(extIpAddr)) goto free_and_return; } FreeUPNPUrls(urls); @@ -665,8 +649,7 @@ UPNP_GetValidIGD(struct UPNPDev * devlist, #endif if(is_connected && 
(UPNP_GetExternalIPAddress(urls->controlURL, data->first.servicetype, extIpAddr) == 0)) { - if(!is_rfc1918addr(extIpAddr) && (extIpAddr[0] != '\0') - && (0 != strcmp(extIpAddr, "0.0.0.0"))) + if(!addr_is_reserved(extIpAddr)) goto free_and_return; } FreeUPNPUrls(urls); diff --git a/thirdparty/miniupnpc/miniupnpc/miniupnpc.h b/thirdparty/miniupnpc/miniupnpc/miniupnpc.h index 8ddc282bd1..3aef8ea443 100644 --- a/thirdparty/miniupnpc/miniupnpc/miniupnpc.h +++ b/thirdparty/miniupnpc/miniupnpc/miniupnpc.h @@ -1,9 +1,9 @@ -/* $Id: miniupnpc.h,v 1.53 2018/05/07 11:05:16 nanard Exp $ */ +/* $Id: miniupnpc.h,v 1.58 2021/03/02 23:49:52 nanard Exp $ */ /* vim: tabstop=4 shiftwidth=4 noexpandtab * Project: miniupnp * http://miniupnp.free.fr/ * Author: Thomas Bernard - * Copyright (c) 2005-2018 Thomas Bernard + * Copyright (c) 2005-2021 Thomas Bernard * This software is subjects to the conditions detailed * in the LICENCE file provided within this distribution */ #ifndef MINIUPNPC_H_INCLUDED @@ -20,7 +20,7 @@ #define UPNPDISCOVER_MEMORY_ERROR (-102) /* versions : */ -#define MINIUPNPC_VERSION "2.1" +#define MINIUPNPC_VERSION "2.2.2" #define MINIUPNPC_API_VERSION 17 /* Source port: diff --git a/thirdparty/miniupnpc/miniupnpc/miniupnpc_socketdef.h b/thirdparty/miniupnpc/miniupnpc/miniupnpc_socketdef.h index d4f79a7bd6..5986e58c76 100644 --- a/thirdparty/miniupnpc/miniupnpc/miniupnpc_socketdef.h +++ b/thirdparty/miniupnpc/miniupnpc/miniupnpc_socketdef.h @@ -7,7 +7,7 @@ #ifndef MINIUPNPC_SOCKETDEF_H_INCLUDED #define MINIUPNPC_SOCKETDEF_H_INCLUDED -#ifdef _MSC_VER +#ifdef _WIN32 #define ISINVALID(s) (INVALID_SOCKET==(s)) diff --git a/thirdparty/miniupnpc/miniupnpc/miniupnpcmodule.c b/thirdparty/miniupnpc/miniupnpc/miniupnpcmodule.c deleted file mode 100644 index d9341ab5bf..0000000000 --- a/thirdparty/miniupnpc/miniupnpc/miniupnpcmodule.c +++ /dev/null @@ -1,721 +0,0 @@ -/* $Id: miniupnpcmodule.c,v 1.34 2019/05/20 19:07:16 nanard Exp $*/ -/* vim: tabstop=4 shiftwidth=4 noexpandtab - * Project : miniupnp - * Author : Thomas BERNARD - * website : https://miniupnp.tuxfamily.org/ - * copyright (c) 2007-2019 Thomas Bernard - * This software is subjet to the conditions detailed in the - * provided LICENCE file. */ -#include <Python.h> -#define MINIUPNP_STATICLIB -#include "structmember.h" -#include "miniupnpc.h" -#include "upnpcommands.h" -#include "upnperrors.h" - -#ifdef _WIN32 -#include <winsock2.h> -#endif - -/* for compatibility with Python < 2.4 */ -#ifndef Py_RETURN_NONE -#define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None -#endif - -#ifndef Py_RETURN_TRUE -#define Py_RETURN_TRUE return Py_INCREF(Py_True), Py_True -#endif - -#ifndef Py_RETURN_FALSE -#define Py_RETURN_FALSE return Py_INCREF(Py_False), Py_False -#endif - -/* for compatibility with Python < 3.0 */ -#ifndef PyVarObject_HEAD_INIT -#define PyVarObject_HEAD_INIT(type, size) \ - PyObject_HEAD_INIT(type) size, -#endif - -#ifndef Py_TYPE -#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) -#endif - -typedef struct { - PyObject_HEAD - /* Type-specific fields go here. 
*/ - struct UPNPDev * devlist; - struct UPNPUrls urls; - struct IGDdatas data; - unsigned int discoverdelay; /* value passed to upnpDiscover() */ - unsigned int localport; /* value passed to upnpDiscover() */ - char lanaddr[40]; /* our ip address on the LAN */ - char * multicastif; - char * minissdpdsocket; -} UPnPObject; - -static PyMemberDef UPnP_members[] = { - {"lanaddr", T_STRING_INPLACE, offsetof(UPnPObject, lanaddr), - READONLY, "ip address on the LAN" - }, - {"discoverdelay", T_UINT, offsetof(UPnPObject, discoverdelay), - 0/*READWRITE*/, "value in ms used to wait for SSDP responses" - }, - {"localport", T_UINT, offsetof(UPnPObject, localport), - 0/*READWRITE*/, - "If localport is set to UPNP_LOCAL_PORT_SAME(1) " - "SSDP packets will be sent from the source port " - "1900 (same as destination port), if set to " - "UPNP_LOCAL_PORT_ANY(0) system assign a source " - "port, any other value will be attempted as the " - "source port" - }, - /* T_STRING is allways readonly :( */ - {"multicastif", T_STRING, offsetof(UPnPObject, multicastif), - 0, "IP of the network interface to be used for multicast operations" - }, - {"minissdpdsocket", T_STRING, offsetof(UPnPObject, minissdpdsocket), - 0, "path of the MiniSSDPd unix socket" - }, - {NULL} -}; - - -static int UPnP_init(UPnPObject *self, PyObject *args, PyObject *kwds) -{ - char* multicastif = NULL; - char* minissdpdsocket = NULL; - static char *kwlist[] = { - "multicastif", "minissdpdsocket", "discoverdelay", - "localport", NULL - }; - - if(!PyArg_ParseTupleAndKeywords(args, kwds, "|zzII", kwlist, - &multicastif, - &minissdpdsocket, - &self->discoverdelay, - &self->localport)) - return -1; - - if(self->localport>1 && - (self->localport>65534||self->localport<1024)) { - PyErr_SetString(PyExc_Exception, "Invalid localport value"); - return -1; - } - if(multicastif) - self->multicastif = strdup(multicastif); - if(minissdpdsocket) - self->minissdpdsocket = strdup(minissdpdsocket); - - return 0; -} - -static void -UPnPObject_dealloc(UPnPObject *self) -{ - freeUPNPDevlist(self->devlist); - FreeUPNPUrls(&self->urls); - free(self->multicastif); - free(self->minissdpdsocket); - Py_TYPE(self)->tp_free((PyObject*)self); -} - -static PyObject * -UPnP_discover(UPnPObject *self) -{ - struct UPNPDev * dev; - int i; - PyObject *res = NULL; - if(self->devlist) - { - freeUPNPDevlist(self->devlist); - self->devlist = 0; - } - Py_BEGIN_ALLOW_THREADS - self->devlist = upnpDiscover((int)self->discoverdelay/*timeout in ms*/, - self->multicastif, - self->minissdpdsocket, - (int)self->localport, - 0/*ip v6*/, - 2/* TTL */, - 0/*error */); - Py_END_ALLOW_THREADS - /* Py_RETURN_NONE ??? */ - for(dev = self->devlist, i = 0; dev; dev = dev->pNext) - i++; - res = Py_BuildValue("i", i); - return res; -} - -static PyObject * -UPnP_selectigd(UPnPObject *self) -{ - int r; -Py_BEGIN_ALLOW_THREADS - r = UPNP_GetValidIGD(self->devlist, &self->urls, &self->data, - self->lanaddr, sizeof(self->lanaddr)); -Py_END_ALLOW_THREADS - if(r) - { - return Py_BuildValue("s", self->urls.controlURL); - } - else - { - /* TODO: have our own exception type ! 
*/ - PyErr_SetString(PyExc_Exception, "No UPnP device discovered"); - return NULL; - } -} - -static PyObject * -UPnP_totalbytesent(UPnPObject *self) -{ - UNSIGNED_INTEGER i; -Py_BEGIN_ALLOW_THREADS - i = UPNP_GetTotalBytesSent(self->urls.controlURL_CIF, - self->data.CIF.servicetype); -Py_END_ALLOW_THREADS -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("I", i); -#else - return Py_BuildValue("i", (int)i); -#endif -} - -static PyObject * -UPnP_totalbytereceived(UPnPObject *self) -{ - UNSIGNED_INTEGER i; -Py_BEGIN_ALLOW_THREADS - i = UPNP_GetTotalBytesReceived(self->urls.controlURL_CIF, - self->data.CIF.servicetype); -Py_END_ALLOW_THREADS -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("I", i); -#else - return Py_BuildValue("i", (int)i); -#endif -} - -static PyObject * -UPnP_totalpacketsent(UPnPObject *self) -{ - UNSIGNED_INTEGER i; -Py_BEGIN_ALLOW_THREADS - i = UPNP_GetTotalPacketsSent(self->urls.controlURL_CIF, - self->data.CIF.servicetype); -Py_END_ALLOW_THREADS -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("I", i); -#else - return Py_BuildValue("i", (int)i); -#endif -} - -static PyObject * -UPnP_totalpacketreceived(UPnPObject *self) -{ - UNSIGNED_INTEGER i; -Py_BEGIN_ALLOW_THREADS - i = UPNP_GetTotalPacketsReceived(self->urls.controlURL_CIF, - self->data.CIF.servicetype); -Py_END_ALLOW_THREADS -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("I", i); -#else - return Py_BuildValue("i", (int)i); -#endif -} - -static PyObject * -UPnP_statusinfo(UPnPObject *self) -{ - char status[64]; - char lastconnerror[64]; - unsigned int uptime = 0; - int r; - status[0] = '\0'; - lastconnerror[0] = '\0'; -Py_BEGIN_ALLOW_THREADS - r = UPNP_GetStatusInfo(self->urls.controlURL, self->data.first.servicetype, - status, &uptime, lastconnerror); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("(s,I,s)", status, uptime, lastconnerror); -#else - return Py_BuildValue("(s,i,s)", status, (int)uptime, lastconnerror); -#endif - } else { - /* TODO: have our own exception type ! */ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -static PyObject * -UPnP_connectiontype(UPnPObject *self) -{ - char connectionType[64]; - int r; - connectionType[0] = '\0'; -Py_BEGIN_ALLOW_THREADS - r = UPNP_GetConnectionTypeInfo(self->urls.controlURL, - self->data.first.servicetype, - connectionType); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { - return Py_BuildValue("s", connectionType); - } else { - /* TODO: have our own exception type ! */ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -static PyObject * -UPnP_externalipaddress(UPnPObject *self) -{ - char externalIPAddress[40]; - int r; - externalIPAddress[0] = '\0'; -Py_BEGIN_ALLOW_THREADS - r = UPNP_GetExternalIPAddress(self->urls.controlURL, - self->data.first.servicetype, - externalIPAddress); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { - return Py_BuildValue("s", externalIPAddress); - } else { - /* TODO: have our own exception type ! 
*/ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -/* AddPortMapping(externalPort, protocol, internalHost, internalPort, desc, - * remoteHost, leaseDuration) - * protocol is 'UDP' or 'TCP' */ -static PyObject * -UPnP_addportmapping(UPnPObject *self, PyObject *args) -{ - char extPort[6]; - unsigned short ePort; - char inPort[6]; - unsigned short iPort; - const char * proto; - const char * host; - const char * desc; - const char * remoteHost; - unsigned int intLeaseDuration = 0; - char strLeaseDuration[12]; - int r; -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - if (!PyArg_ParseTuple(args, "HssHzz|I", &ePort, &proto, - &host, &iPort, &desc, &remoteHost, &intLeaseDuration)) -#else - if (!PyArg_ParseTuple(args, "HssHzz|i", &ePort, &proto, - &host, &iPort, &desc, &remoteHost, (int *)&intLeaseDuration)) -#endif - return NULL; -Py_BEGIN_ALLOW_THREADS - sprintf(extPort, "%hu", ePort); - sprintf(inPort, "%hu", iPort); - sprintf(strLeaseDuration, "%u", intLeaseDuration); - r = UPNP_AddPortMapping(self->urls.controlURL, self->data.first.servicetype, - extPort, inPort, host, desc, proto, - remoteHost, strLeaseDuration); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) - { - Py_RETURN_TRUE; - } - else - { - // TODO: RAISE an Exception. See upnpcommands.h for errors codes. - // upnperrors.c - //Py_RETURN_FALSE; - /* TODO: have our own exception type ! */ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -/* AddAnyPortMapping(externalPort, protocol, internalHost, internalPort, desc, - * remoteHost) - * protocol is 'UDP' or 'TCP' */ -static PyObject * -UPnP_addanyportmapping(UPnPObject *self, PyObject *args) -{ - char extPort[6]; - unsigned short ePort; - char inPort[6]; - unsigned short iPort; - char reservedPort[6]; - const char * proto; - const char * host; - const char * desc; - const char * remoteHost; - const char * leaseDuration = "0"; - int r; - if (!PyArg_ParseTuple(args, "HssHzz", &ePort, &proto, &host, &iPort, &desc, &remoteHost)) - return NULL; -Py_BEGIN_ALLOW_THREADS - sprintf(extPort, "%hu", ePort); - sprintf(inPort, "%hu", iPort); - r = UPNP_AddAnyPortMapping(self->urls.controlURL, self->data.first.servicetype, - extPort, inPort, host, desc, proto, - remoteHost, leaseDuration, reservedPort); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { - return Py_BuildValue("i", atoi(reservedPort)); - } else { - /* TODO: have our own exception type ! */ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - - -/* DeletePortMapping(extPort, proto, removeHost='') - * proto = 'UDP', 'TCP' */ -static PyObject * -UPnP_deleteportmapping(UPnPObject *self, PyObject *args) -{ - char extPort[6]; - unsigned short ePort; - const char * proto; - const char * remoteHost = ""; - int r; - if(!PyArg_ParseTuple(args, "Hs|z", &ePort, &proto, &remoteHost)) - return NULL; -Py_BEGIN_ALLOW_THREADS - sprintf(extPort, "%hu", ePort); - r = UPNP_DeletePortMapping(self->urls.controlURL, self->data.first.servicetype, - extPort, proto, remoteHost); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { - Py_RETURN_TRUE; - } else { - /* TODO: have our own exception type ! 
*/ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -/* DeletePortMappingRange(extPort, proto, removeHost='') - * proto = 'UDP', 'TCP' */ -static PyObject * -UPnP_deleteportmappingrange(UPnPObject *self, PyObject *args) -{ - char extPortStart[6]; - unsigned short ePortStart; - char extPortEnd[6]; - unsigned short ePortEnd; - const char * proto; - unsigned char manage; - char manageStr[6]; - int r; - if(!PyArg_ParseTuple(args, "HHsb", &ePortStart, &ePortEnd, &proto, &manage)) - return NULL; -Py_BEGIN_ALLOW_THREADS - sprintf(extPortStart, "%hu", ePortStart); - sprintf(extPortEnd, "%hu", ePortEnd); - sprintf(manageStr, "%hu", (unsigned short)manage); - r = UPNP_DeletePortMappingRange(self->urls.controlURL, self->data.first.servicetype, - extPortStart, extPortEnd, proto, manageStr); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { - Py_RETURN_TRUE; - } else { - /* TODO: have our own exception type ! */ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -static PyObject * -UPnP_getportmappingnumberofentries(UPnPObject *self) -{ - unsigned int n = 0; - int r; -Py_BEGIN_ALLOW_THREADS - r = UPNP_GetPortMappingNumberOfEntries(self->urls.controlURL, - self->data.first.servicetype, - &n); -Py_END_ALLOW_THREADS - if(r==UPNPCOMMAND_SUCCESS) { -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("I", n); -#else - return Py_BuildValue("i", (int)n); -#endif - } else { - /* TODO: have our own exception type ! */ - PyErr_SetString(PyExc_Exception, strupnperror(r)); - return NULL; - } -} - -/* GetSpecificPortMapping(ePort, proto, remoteHost='') - * proto = 'UDP' or 'TCP' */ -static PyObject * -UPnP_getspecificportmapping(UPnPObject *self, PyObject *args) -{ - char extPort[6]; - unsigned short ePort; - const char * proto; - const char * remoteHost = ""; - char intClient[40]; - char intPort[6]; - unsigned short iPort; - char desc[80]; - char enabled[4]; - char leaseDuration[16]; - if(!PyArg_ParseTuple(args, "Hs|z", &ePort, &proto, &remoteHost)) - return NULL; - extPort[0] = '\0'; intClient[0] = '\0'; intPort[0] = '\0'; - desc[0] = '\0'; enabled[0] = '\0'; leaseDuration[0] = '\0'; -Py_BEGIN_ALLOW_THREADS - sprintf(extPort, "%hu", ePort); - UPNP_GetSpecificPortMappingEntry(self->urls.controlURL, - self->data.first.servicetype, - extPort, proto, remoteHost, - intClient, intPort, - desc, enabled, leaseDuration); -Py_END_ALLOW_THREADS - if(intClient[0]) - { - iPort = (unsigned short)atoi(intPort); - return Py_BuildValue("(s,H,s,O,i)", - intClient, iPort, desc, - PyBool_FromLong(atoi(enabled)), - atoi(leaseDuration)); - } - else - { - Py_RETURN_NONE; - } -} - -/* GetGenericPortMapping(index) */ -static PyObject * -UPnP_getgenericportmapping(UPnPObject *self, PyObject *args) -{ - int i, r; - char index[8]; - char intClient[40]; - char intPort[6]; - unsigned short iPort; - char extPort[6]; - unsigned short ePort; - char protocol[4]; - char desc[80]; - char enabled[6]; - char rHost[64]; - char duration[16]; /* lease duration */ - unsigned int dur; - if(!PyArg_ParseTuple(args, "i", &i)) - return NULL; -Py_BEGIN_ALLOW_THREADS - snprintf(index, sizeof(index), "%d", i); - rHost[0] = '\0'; enabled[0] = '\0'; - duration[0] = '\0'; desc[0] = '\0'; - extPort[0] = '\0'; intPort[0] = '\0'; intClient[0] = '\0'; - r = UPNP_GetGenericPortMappingEntry(self->urls.controlURL, - self->data.first.servicetype, - index, - extPort, intClient, intPort, - protocol, desc, enabled, rHost, - duration); -Py_END_ALLOW_THREADS - 
if(r==UPNPCOMMAND_SUCCESS) - { - ePort = (unsigned short)atoi(extPort); - iPort = (unsigned short)atoi(intPort); - dur = (unsigned int)strtoul(duration, 0, 0); -#if (PY_MAJOR_VERSION >= 3) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 3) - return Py_BuildValue("(H,s,(s,H),s,s,s,I)", - ePort, protocol, intClient, iPort, - desc, enabled, rHost, dur); -#else - return Py_BuildValue("(i,s,(s,i),s,s,s,i)", - (int)ePort, protocol, intClient, (int)iPort, - desc, enabled, rHost, (int)dur); -#endif - } - else - { - Py_RETURN_NONE; - } -} - -/* miniupnpc.UPnP object Method Table */ -static PyMethodDef UPnP_methods[] = { - {"discover", (PyCFunction)UPnP_discover, METH_NOARGS, - "discover UPnP IGD devices on the network" - }, - {"selectigd", (PyCFunction)UPnP_selectigd, METH_NOARGS, - "select a valid UPnP IGD among discovered devices" - }, - {"totalbytesent", (PyCFunction)UPnP_totalbytesent, METH_NOARGS, - "return the total number of bytes sent by UPnP IGD" - }, - {"totalbytereceived", (PyCFunction)UPnP_totalbytereceived, METH_NOARGS, - "return the total number of bytes received by UPnP IGD" - }, - {"totalpacketsent", (PyCFunction)UPnP_totalpacketsent, METH_NOARGS, - "return the total number of packets sent by UPnP IGD" - }, - {"totalpacketreceived", (PyCFunction)UPnP_totalpacketreceived, METH_NOARGS, - "return the total number of packets received by UPnP IGD" - }, - {"statusinfo", (PyCFunction)UPnP_statusinfo, METH_NOARGS, - "return status and uptime" - }, - {"connectiontype", (PyCFunction)UPnP_connectiontype, METH_NOARGS, - "return IGD WAN connection type" - }, - {"externalipaddress", (PyCFunction)UPnP_externalipaddress, METH_NOARGS, - "return external IP address" - }, - {"addportmapping", (PyCFunction)UPnP_addportmapping, METH_VARARGS, - "add a port mapping" - }, - {"addanyportmapping", (PyCFunction)UPnP_addanyportmapping, METH_VARARGS, - "add a port mapping, IGD to select alternative if necessary" - }, - {"deleteportmapping", (PyCFunction)UPnP_deleteportmapping, METH_VARARGS, - "delete a port mapping" - }, - {"deleteportmappingrange", (PyCFunction)UPnP_deleteportmappingrange, METH_VARARGS, - "delete a range of port mappings" - }, - {"getportmappingnumberofentries", (PyCFunction)UPnP_getportmappingnumberofentries, METH_NOARGS, - "-- non standard --" - }, - {"getspecificportmapping", (PyCFunction)UPnP_getspecificportmapping, METH_VARARGS, - "get details about a specific port mapping entry" - }, - {"getgenericportmapping", (PyCFunction)UPnP_getgenericportmapping, METH_VARARGS, - "get all details about the port mapping at index" - }, - {NULL} /* Sentinel */ -}; - -static PyTypeObject UPnPType = { - PyVarObject_HEAD_INIT(NULL, - 0) /*ob_size*/ - "miniupnpc.UPnP", /*tp_name*/ - sizeof(UPnPObject), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)UPnPObject_dealloc,/*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "UPnP objects", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - UPnP_methods, /* tp_methods */ - UPnP_members, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)UPnP_init, /* tp_init */ - 0, /* 
tp_alloc */ -#ifndef _WIN32 - PyType_GenericNew,/*UPnP_new,*/ /* tp_new */ -#else - 0, -#endif -}; - -/* module methods */ -static PyMethodDef miniupnpc_methods[] = { - {NULL} /* Sentinel */ -}; - -#if PY_MAJOR_VERSION >= 3 -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "miniupnpc", /* m_name */ - "miniupnpc module.", /* m_doc */ - -1, /* m_size */ - miniupnpc_methods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL, /* m_free */ -}; -#endif - -#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ -#define PyMODINIT_FUNC void -#endif - -PyMODINIT_FUNC -#if PY_MAJOR_VERSION >= 3 -PyInit_miniupnpc(void) -#else -initminiupnpc(void) -#endif -{ - PyObject* m; - -#ifdef _WIN32 - /* initialize Winsock. */ - WSADATA wsaData; - int nResult = WSAStartup(MAKEWORD(2,2), &wsaData); - if (nResult != 0) - { - /* error code could be WSASYSNOTREADY WSASYSNOTREADY - * WSASYSNOTREADY WSASYSNOTREADY WSASYSNOTREADY */ -#if PY_MAJOR_VERSION >= 3 - return 0; -#else - return; -#endif - } - - UPnPType.tp_new = PyType_GenericNew; -#endif - if (PyType_Ready(&UPnPType) < 0) -#if PY_MAJOR_VERSION >= 3 - return 0; -#else - return; -#endif - -#if PY_MAJOR_VERSION >= 3 - m = PyModule_Create(&moduledef); -#else - m = Py_InitModule3("miniupnpc", miniupnpc_methods, - "miniupnpc module."); -#endif - - Py_INCREF(&UPnPType); - PyModule_AddObject(m, "UPnP", (PyObject *)&UPnPType); - -#if PY_MAJOR_VERSION >= 3 - return m; -#endif -} - diff --git a/thirdparty/miniupnpc/miniupnpc/miniupnpcstrings.h b/thirdparty/miniupnpc/miniupnpc/miniupnpcstrings.h index a718cc7bbf..7b3d04074a 100644 --- a/thirdparty/miniupnpc/miniupnpc/miniupnpcstrings.h +++ b/thirdparty/miniupnpc/miniupnpc/miniupnpcstrings.h @@ -4,7 +4,7 @@ #include "core/version.h" #define OS_STRING VERSION_NAME "/1.0" -#define MINIUPNPC_VERSION_STRING "2.1" +#define MINIUPNPC_VERSION_STRING "2.2.2" #if 0 /* according to "UPnP Device Architecture 1.0" */ diff --git a/thirdparty/miniupnpc/miniupnpc/miniwget.c b/thirdparty/miniupnpc/miniupnpc/miniwget.c index 5c135f4efd..d5b7970632 100644 --- a/thirdparty/miniupnpc/miniupnpc/miniwget.c +++ b/thirdparty/miniupnpc/miniupnpc/miniwget.c @@ -1,8 +1,8 @@ -/* $Id: miniwget.c,v 1.78 2018/03/13 23:22:18 nanard Exp $ */ +/* $Id: miniwget.c,v 1.82 2020/05/29 21:14:22 nanard Exp $ */ /* Project : miniupnp - * Website : http://miniupnp.free.fr/ + * Website : http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ * Author : Thomas Bernard - * Copyright (c) 2005-2018 Thomas Bernard + * Copyright (c) 2005-2020 Thomas Bernard * This software is subject to the conditions detailed in the * LICENCE file provided in this distribution. 
*/ @@ -15,7 +15,7 @@ #include <ws2tcpip.h> #include <io.h> #define MAXHOSTNAMELEN 64 -#define snprintf _snprintf +#include "win32_snprintf.h" #define socklen_t int #ifndef strncasecmp #if defined(_MSC_VER) && (_MSC_VER >= 1400) @@ -176,11 +176,14 @@ getHTTPResponse(SOCKET s, int * size, int * status_code) /* Status line * HTTP-Version SP Status-Code SP Reason-Phrase CRLF */ int sp; - for(sp = 0; sp < i; sp++) + for(sp = 0; sp < i - 1; sp++) if(header_buf[sp] == ' ') { if(*status_code < 0) - *status_code = atoi(header_buf + sp + 1); + { + if (header_buf[sp+1] >= '1' && header_buf[sp+1] <= '9') + *status_code = atoi(header_buf + sp + 1); + } else { #ifdef DEBUG diff --git a/thirdparty/miniupnpc/miniupnpc/portlistingparse.c b/thirdparty/miniupnpc/miniupnpc/portlistingparse.c index 18d967b877..162cf8b7ec 100644 --- a/thirdparty/miniupnpc/miniupnpc/portlistingparse.c +++ b/thirdparty/miniupnpc/miniupnpc/portlistingparse.c @@ -15,7 +15,7 @@ #if defined(__HAIKU__) /* rename our private function because Haiku already defines a atoui() function */ #define atoui atoui2 -#endif +#endif /* list of the elements */ static const struct { diff --git a/thirdparty/miniupnpc/miniupnpc/receivedata.c b/thirdparty/miniupnpc/miniupnpc/receivedata.c index 7b9cc5b778..7f187f6e56 100644 --- a/thirdparty/miniupnpc/miniupnpc/receivedata.c +++ b/thirdparty/miniupnpc/miniupnpc/receivedata.c @@ -1,8 +1,8 @@ -/* $Id: receivedata.c,v 1.7 2015/11/09 21:51:41 nanard Exp $ */ +/* $Id: receivedata.c,v 1.10 2021/03/02 23:33:07 nanard Exp $ */ /* Project : miniupnp * Website : http://miniupnp.free.fr/ * Author : Thomas Bernard - * Copyright (c) 2011-2014 Thomas Bernard + * Copyright (c) 2011-2021 Thomas Bernard * This software is subject to the conditions detailed in the * LICENCE file provided in this distribution. */ @@ -92,7 +92,13 @@ receivedata(SOCKET socket, #endif /* DEBUG */ if(scope_id) *scope_id = src_addr6->sin6_scope_id; + } else { + if(scope_id) + *scope_id = 0; } +#else /* MINIUPNPC_GET_SRC_ADDR */ + if(scope_id) + *scope_id = 0; #endif /* MINIUPNPC_GET_SRC_ADDR */ return n; } diff --git a/thirdparty/miniupnpc/miniupnpc/upnpc.c b/thirdparty/miniupnpc/miniupnpc/upnpc.c deleted file mode 100644 index cb7f18b5f6..0000000000 --- a/thirdparty/miniupnpc/miniupnpc/upnpc.c +++ /dev/null @@ -1,864 +0,0 @@ -/* $Id: upnpc.c,v 1.119 2018/03/13 23:34:46 nanard Exp $ */ -/* Project : miniupnp - * Author : Thomas Bernard - * Copyright (c) 2005-2020 Thomas Bernard - * This software is subject to the conditions detailed in the - * LICENCE file provided in this distribution. 
*/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#ifdef _WIN32 -#include <winsock2.h> -#define snprintf _snprintf -#else -/* for IPPROTO_TCP / IPPROTO_UDP */ -#include <netinet/in.h> -#endif -#include <ctype.h> -#include "miniwget.h" -#include "miniupnpc.h" -#include "upnpcommands.h" -#include "portlistingparse.h" -#include "upnperrors.h" -#include "miniupnpcstrings.h" - -/* protofix() checks if protocol is "UDP" or "TCP" - * returns NULL if not */ -const char * protofix(const char * proto) -{ - static const char proto_tcp[4] = { 'T', 'C', 'P', 0}; - static const char proto_udp[4] = { 'U', 'D', 'P', 0}; - int i, b; - for(i=0, b=1; i<4; i++) - b = b && ( (proto[i] == proto_tcp[i]) - || (proto[i] == (proto_tcp[i] | 32)) ); - if(b) - return proto_tcp; - for(i=0, b=1; i<4; i++) - b = b && ( (proto[i] == proto_udp[i]) - || (proto[i] == (proto_udp[i] | 32)) ); - if(b) - return proto_udp; - return 0; -} - -/* is_int() checks if parameter is an integer or not - * 1 for integer - * 0 for not an integer */ -int is_int(char const* s) -{ - if(s == NULL) - return 0; - while(*s) { - /* #define isdigit(c) ((c) >= '0' && (c) <= '9') */ - if(!isdigit(*s)) - return 0; - s++; - } - return 1; -} - -static void DisplayInfos(struct UPNPUrls * urls, - struct IGDdatas * data) -{ - char externalIPAddress[40]; - char connectionType[64]; - char status[64]; - char lastconnerr[64]; - unsigned int uptime = 0; - unsigned int brUp, brDown; - time_t timenow, timestarted; - int r; - if(UPNP_GetConnectionTypeInfo(urls->controlURL, - data->first.servicetype, - connectionType) != UPNPCOMMAND_SUCCESS) - printf("GetConnectionTypeInfo failed.\n"); - else - printf("Connection Type : %s\n", connectionType); - if(UPNP_GetStatusInfo(urls->controlURL, data->first.servicetype, - status, &uptime, lastconnerr) != UPNPCOMMAND_SUCCESS) - printf("GetStatusInfo failed.\n"); - else - printf("Status : %s, uptime=%us, LastConnectionError : %s\n", - status, uptime, lastconnerr); - if(uptime > 0) { - timenow = time(NULL); - timestarted = timenow - uptime; - printf(" Time started : %s", ctime(&timestarted)); - } - if(UPNP_GetLinkLayerMaxBitRates(urls->controlURL_CIF, data->CIF.servicetype, - &brDown, &brUp) != UPNPCOMMAND_SUCCESS) { - printf("GetLinkLayerMaxBitRates failed.\n"); - } else { - printf("MaxBitRateDown : %u bps", brDown); - if(brDown >= 1000000) { - printf(" (%u.%u Mbps)", brDown / 1000000, (brDown / 100000) % 10); - } else if(brDown >= 1000) { - printf(" (%u Kbps)", brDown / 1000); - } - printf(" MaxBitRateUp %u bps", brUp); - if(brUp >= 1000000) { - printf(" (%u.%u Mbps)", brUp / 1000000, (brUp / 100000) % 10); - } else if(brUp >= 1000) { - printf(" (%u Kbps)", brUp / 1000); - } - printf("\n"); - } - r = UPNP_GetExternalIPAddress(urls->controlURL, - data->first.servicetype, - externalIPAddress); - if(r != UPNPCOMMAND_SUCCESS) { - printf("GetExternalIPAddress failed. 
(errorcode=%d)\n", r); - } else { - printf("ExternalIPAddress = %s\n", externalIPAddress); - } -} - -static void GetConnectionStatus(struct UPNPUrls * urls, - struct IGDdatas * data) -{ - unsigned int bytessent, bytesreceived, packetsreceived, packetssent; - DisplayInfos(urls, data); - bytessent = UPNP_GetTotalBytesSent(urls->controlURL_CIF, data->CIF.servicetype); - bytesreceived = UPNP_GetTotalBytesReceived(urls->controlURL_CIF, data->CIF.servicetype); - packetssent = UPNP_GetTotalPacketsSent(urls->controlURL_CIF, data->CIF.servicetype); - packetsreceived = UPNP_GetTotalPacketsReceived(urls->controlURL_CIF, data->CIF.servicetype); - printf("Bytes: Sent: %8u\tRecv: %8u\n", bytessent, bytesreceived); - printf("Packets: Sent: %8u\tRecv: %8u\n", packetssent, packetsreceived); -} - -static void ListRedirections(struct UPNPUrls * urls, - struct IGDdatas * data) -{ - int r; - int i = 0; - char index[6]; - char intClient[40]; - char intPort[6]; - char extPort[6]; - char protocol[4]; - char desc[80]; - char enabled[6]; - char rHost[64]; - char duration[16]; - /*unsigned int num=0; - UPNP_GetPortMappingNumberOfEntries(urls->controlURL, data->servicetype, &num); - printf("PortMappingNumberOfEntries : %u\n", num);*/ - printf(" i protocol exPort->inAddr:inPort description remoteHost leaseTime\n"); - do { - snprintf(index, 6, "%d", i); - rHost[0] = '\0'; enabled[0] = '\0'; - duration[0] = '\0'; desc[0] = '\0'; - extPort[0] = '\0'; intPort[0] = '\0'; intClient[0] = '\0'; - r = UPNP_GetGenericPortMappingEntry(urls->controlURL, - data->first.servicetype, - index, - extPort, intClient, intPort, - protocol, desc, enabled, - rHost, duration); - if(r==0) - /* - printf("%02d - %s %s->%s:%s\tenabled=%s leaseDuration=%s\n" - " desc='%s' rHost='%s'\n", - i, protocol, extPort, intClient, intPort, - enabled, duration, - desc, rHost); - */ - printf("%2d %s %5s->%s:%-5s '%s' '%s' %s\n", - i, protocol, extPort, intClient, intPort, - desc, rHost, duration); - else - printf("GetGenericPortMappingEntry() returned %d (%s)\n", - r, strupnperror(r)); - i++; - } while(r==0); -} - -static void NewListRedirections(struct UPNPUrls * urls, - struct IGDdatas * data) -{ - int r; - int i = 0; - struct PortMappingParserData pdata; - struct PortMapping * pm; - - memset(&pdata, 0, sizeof(struct PortMappingParserData)); - r = UPNP_GetListOfPortMappings(urls->controlURL, - data->first.servicetype, - "0", - "65535", - "TCP", - "1000", - &pdata); - if(r == UPNPCOMMAND_SUCCESS) - { - printf(" i protocol exPort->inAddr:inPort description remoteHost leaseTime\n"); - for(pm = pdata.l_head; pm != NULL; pm = pm->l_next) - { - printf("%2d %s %5hu->%s:%-5hu '%s' '%s' %u\n", - i, pm->protocol, pm->externalPort, pm->internalClient, - pm->internalPort, - pm->description, pm->remoteHost, - (unsigned)pm->leaseTime); - i++; - } - FreePortListing(&pdata); - } - else - { - printf("GetListOfPortMappings() returned %d (%s)\n", - r, strupnperror(r)); - } - r = UPNP_GetListOfPortMappings(urls->controlURL, - data->first.servicetype, - "0", - "65535", - "UDP", - "1000", - &pdata); - if(r == UPNPCOMMAND_SUCCESS) - { - for(pm = pdata.l_head; pm != NULL; pm = pm->l_next) - { - printf("%2d %s %5hu->%s:%-5hu '%s' '%s' %u\n", - i, pm->protocol, pm->externalPort, pm->internalClient, - pm->internalPort, - pm->description, pm->remoteHost, - (unsigned)pm->leaseTime); - i++; - } - FreePortListing(&pdata); - } - else - { - printf("GetListOfPortMappings() returned %d (%s)\n", - r, strupnperror(r)); - } -} - -/* Test function - * 1 - get connection type - * 2 - get extenal 
ip address - * 3 - Add port mapping - * 4 - get this port mapping from the IGD */ -static int SetRedirectAndTest(struct UPNPUrls * urls, - struct IGDdatas * data, - const char * iaddr, - const char * iport, - const char * eport, - const char * proto, - const char * leaseDuration, - const char * remoteHost, - const char * description, - int addAny) -{ - char externalIPAddress[40]; - char intClient[40]; - char intPort[6]; - char reservedPort[6]; - char duration[16]; - int r; - - if(!iaddr || !iport || !eport || !proto) - { - fprintf(stderr, "Wrong arguments\n"); - return -1; - } - proto = protofix(proto); - if(!proto) - { - fprintf(stderr, "invalid protocol\n"); - return -1; - } - - r = UPNP_GetExternalIPAddress(urls->controlURL, - data->first.servicetype, - externalIPAddress); - if(r!=UPNPCOMMAND_SUCCESS) - printf("GetExternalIPAddress failed.\n"); - else - printf("ExternalIPAddress = %s\n", externalIPAddress); - - if (addAny) { - r = UPNP_AddAnyPortMapping(urls->controlURL, data->first.servicetype, - eport, iport, iaddr, description, - proto, remoteHost, leaseDuration, reservedPort); - if(r==UPNPCOMMAND_SUCCESS) - eport = reservedPort; - else - printf("AddAnyPortMapping(%s, %s, %s) failed with code %d (%s)\n", - eport, iport, iaddr, r, strupnperror(r)); - } else { - r = UPNP_AddPortMapping(urls->controlURL, data->first.servicetype, - eport, iport, iaddr, description, - proto, remoteHost, leaseDuration); - if(r!=UPNPCOMMAND_SUCCESS) { - printf("AddPortMapping(%s, %s, %s) failed with code %d (%s)\n", - eport, iport, iaddr, r, strupnperror(r)); - return -2; - } - } - - r = UPNP_GetSpecificPortMappingEntry(urls->controlURL, - data->first.servicetype, - eport, proto, remoteHost, - intClient, intPort, NULL/*desc*/, - NULL/*enabled*/, duration); - if(r!=UPNPCOMMAND_SUCCESS) { - printf("GetSpecificPortMappingEntry() failed with code %d (%s)\n", - r, strupnperror(r)); - return -2; - } else { - printf("InternalIP:Port = %s:%s\n", intClient, intPort); - printf("external %s:%s %s is redirected to internal %s:%s (duration=%s)\n", - externalIPAddress, eport, proto, intClient, intPort, duration); - } - return 0; -} - -static int -RemoveRedirect(struct UPNPUrls * urls, - struct IGDdatas * data, - const char * eport, - const char * proto, - const char * remoteHost) -{ - int r; - if(!proto || !eport) - { - fprintf(stderr, "invalid arguments\n"); - return -1; - } - proto = protofix(proto); - if(!proto) - { - fprintf(stderr, "protocol invalid\n"); - return -1; - } - r = UPNP_DeletePortMapping(urls->controlURL, data->first.servicetype, eport, proto, remoteHost); - if(r!=UPNPCOMMAND_SUCCESS) { - printf("UPNP_DeletePortMapping() failed with code : %d\n", r); - return -2; - }else { - printf("UPNP_DeletePortMapping() returned : %d\n", r); - } - return 0; -} - -static int -RemoveRedirectRange(struct UPNPUrls * urls, - struct IGDdatas * data, - const char * ePortStart, char const * ePortEnd, - const char * proto, const char * manage) -{ - int r; - - if (!manage) - manage = "0"; - - if(!proto || !ePortStart || !ePortEnd) - { - fprintf(stderr, "invalid arguments\n"); - return -1; - } - proto = protofix(proto); - if(!proto) - { - fprintf(stderr, "protocol invalid\n"); - return -1; - } - r = UPNP_DeletePortMappingRange(urls->controlURL, data->first.servicetype, ePortStart, ePortEnd, proto, manage); - if(r!=UPNPCOMMAND_SUCCESS) { - printf("UPNP_DeletePortMappingRange() failed with code : %d\n", r); - return -2; - }else { - printf("UPNP_DeletePortMappingRange() returned : %d\n", r); - } - return 0; -} - -/* IGD:2, 
functions for service WANIPv6FirewallControl:1 */ -static void GetFirewallStatus(struct UPNPUrls * urls, struct IGDdatas * data) -{ - unsigned int bytessent, bytesreceived, packetsreceived, packetssent; - int firewallEnabled = 0, inboundPinholeAllowed = 0; - - UPNP_GetFirewallStatus(urls->controlURL_6FC, data->IPv6FC.servicetype, &firewallEnabled, &inboundPinholeAllowed); - printf("FirewallEnabled: %d & Inbound Pinhole Allowed: %d\n", firewallEnabled, inboundPinholeAllowed); - printf("GetFirewallStatus:\n Firewall Enabled: %s\n Inbound Pinhole Allowed: %s\n", (firewallEnabled)? "Yes":"No", (inboundPinholeAllowed)? "Yes":"No"); - - bytessent = UPNP_GetTotalBytesSent(urls->controlURL_CIF, data->CIF.servicetype); - bytesreceived = UPNP_GetTotalBytesReceived(urls->controlURL_CIF, data->CIF.servicetype); - packetssent = UPNP_GetTotalPacketsSent(urls->controlURL_CIF, data->CIF.servicetype); - packetsreceived = UPNP_GetTotalPacketsReceived(urls->controlURL_CIF, data->CIF.servicetype); - printf("Bytes: Sent: %8u\tRecv: %8u\n", bytessent, bytesreceived); - printf("Packets: Sent: %8u\tRecv: %8u\n", packetssent, packetsreceived); -} - -/* Test function - * 1 - Add pinhole - * 2 - Check if pinhole is working from the IGD side */ -static void SetPinholeAndTest(struct UPNPUrls * urls, struct IGDdatas * data, - const char * remoteaddr, const char * eport, - const char * intaddr, const char * iport, - const char * proto, const char * lease_time) -{ - char uniqueID[8]; - /*int isWorking = 0;*/ - int r; - char proto_tmp[8]; - - if(!intaddr || !remoteaddr || !iport || !eport || !proto || !lease_time) - { - fprintf(stderr, "Wrong arguments\n"); - return; - } - if(atoi(proto) == 0) - { - const char * protocol; - protocol = protofix(proto); - if(protocol && (strcmp("TCP", protocol) == 0)) - { - snprintf(proto_tmp, sizeof(proto_tmp), "%d", IPPROTO_TCP); - proto = proto_tmp; - } - else if(protocol && (strcmp("UDP", protocol) == 0)) - { - snprintf(proto_tmp, sizeof(proto_tmp), "%d", IPPROTO_UDP); - proto = proto_tmp; - } - else - { - fprintf(stderr, "invalid protocol\n"); - return; - } - } - r = UPNP_AddPinhole(urls->controlURL_6FC, data->IPv6FC.servicetype, remoteaddr, eport, intaddr, iport, proto, lease_time, uniqueID); - if(r!=UPNPCOMMAND_SUCCESS) - printf("AddPinhole([%s]:%s -> [%s]:%s) failed with code %d (%s)\n", - remoteaddr, eport, intaddr, iport, r, strupnperror(r)); - else - { - printf("AddPinhole: ([%s]:%s -> [%s]:%s) / Pinhole ID = %s\n", - remoteaddr, eport, intaddr, iport, uniqueID); - /*r = UPNP_CheckPinholeWorking(urls->controlURL_6FC, data->servicetype_6FC, uniqueID, &isWorking); - if(r!=UPNPCOMMAND_SUCCESS) - printf("CheckPinholeWorking() failed with code %d (%s)\n", r, strupnperror(r)); - printf("CheckPinholeWorking: Pinhole ID = %s / IsWorking = %s\n", uniqueID, (isWorking)? "Yes":"No");*/ - } -} - -/* Test function - * 1 - Check if pinhole is working from the IGD side - * 2 - Update pinhole */ -static void GetPinholeAndUpdate(struct UPNPUrls * urls, struct IGDdatas * data, - const char * uniqueID, const char * lease_time) -{ - int isWorking = 0; - int r; - - if(!uniqueID || !lease_time) - { - fprintf(stderr, "Wrong arguments\n"); - return; - } - r = UPNP_CheckPinholeWorking(urls->controlURL_6FC, data->IPv6FC.servicetype, uniqueID, &isWorking); - printf("CheckPinholeWorking: Pinhole ID = %s / IsWorking = %s\n", uniqueID, (isWorking)? 
"Yes":"No"); - if(r!=UPNPCOMMAND_SUCCESS) - printf("CheckPinholeWorking() failed with code %d (%s)\n", r, strupnperror(r)); - if(isWorking || r==709) - { - r = UPNP_UpdatePinhole(urls->controlURL_6FC, data->IPv6FC.servicetype, uniqueID, lease_time); - printf("UpdatePinhole: Pinhole ID = %s with Lease Time: %s\n", uniqueID, lease_time); - if(r!=UPNPCOMMAND_SUCCESS) - printf("UpdatePinhole: ID (%s) failed with code %d (%s)\n", uniqueID, r, strupnperror(r)); - } -} - -/* Test function - * Get pinhole timeout - */ -static void GetPinholeOutboundTimeout(struct UPNPUrls * urls, struct IGDdatas * data, - const char * remoteaddr, const char * eport, - const char * intaddr, const char * iport, - const char * proto) -{ - int timeout = 0; - int r; - - if(!intaddr || !remoteaddr || !iport || !eport || !proto) - { - fprintf(stderr, "Wrong arguments\n"); - return; - } - - r = UPNP_GetOutboundPinholeTimeout(urls->controlURL_6FC, data->IPv6FC.servicetype, remoteaddr, eport, intaddr, iport, proto, &timeout); - if(r!=UPNPCOMMAND_SUCCESS) - printf("GetOutboundPinholeTimeout([%s]:%s -> [%s]:%s) failed with code %d (%s)\n", - intaddr, iport, remoteaddr, eport, r, strupnperror(r)); - else - printf("GetOutboundPinholeTimeout: ([%s]:%s -> [%s]:%s) / Timeout = %d\n", intaddr, iport, remoteaddr, eport, timeout); -} - -static void -GetPinholePackets(struct UPNPUrls * urls, - struct IGDdatas * data, const char * uniqueID) -{ - int r, pinholePackets = 0; - if(!uniqueID) - { - fprintf(stderr, "invalid arguments\n"); - return; - } - r = UPNP_GetPinholePackets(urls->controlURL_6FC, data->IPv6FC.servicetype, uniqueID, &pinholePackets); - if(r!=UPNPCOMMAND_SUCCESS) - printf("GetPinholePackets() failed with code %d (%s)\n", r, strupnperror(r)); - else - printf("GetPinholePackets: Pinhole ID = %s / PinholePackets = %d\n", uniqueID, pinholePackets); -} - -static void -CheckPinhole(struct UPNPUrls * urls, - struct IGDdatas * data, const char * uniqueID) -{ - int r, isWorking = 0; - if(!uniqueID) - { - fprintf(stderr, "invalid arguments\n"); - return; - } - r = UPNP_CheckPinholeWorking(urls->controlURL_6FC, data->IPv6FC.servicetype, uniqueID, &isWorking); - if(r!=UPNPCOMMAND_SUCCESS) - printf("CheckPinholeWorking() failed with code %d (%s)\n", r, strupnperror(r)); - else - printf("CheckPinholeWorking: Pinhole ID = %s / IsWorking = %s\n", uniqueID, (isWorking)? 
"Yes":"No"); -} - -static void -RemovePinhole(struct UPNPUrls * urls, - struct IGDdatas * data, const char * uniqueID) -{ - int r; - if(!uniqueID) - { - fprintf(stderr, "invalid arguments\n"); - return; - } - r = UPNP_DeletePinhole(urls->controlURL_6FC, data->IPv6FC.servicetype, uniqueID); - printf("UPNP_DeletePinhole() returned : %d\n", r); -} - - -/* sample upnp client program */ -int main(int argc, char ** argv) -{ - char command = 0; - char ** commandargv = 0; - int commandargc = 0; - struct UPNPDev * devlist = 0; - char lanaddr[64] = "unset"; /* my ip address on the LAN */ - int i; - const char * rootdescurl = 0; - const char * multicastif = 0; - const char * minissdpdpath = 0; - int localport = UPNP_LOCAL_PORT_ANY; - int retcode = 0; - int error = 0; - int ipv6 = 0; - unsigned char ttl = 2; /* defaulting to 2 */ - const char * description = 0; - -#ifdef _WIN32 - WSADATA wsaData; - int nResult = WSAStartup(MAKEWORD(2,2), &wsaData); - if(nResult != NO_ERROR) - { - fprintf(stderr, "WSAStartup() failed.\n"); - return -1; - } -#endif - printf("upnpc : miniupnpc library test client, version %s.\n", MINIUPNPC_VERSION_STRING); - printf(" (c) 2005-2020 Thomas Bernard.\n"); - printf("Go to http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/\n" - "for more information.\n"); - /* command line processing */ - for(i=1; i<argc; i++) - { - if(0 == strcmp(argv[i], "--help") || 0 == strcmp(argv[i], "-h")) - { - command = 0; - break; - } - if(argv[i][0] == '-') - { - if(argv[i][1] == 'u') - rootdescurl = argv[++i]; - else if(argv[i][1] == 'm') - { - multicastif = argv[++i]; - minissdpdpath = ""; /* Disable usage of minissdpd */ - } - else if(argv[i][1] == 'z') - { - char junk; - if(sscanf(argv[++i], "%d%c", &localport, &junk)!=1 || - localport<0 || localport>65535 || - (localport >1 && localport < 1024)) - { - fprintf(stderr, "Invalid localport '%s'\n", argv[i]); - localport = UPNP_LOCAL_PORT_ANY; - break; - } - } - else if(argv[i][1] == 'p') - minissdpdpath = argv[++i]; - else if(argv[i][1] == '6') - ipv6 = 1; - else if(argv[i][1] == 'e') - description = argv[++i]; - else if(argv[i][1] == 't') - ttl = (unsigned char)atoi(argv[++i]); - else - { - command = argv[i][1]; - i++; - commandargv = argv + i; - commandargc = argc - i; - break; - } - } - else - { - fprintf(stderr, "option '%s' invalid\n", argv[i]); - } - } - - if(!command - || (command == 'a' && commandargc<4) - || (command == 'd' && argc<2) - || (command == 'r' && argc<2) - || (command == 'A' && commandargc<6) - || (command == 'U' && commandargc<2) - || (command == 'D' && commandargc<1)) - { - fprintf(stderr, "Usage :\t%s [options] -a ip port external_port protocol [duration] [remote host]\n\t\tAdd port redirection\n", argv[0]); - fprintf(stderr, " \t%s [options] -d external_port protocol [remote host]\n\t\tDelete port redirection\n", argv[0]); - fprintf(stderr, " \t%s [options] -s\n\t\tGet Connection status\n", argv[0]); - fprintf(stderr, " \t%s [options] -l\n\t\tList redirections\n", argv[0]); - fprintf(stderr, " \t%s [options] -L\n\t\tList redirections (using GetListOfPortMappings (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -n ip port external_port protocol [duration] [remote host]\n\t\tAdd (any) port redirection allowing IGD to use alternative external_port (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -N external_port_start external_port_end protocol [manage]\n\t\tDelete range of port redirections (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -r port1 [external_port1] 
protocol1 [port2 [external_port2] protocol2] [...]\n\t\tAdd all redirections to the current host\n", argv[0]); - fprintf(stderr, " \t%s [options] -A remote_ip remote_port internal_ip internal_port protocol lease_time\n\t\tAdd Pinhole (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -U uniqueID new_lease_time\n\t\tUpdate Pinhole (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -C uniqueID\n\t\tCheck if Pinhole is Working (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -K uniqueID\n\t\tGet Number of packets going through the rule (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -D uniqueID\n\t\tDelete Pinhole (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -S\n\t\tGet Firewall status (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -G remote_ip remote_port internal_ip internal_port protocol\n\t\tGet Outbound Pinhole Timeout (for IGD:2 only)\n", argv[0]); - fprintf(stderr, " \t%s [options] -P\n\t\tGet Presentation url\n", argv[0]); - fprintf(stderr, "\nprotocol is UDP or TCP\n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -e description : set description for port mapping.\n"); - fprintf(stderr, " -6 : use ip v6 instead of ip v4.\n"); - fprintf(stderr, " -u url : bypass discovery process by providing the XML root description url.\n"); - fprintf(stderr, " -m address/interface : provide ip address (ip v4) or interface name (ip v4 or v6) to use for sending SSDP multicast packets.\n"); - fprintf(stderr, " -z localport : SSDP packets local (source) port (1024-65535).\n"); - fprintf(stderr, " -p path : use this path for MiniSSDPd socket.\n"); - fprintf(stderr, " -t ttl : set multicast TTL. Default value is 2.\n"); - return 1; - } - - if( rootdescurl - || (devlist = upnpDiscover(2000, multicastif, minissdpdpath, - localport, ipv6, ttl, &error))) - { - struct UPNPDev * device; - struct UPNPUrls urls; - struct IGDdatas data; - if(devlist) - { - printf("List of UPNP devices found on the network :\n"); - for(device = devlist; device; device = device->pNext) - { - printf(" desc: %s\n st: %s\n\n", - device->descURL, device->st); - } - } - else if(!rootdescurl) - { - printf("upnpDiscover() error code=%d\n", error); - } - i = 1; - if( (rootdescurl && UPNP_GetIGDFromUrl(rootdescurl, &urls, &data, lanaddr, sizeof(lanaddr))) - || (i = UPNP_GetValidIGD(devlist, &urls, &data, lanaddr, sizeof(lanaddr)))) - { - switch(i) { - case 1: - printf("Found valid IGD : %s\n", urls.controlURL); - break; - case 2: - printf("Found a (not connected?) IGD : %s\n", urls.controlURL); - printf("Trying to continue anyway\n"); - break; - case 3: - printf("UPnP device found. Is it an IGD ? : %s\n", urls.controlURL); - printf("Trying to continue anyway\n"); - break; - default: - printf("Found device (igd ?) 
: %s\n", urls.controlURL); - printf("Trying to continue anyway\n"); - } - printf("Local LAN ip address : %s\n", lanaddr); - #if 0 - printf("getting \"%s\"\n", urls.ipcondescURL); - descXML = miniwget(urls.ipcondescURL, &descXMLsize); - if(descXML) - { - /*fwrite(descXML, 1, descXMLsize, stdout);*/ - free(descXML); descXML = NULL; - } - #endif - - switch(command) - { - case 'l': - DisplayInfos(&urls, &data); - ListRedirections(&urls, &data); - break; - case 'L': - NewListRedirections(&urls, &data); - break; - case 'a': - if (SetRedirectAndTest(&urls, &data, - commandargv[0], commandargv[1], - commandargv[2], commandargv[3], - (commandargc > 4)&is_int(commandargv[4])?commandargv[4]:"0", - (commandargc > 4)&!is_int(commandargv[4])?commandargv[4]:(commandargc > 5)?commandargv[5]:NULL, - description, 0) < 0) - retcode = 2; - break; - case 'd': - if (RemoveRedirect(&urls, &data, commandargv[0], commandargv[1], - commandargc > 2 ? commandargv[2] : NULL) < 0) - retcode = 2; - break; - case 'n': /* aNy */ - if (SetRedirectAndTest(&urls, &data, - commandargv[0], commandargv[1], - commandargv[2], commandargv[3], - (commandargc > 4)&is_int(commandargv[4])?commandargv[4]:"0", - (commandargc > 4)&!is_int(commandargv[4])?commandargv[4]:(commandargc > 5)?commandargv[5]:NULL, - description, 1) < 0) - retcode = 2; - break; - case 'N': - if (commandargc < 3) - fprintf(stderr, "too few arguments\n"); - - if (RemoveRedirectRange(&urls, &data, commandargv[0], commandargv[1], commandargv[2], - commandargc > 3 ? commandargv[3] : NULL) < 0) - retcode = 2; - break; - case 's': - GetConnectionStatus(&urls, &data); - break; - case 'r': - i = 0; - while(i<commandargc) - { - if(!is_int(commandargv[i])) { - /* 1st parameter not an integer : error */ - fprintf(stderr, "command -r : %s is not an port number\n", commandargv[i]); - retcode = 1; - break; - } else if(is_int(commandargv[i+1])){ - /* 2nd parameter is an integer : <port> <external_port> <protocol> */ - if (SetRedirectAndTest(&urls, &data, - lanaddr, commandargv[i], - commandargv[i+1], commandargv[i+2], "0", NULL, - description, 0) < 0) - retcode = 2; - i+=3; /* 3 parameters parsed */ - } else { - /* 2nd parameter not an integer : <port> <protocol> */ - if (SetRedirectAndTest(&urls, &data, - lanaddr, commandargv[i], - commandargv[i], commandargv[i+1], "0", NULL, - description, 0) < 0) - retcode = 2; - i+=2; /* 2 parameters parsed */ - } - } - break; - case 'A': - SetPinholeAndTest(&urls, &data, - commandargv[0], commandargv[1], - commandargv[2], commandargv[3], - commandargv[4], commandargv[5]); - break; - case 'U': - GetPinholeAndUpdate(&urls, &data, - commandargv[0], commandargv[1]); - break; - case 'C': - for(i=0; i<commandargc; i++) - { - CheckPinhole(&urls, &data, commandargv[i]); - } - break; - case 'K': - for(i=0; i<commandargc; i++) - { - GetPinholePackets(&urls, &data, commandargv[i]); - } - break; - case 'D': - for(i=0; i<commandargc; i++) - { - RemovePinhole(&urls, &data, commandargv[i]); - } - break; - case 'S': - GetFirewallStatus(&urls, &data); - break; - case 'G': - GetPinholeOutboundTimeout(&urls, &data, - commandargv[0], commandargv[1], - commandargv[2], commandargv[3], - commandargv[4]); - break; - case 'P': - printf("Presentation URL found:\n"); - printf(" %s\n", data.presentationurl); - break; - default: - fprintf(stderr, "Unknown switch -%c\n", command); - retcode = 1; - } - - FreeUPNPUrls(&urls); - } - else - { - fprintf(stderr, "No valid UPNP Internet Gateway Device found.\n"); - retcode = 1; - } - freeUPNPDevlist(devlist); devlist = 0; - } 
- else - { - fprintf(stderr, "No IGD UPnP Device found on the network !\n"); - retcode = 1; - } -#ifdef _WIN32 - nResult = WSACleanup(); - if(nResult != NO_ERROR) { - fprintf(stderr, "WSACleanup() failed.\n"); - } -#endif /* _WIN32 */ - return retcode; -} - diff --git a/thirdparty/miniupnpc/miniupnpc/upnpdev.h b/thirdparty/miniupnpc/miniupnpc/upnpdev.h index f4ae174426..9b2cb431ba 100644 --- a/thirdparty/miniupnpc/miniupnpc/upnpdev.h +++ b/thirdparty/miniupnpc/miniupnpc/upnpdev.h @@ -1,8 +1,8 @@ -/* $Id: upnpdev.h,v 1.1 2015/08/28 12:14:19 nanard Exp $ */ +/* $Id: upnpdev.h,v 1.3 2020/05/29 15:57:42 nanard Exp $ */ /* Project : miniupnp - * Web : http://miniupnp.free.fr/ + * Web : http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ * Author : Thomas BERNARD - * copyright (c) 2005-2018 Thomas Bernard + * copyright (c) 2005-2020 Thomas Bernard * This software is subjet to the conditions detailed in the * provided LICENSE file. */ #ifndef UPNPDEV_H_INCLUDED @@ -20,7 +20,15 @@ struct UPNPDev { char * st; char * usn; unsigned int scope_id; - char buffer[3]; +#if defined(__STDC_VERSION) && __STDC_VERSION__ >= 199901L + /* C99 flexible array member */ + char buffer[]; +#elif defined(__GNUC__) + char buffer[0]; +#else + /* Fallback to a hack */ + char buffer[1]; +#endif }; /* freeUPNPDevlist() diff --git a/thirdparty/miniupnpc/miniupnpc/upnperrors.c b/thirdparty/miniupnpc/miniupnpc/upnperrors.c deleted file mode 100644 index 4496e8622c..0000000000 --- a/thirdparty/miniupnpc/miniupnpc/upnperrors.c +++ /dev/null @@ -1,112 +0,0 @@ -/* $Id: upnperrors.c,v 1.10 2019/08/24 08:49:53 nanard Exp $ */ -/* vim: tabstop=4 shiftwidth=4 noexpandtab - * Project : miniupnp - * Author : Thomas BERNARD - * copyright (c) 2007-2019 Thomas Bernard - * All Right reserved. - * http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ - * This software is subjet to the conditions detailed in the - * provided LICENCE file. 
*/ -#include <string.h> -#include "upnperrors.h" -#include "upnpcommands.h" -#include "miniupnpc.h" - -const char * strupnperror(int err) -{ - const char * s = NULL; - switch(err) { - case UPNPCOMMAND_SUCCESS: - s = "Success"; - break; - case UPNPCOMMAND_UNKNOWN_ERROR: - s = "Miniupnpc Unknown Error"; - break; - case UPNPCOMMAND_INVALID_ARGS: - s = "Miniupnpc Invalid Arguments"; - break; - case UPNPCOMMAND_INVALID_RESPONSE: - s = "Miniupnpc Invalid response"; - break; - case UPNPCOMMAND_HTTP_ERROR: - s = "Miniupnpc HTTP error"; - break; - case UPNPDISCOVER_SOCKET_ERROR: - s = "Miniupnpc Socket error"; - break; - case UPNPDISCOVER_MEMORY_ERROR: - case UPNPCOMMAND_MEM_ALLOC_ERROR: - s = "Miniupnpc Memory allocation error"; - break; - case 401: - s = "Invalid Action"; - break; - case 402: - s = "Invalid Args"; - break; - case 501: - s = "Action Failed"; - break; - case 606: - s = "Action not authorized"; - break; - case 701: - s = "PinholeSpaceExhausted"; - break; - case 702: - s = "FirewallDisabled"; - break; - case 703: - s = "InboundPinholeNotAllowed"; - break; - case 704: - s = "NoSuchEntry"; - break; - case 705: - s = "ProtocolNotSupported"; - break; - case 706: - s = "InternalPortWildcardingNotAllowed"; - break; - case 707: - s = "ProtocolWildcardingNotAllowed"; - break; - case 708: - s = "InvalidLayer2Address"; - break; - case 709: - s = "NoPacketSent"; - break; - case 713: - s = "SpecifiedArrayIndexInvalid"; - break; - case 714: - s = "NoSuchEntryInArray"; - break; - case 715: - s = "WildCardNotPermittedInSrcIP"; - break; - case 716: - s = "WildCardNotPermittedInExtPort"; - break; - case 718: - s = "ConflictInMappingEntry"; - break; - case 724: - s = "SamePortValuesRequired"; - break; - case 725: - s = "OnlyPermanentLeasesSupported"; - break; - case 726: - s = "RemoteHostOnlySupportsWildcard"; - break; - case 727: - s = "ExternalPortOnlySupportsWildcard"; - break; - default: - s = "UnknownError"; - break; - } - return s; -} diff --git a/thirdparty/miniupnpc/miniupnpc/upnperrors.h b/thirdparty/miniupnpc/miniupnpc/upnperrors.h deleted file mode 100644 index 8499d9a1c9..0000000000 --- a/thirdparty/miniupnpc/miniupnpc/upnperrors.h +++ /dev/null @@ -1,26 +0,0 @@ -/* $Id: upnperrors.h,v 1.2 2008/07/02 23:31:15 nanard Exp $ */ -/* (c) 2007-2015 Thomas Bernard - * All rights reserved. - * MiniUPnP Project. - * http://miniupnp.free.fr/ or http://miniupnp.tuxfamily.org/ - * This software is subjet to the conditions detailed in the - * provided LICENCE file. 
*/ -#ifndef UPNPERRORS_H_INCLUDED -#define UPNPERRORS_H_INCLUDED - -#include "miniupnpc_declspec.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* strupnperror() - * Return a string description of the UPnP error code - * or NULL for undefinded errors */ -MINIUPNP_LIBSPEC const char * strupnperror(int err); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/thirdparty/miniupnpc/miniupnpc/win32_snprintf.h b/thirdparty/miniupnpc/miniupnpc/win32_snprintf.h new file mode 100644 index 0000000000..1fc284ecff --- /dev/null +++ b/thirdparty/miniupnpc/miniupnpc/win32_snprintf.h @@ -0,0 +1,71 @@ +/* vim: tabstop=4 shiftwidth=4 noexpandtab + * MiniUPnP project + * http://miniupnp.free.fr/ or https://miniupnp.tuxfamily.org/ + * (c) 2020 Pali Rohár + * This software is subject to the conditions detailed + * in the LICENCE file provided within the distribution */ + +#ifndef WIN32_SNPRINTF_H +#define WIN32_SNPRINTF_H + +#ifdef _WIN32 + +#include <stdio.h> + +/* snprintf is supported by: + * - Visual Studio 2015 or new + * - mingw32 with iso c ext + * - mingw-w64 with ansi stdio + * - mingw-w64 6.0.0 or new with ucrt + * - mingw-w64 8.0.0 or new with iso c ext + */ +#if ( \ + (defined(_MSC_VER) && _MSC_VER < 1900) /* Visual Studio older than 2015 */ || \ + (defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR) && defined(__NO_ISOCEXT)) /* mingw32 without iso c ext */ || \ + (defined(__MINGW64_VERSION_MAJOR) && /* mingw-w64 not ... */ !( \ + (defined (__USE_MINGW_ANSI_STDIO) && __USE_MINGW_ANSI_STDIO != 0)) /* ... with ansi stdio */ || \ + (__MINGW64_VERSION_MAJOR >= 6 && defined(_UCRT)) /* ... at least 6.0.0 with ucrt */ || \ + (__MINGW64_VERSION_MAJOR >= 8 && !defined(__NO_ISOCEXT)) /* ... at least 8.0.0 with iso c ext */ || \ + 0) || \ +0) + +/* _scprintf is supported by: + * - Visual Studio 2002 or new + * - msvcr70.dll or new + * - msvcrt.dll on Windows XP or new + */ +#if ( \ + (defined(_MSC_VER) && _MSC_VER < 1300) /* Visual Studio older than 2002 */ || \ + (defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ < 0x700) /* msvcrt older than 7.0 */ || \ +0) +#define CHECK_SCPRINTF 0 +#define IF_SCPRINTF(expr) 0 +#define ELSE_SCPRINTF(expr) expr +#else +#define CHECK_SCPRINTF 1 +#define IF_SCPRINTF(expr) expr +#define ELSE_SCPRINTF(expr) 0 +#endif + +/* Emulation of snprintf for win32 */ +#define snprintf(buf, size, fmt, ...) ( \ + (((size) != 0 && (buf) != NULL) ? ( /* _snprintf does not work with NULL buffer */ \ + _snprintf((buf), (size), (fmt), __VA_ARGS__), /* _snprintf returns -1 on overflow, so ignore its value */ \ + (((char *)buf)[(size_t)(size)-1] = 0), /* _snprintf does not fill nul byte on overflow */ \ + 0) : 0), \ + (CHECK_SCPRINTF ? IF_SCPRINTF( \ + _scprintf((fmt), __VA_ARGS__) /* calculate return value for snprintf via _scprintf */ \ + ) : ELSE_SCPRINTF( \ + ((size) != 0 && (buf) != NULL) ? 
\ + strlen((buf)) /* return just length of buffer */ \ + : \ + 1 /* no buffer, impossible to calculate, return just non-zero number */ \ + ) \ + ) \ +) + +#endif + +#endif /* _WIN32 */ + +#endif /* WIN32_SNPRINTF_H */ diff --git a/thirdparty/miniupnpc/windows_fix.diff b/thirdparty/miniupnpc/windows_fix.diff deleted file mode 100644 index 460b596888..0000000000 --- a/thirdparty/miniupnpc/windows_fix.diff +++ /dev/null @@ -1,16 +0,0 @@ -diff --git a/thirdparty/miniupnpc/miniupnpc/minissdpc.c b/thirdparty/miniupnpc/miniupnpc/minissdpc.c -index 29f8110155..ea9af02e1f 100644 ---- a/thirdparty/miniupnpc/miniupnpc/minissdpc.c -+++ b/thirdparty/miniupnpc/miniupnpc/minissdpc.c -@@ -683,11 +683,7 @@ ssdpDiscoverDevices(const char * const deviceTypes[], - #endif - } else { - struct in_addr mc_if; --#if defined(_WIN32) && (_WIN32_WINNT >= _WIN32_WINNT_VISTA) -- InetPtonA(AF_INET, multicastif, &mc_if); --#else - mc_if.s_addr = inet_addr(multicastif); /* ex: 192.168.x.x */ --#endif - if(mc_if.s_addr != INADDR_NONE) - { - ((struct sockaddr_in *)&sockudp_r)->sin_addr.s_addr = mc_if.s_addr; diff --git a/thirdparty/misc/patches/polypartition-godot-types.patch b/thirdparty/misc/patches/polypartition-godot-types.patch index 59fdb2707c..782f02e8dc 100644 --- a/thirdparty/misc/patches/polypartition-godot-types.patch +++ b/thirdparty/misc/patches/polypartition-godot-types.patch @@ -1,5 +1,5 @@ diff --git a/thirdparty/misc/polypartition.cpp b/thirdparty/misc/polypartition.cpp -index 3a8a6efa8319..4f1b6dcb21d8 100644 +index 3a8a6efa83..5e94793b79 100644 --- a/thirdparty/misc/polypartition.cpp +++ b/thirdparty/misc/polypartition.cpp @@ -23,10 +23,7 @@ @@ -510,7 +510,7 @@ index 3a8a6efa8319..4f1b6dcb21d8 100644 - return 0; - } - numvertices += iter->GetNumPoints(); -+ for (iter = inpolys->front(); iter; iter++) { ++ for (iter = inpolys->front(); iter; iter = iter->next()) { + numvertices += iter->get().GetNumPoints(); } @@ -521,7 +521,7 @@ index 3a8a6efa8319..4f1b6dcb21d8 100644 polystartindex = 0; - for (iter = inpolys->begin(); iter != inpolys->end(); iter++) { - poly = &(*iter); -+ for (iter = inpolys->front(); iter; iter++) { ++ for (iter = inpolys->front(); iter; iter = iter->next()) { + poly = &(iter->get()); polyendindex = polystartindex + poly->GetNumPoints() - 1; for (i = 0; i < poly->GetNumPoints(); i++) { @@ -569,7 +569,7 @@ index 3a8a6efa8319..4f1b6dcb21d8 100644 newedge.p2 = v->p; edgeIter = edgeTree.lower_bound(newedge); - if (edgeIter == edgeTree.begin()) { -+ if (edgeIter == edgeTree.front()) { ++ if (edgeIter == nullptr || edgeIter == edgeTree.front()) { error = true; break; } @@ -606,7 +606,7 @@ index 3a8a6efa8319..4f1b6dcb21d8 100644 newedge.p2 = v->p; edgeIter = edgeTree.lower_bound(newedge); - if (edgeIter == edgeTree.begin()) { -+ if (edgeIter == edgeTree.front()) { ++ if (edgeIter == nullptr || edgeIter == edgeTree.front()) { error = true; break; } @@ -648,7 +648,7 @@ index 3a8a6efa8319..4f1b6dcb21d8 100644 newedge.p2 = v->p; edgeIter = edgeTree.lower_bound(newedge); - if (edgeIter == edgeTree.begin()) { -+ if (edgeIter == edgeTree.front()) { ++ if (edgeIter == nullptr || edgeIter == edgeTree.front()) { error = true; break; } @@ -716,7 +716,7 @@ index 3a8a6efa8319..4f1b6dcb21d8 100644 } } diff --git a/thirdparty/misc/polypartition.h b/thirdparty/misc/polypartition.h -index f163f5d2173f..b2d905a3ef76 100644 +index f163f5d217..b2d905a3ef 100644 --- a/thirdparty/misc/polypartition.h +++ b/thirdparty/misc/polypartition.h @@ -24,8 +24,9 @@ diff --git a/thirdparty/misc/polypartition.cpp 
b/thirdparty/misc/polypartition.cpp index 4f1b6dcb21..5e94793b79 100644 --- a/thirdparty/misc/polypartition.cpp +++ b/thirdparty/misc/polypartition.cpp @@ -1289,7 +1289,7 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto bool error = false; numvertices = 0; - for (iter = inpolys->front(); iter; iter++) { + for (iter = inpolys->front(); iter; iter = iter->next()) { numvertices += iter->get().GetNumPoints(); } @@ -1298,7 +1298,7 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto newnumvertices = numvertices; polystartindex = 0; - for (iter = inpolys->front(); iter; iter++) { + for (iter = inpolys->front(); iter; iter = iter->next()) { poly = &(iter->get()); polyendindex = polystartindex + poly->GetNumPoints() - 1; for (i = 0; i < poly->GetNumPoints(); i++) { @@ -1408,7 +1408,7 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto newedge.p1 = v->p; newedge.p2 = v->p; edgeIter = edgeTree.lower_bound(newedge); - if (edgeIter == edgeTree.front()) { + if (edgeIter == nullptr || edgeIter == edgeTree.front()) { error = true; break; } @@ -1449,7 +1449,7 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto newedge.p1 = v->p; newedge.p2 = v->p; edgeIter = edgeTree.lower_bound(newedge); - if (edgeIter == edgeTree.front()) { + if (edgeIter == nullptr || edgeIter == edgeTree.front()) { error = true; break; } @@ -1494,7 +1494,7 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto newedge.p1 = v->p; newedge.p2 = v->p; edgeIter = edgeTree.lower_bound(newedge); - if (edgeIter == edgeTree.front()) { + if (edgeIter == nullptr || edgeIter == edgeTree.front()) { error = true; break; } diff --git a/thirdparty/misc/smolv.cpp b/thirdparty/misc/smolv.cpp new file mode 100644 index 0000000000..26ed7294f9 --- /dev/null +++ b/thirdparty/misc/smolv.cpp @@ -0,0 +1,2108 @@ +// smol-v - public domain - https://github.com/aras-p/smol-v +// authored 2016-2020 by Aras Pranckevicius +// no warranty implied; use at your own risk +// See end of file for license information. + +#include "smolv.h" +#include <stdint.h> +#include <vector> +#include <algorithm> +#include <cstdio> +#include <cstring> + +#if !defined(_MSC_VER) && __cplusplus < 201103L +#define static_assert(x,y) +#endif + +#define _SMOLV_ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0])) + +// -------------------------------------------------------------------------------------------- +// Metadata about known SPIR-V operations + +enum SpvOp +{ + SpvOpNop = 0, + SpvOpUndef = 1, + SpvOpSourceContinued = 2, + SpvOpSource = 3, + SpvOpSourceExtension = 4, + SpvOpName = 5, + SpvOpMemberName = 6, + SpvOpString = 7, + SpvOpLine = 8, + SpvOpExtension = 10, + SpvOpExtInstImport = 11, + SpvOpExtInst = 12, + SpvOpVectorShuffleCompact = 13, // not in SPIR-V, added for SMOL-V! 
+ SpvOpMemoryModel = 14, + SpvOpEntryPoint = 15, + SpvOpExecutionMode = 16, + SpvOpCapability = 17, + SpvOpTypeVoid = 19, + SpvOpTypeBool = 20, + SpvOpTypeInt = 21, + SpvOpTypeFloat = 22, + SpvOpTypeVector = 23, + SpvOpTypeMatrix = 24, + SpvOpTypeImage = 25, + SpvOpTypeSampler = 26, + SpvOpTypeSampledImage = 27, + SpvOpTypeArray = 28, + SpvOpTypeRuntimeArray = 29, + SpvOpTypeStruct = 30, + SpvOpTypeOpaque = 31, + SpvOpTypePointer = 32, + SpvOpTypeFunction = 33, + SpvOpTypeEvent = 34, + SpvOpTypeDeviceEvent = 35, + SpvOpTypeReserveId = 36, + SpvOpTypeQueue = 37, + SpvOpTypePipe = 38, + SpvOpTypeForwardPointer = 39, + SpvOpConstantTrue = 41, + SpvOpConstantFalse = 42, + SpvOpConstant = 43, + SpvOpConstantComposite = 44, + SpvOpConstantSampler = 45, + SpvOpConstantNull = 46, + SpvOpSpecConstantTrue = 48, + SpvOpSpecConstantFalse = 49, + SpvOpSpecConstant = 50, + SpvOpSpecConstantComposite = 51, + SpvOpSpecConstantOp = 52, + SpvOpFunction = 54, + SpvOpFunctionParameter = 55, + SpvOpFunctionEnd = 56, + SpvOpFunctionCall = 57, + SpvOpVariable = 59, + SpvOpImageTexelPointer = 60, + SpvOpLoad = 61, + SpvOpStore = 62, + SpvOpCopyMemory = 63, + SpvOpCopyMemorySized = 64, + SpvOpAccessChain = 65, + SpvOpInBoundsAccessChain = 66, + SpvOpPtrAccessChain = 67, + SpvOpArrayLength = 68, + SpvOpGenericPtrMemSemantics = 69, + SpvOpInBoundsPtrAccessChain = 70, + SpvOpDecorate = 71, + SpvOpMemberDecorate = 72, + SpvOpDecorationGroup = 73, + SpvOpGroupDecorate = 74, + SpvOpGroupMemberDecorate = 75, + SpvOpVectorExtractDynamic = 77, + SpvOpVectorInsertDynamic = 78, + SpvOpVectorShuffle = 79, + SpvOpCompositeConstruct = 80, + SpvOpCompositeExtract = 81, + SpvOpCompositeInsert = 82, + SpvOpCopyObject = 83, + SpvOpTranspose = 84, + SpvOpSampledImage = 86, + SpvOpImageSampleImplicitLod = 87, + SpvOpImageSampleExplicitLod = 88, + SpvOpImageSampleDrefImplicitLod = 89, + SpvOpImageSampleDrefExplicitLod = 90, + SpvOpImageSampleProjImplicitLod = 91, + SpvOpImageSampleProjExplicitLod = 92, + SpvOpImageSampleProjDrefImplicitLod = 93, + SpvOpImageSampleProjDrefExplicitLod = 94, + SpvOpImageFetch = 95, + SpvOpImageGather = 96, + SpvOpImageDrefGather = 97, + SpvOpImageRead = 98, + SpvOpImageWrite = 99, + SpvOpImage = 100, + SpvOpImageQueryFormat = 101, + SpvOpImageQueryOrder = 102, + SpvOpImageQuerySizeLod = 103, + SpvOpImageQuerySize = 104, + SpvOpImageQueryLod = 105, + SpvOpImageQueryLevels = 106, + SpvOpImageQuerySamples = 107, + SpvOpConvertFToU = 109, + SpvOpConvertFToS = 110, + SpvOpConvertSToF = 111, + SpvOpConvertUToF = 112, + SpvOpUConvert = 113, + SpvOpSConvert = 114, + SpvOpFConvert = 115, + SpvOpQuantizeToF16 = 116, + SpvOpConvertPtrToU = 117, + SpvOpSatConvertSToU = 118, + SpvOpSatConvertUToS = 119, + SpvOpConvertUToPtr = 120, + SpvOpPtrCastToGeneric = 121, + SpvOpGenericCastToPtr = 122, + SpvOpGenericCastToPtrExplicit = 123, + SpvOpBitcast = 124, + SpvOpSNegate = 126, + SpvOpFNegate = 127, + SpvOpIAdd = 128, + SpvOpFAdd = 129, + SpvOpISub = 130, + SpvOpFSub = 131, + SpvOpIMul = 132, + SpvOpFMul = 133, + SpvOpUDiv = 134, + SpvOpSDiv = 135, + SpvOpFDiv = 136, + SpvOpUMod = 137, + SpvOpSRem = 138, + SpvOpSMod = 139, + SpvOpFRem = 140, + SpvOpFMod = 141, + SpvOpVectorTimesScalar = 142, + SpvOpMatrixTimesScalar = 143, + SpvOpVectorTimesMatrix = 144, + SpvOpMatrixTimesVector = 145, + SpvOpMatrixTimesMatrix = 146, + SpvOpOuterProduct = 147, + SpvOpDot = 148, + SpvOpIAddCarry = 149, + SpvOpISubBorrow = 150, + SpvOpUMulExtended = 151, + SpvOpSMulExtended = 152, + SpvOpAny = 154, + SpvOpAll = 155, + SpvOpIsNan = 156, + 
SpvOpIsInf = 157, + SpvOpIsFinite = 158, + SpvOpIsNormal = 159, + SpvOpSignBitSet = 160, + SpvOpLessOrGreater = 161, + SpvOpOrdered = 162, + SpvOpUnordered = 163, + SpvOpLogicalEqual = 164, + SpvOpLogicalNotEqual = 165, + SpvOpLogicalOr = 166, + SpvOpLogicalAnd = 167, + SpvOpLogicalNot = 168, + SpvOpSelect = 169, + SpvOpIEqual = 170, + SpvOpINotEqual = 171, + SpvOpUGreaterThan = 172, + SpvOpSGreaterThan = 173, + SpvOpUGreaterThanEqual = 174, + SpvOpSGreaterThanEqual = 175, + SpvOpULessThan = 176, + SpvOpSLessThan = 177, + SpvOpULessThanEqual = 178, + SpvOpSLessThanEqual = 179, + SpvOpFOrdEqual = 180, + SpvOpFUnordEqual = 181, + SpvOpFOrdNotEqual = 182, + SpvOpFUnordNotEqual = 183, + SpvOpFOrdLessThan = 184, + SpvOpFUnordLessThan = 185, + SpvOpFOrdGreaterThan = 186, + SpvOpFUnordGreaterThan = 187, + SpvOpFOrdLessThanEqual = 188, + SpvOpFUnordLessThanEqual = 189, + SpvOpFOrdGreaterThanEqual = 190, + SpvOpFUnordGreaterThanEqual = 191, + SpvOpShiftRightLogical = 194, + SpvOpShiftRightArithmetic = 195, + SpvOpShiftLeftLogical = 196, + SpvOpBitwiseOr = 197, + SpvOpBitwiseXor = 198, + SpvOpBitwiseAnd = 199, + SpvOpNot = 200, + SpvOpBitFieldInsert = 201, + SpvOpBitFieldSExtract = 202, + SpvOpBitFieldUExtract = 203, + SpvOpBitReverse = 204, + SpvOpBitCount = 205, + SpvOpDPdx = 207, + SpvOpDPdy = 208, + SpvOpFwidth = 209, + SpvOpDPdxFine = 210, + SpvOpDPdyFine = 211, + SpvOpFwidthFine = 212, + SpvOpDPdxCoarse = 213, + SpvOpDPdyCoarse = 214, + SpvOpFwidthCoarse = 215, + SpvOpEmitVertex = 218, + SpvOpEndPrimitive = 219, + SpvOpEmitStreamVertex = 220, + SpvOpEndStreamPrimitive = 221, + SpvOpControlBarrier = 224, + SpvOpMemoryBarrier = 225, + SpvOpAtomicLoad = 227, + SpvOpAtomicStore = 228, + SpvOpAtomicExchange = 229, + SpvOpAtomicCompareExchange = 230, + SpvOpAtomicCompareExchangeWeak = 231, + SpvOpAtomicIIncrement = 232, + SpvOpAtomicIDecrement = 233, + SpvOpAtomicIAdd = 234, + SpvOpAtomicISub = 235, + SpvOpAtomicSMin = 236, + SpvOpAtomicUMin = 237, + SpvOpAtomicSMax = 238, + SpvOpAtomicUMax = 239, + SpvOpAtomicAnd = 240, + SpvOpAtomicOr = 241, + SpvOpAtomicXor = 242, + SpvOpPhi = 245, + SpvOpLoopMerge = 246, + SpvOpSelectionMerge = 247, + SpvOpLabel = 248, + SpvOpBranch = 249, + SpvOpBranchConditional = 250, + SpvOpSwitch = 251, + SpvOpKill = 252, + SpvOpReturn = 253, + SpvOpReturnValue = 254, + SpvOpUnreachable = 255, + SpvOpLifetimeStart = 256, + SpvOpLifetimeStop = 257, + SpvOpGroupAsyncCopy = 259, + SpvOpGroupWaitEvents = 260, + SpvOpGroupAll = 261, + SpvOpGroupAny = 262, + SpvOpGroupBroadcast = 263, + SpvOpGroupIAdd = 264, + SpvOpGroupFAdd = 265, + SpvOpGroupFMin = 266, + SpvOpGroupUMin = 267, + SpvOpGroupSMin = 268, + SpvOpGroupFMax = 269, + SpvOpGroupUMax = 270, + SpvOpGroupSMax = 271, + SpvOpReadPipe = 274, + SpvOpWritePipe = 275, + SpvOpReservedReadPipe = 276, + SpvOpReservedWritePipe = 277, + SpvOpReserveReadPipePackets = 278, + SpvOpReserveWritePipePackets = 279, + SpvOpCommitReadPipe = 280, + SpvOpCommitWritePipe = 281, + SpvOpIsValidReserveId = 282, + SpvOpGetNumPipePackets = 283, + SpvOpGetMaxPipePackets = 284, + SpvOpGroupReserveReadPipePackets = 285, + SpvOpGroupReserveWritePipePackets = 286, + SpvOpGroupCommitReadPipe = 287, + SpvOpGroupCommitWritePipe = 288, + SpvOpEnqueueMarker = 291, + SpvOpEnqueueKernel = 292, + SpvOpGetKernelNDrangeSubGroupCount = 293, + SpvOpGetKernelNDrangeMaxSubGroupSize = 294, + SpvOpGetKernelWorkGroupSize = 295, + SpvOpGetKernelPreferredWorkGroupSizeMultiple = 296, + SpvOpRetainEvent = 297, + SpvOpReleaseEvent = 298, + SpvOpCreateUserEvent = 299, + 
SpvOpIsValidEvent = 300, + SpvOpSetUserEventStatus = 301, + SpvOpCaptureEventProfilingInfo = 302, + SpvOpGetDefaultQueue = 303, + SpvOpBuildNDRange = 304, + SpvOpImageSparseSampleImplicitLod = 305, + SpvOpImageSparseSampleExplicitLod = 306, + SpvOpImageSparseSampleDrefImplicitLod = 307, + SpvOpImageSparseSampleDrefExplicitLod = 308, + SpvOpImageSparseSampleProjImplicitLod = 309, + SpvOpImageSparseSampleProjExplicitLod = 310, + SpvOpImageSparseSampleProjDrefImplicitLod = 311, + SpvOpImageSparseSampleProjDrefExplicitLod = 312, + SpvOpImageSparseFetch = 313, + SpvOpImageSparseGather = 314, + SpvOpImageSparseDrefGather = 315, + SpvOpImageSparseTexelsResident = 316, + SpvOpNoLine = 317, + SpvOpAtomicFlagTestAndSet = 318, + SpvOpAtomicFlagClear = 319, + SpvOpImageSparseRead = 320, + SpvOpSizeOf = 321, + SpvOpTypePipeStorage = 322, + SpvOpConstantPipeStorage = 323, + SpvOpCreatePipeFromPipeStorage = 324, + SpvOpGetKernelLocalSizeForSubgroupCount = 325, + SpvOpGetKernelMaxNumSubgroups = 326, + SpvOpTypeNamedBarrier = 327, + SpvOpNamedBarrierInitialize = 328, + SpvOpMemoryNamedBarrier = 329, + SpvOpModuleProcessed = 330, + SpvOpExecutionModeId = 331, + SpvOpDecorateId = 332, + SpvOpGroupNonUniformElect = 333, + SpvOpGroupNonUniformAll = 334, + SpvOpGroupNonUniformAny = 335, + SpvOpGroupNonUniformAllEqual = 336, + SpvOpGroupNonUniformBroadcast = 337, + SpvOpGroupNonUniformBroadcastFirst = 338, + SpvOpGroupNonUniformBallot = 339, + SpvOpGroupNonUniformInverseBallot = 340, + SpvOpGroupNonUniformBallotBitExtract = 341, + SpvOpGroupNonUniformBallotBitCount = 342, + SpvOpGroupNonUniformBallotFindLSB = 343, + SpvOpGroupNonUniformBallotFindMSB = 344, + SpvOpGroupNonUniformShuffle = 345, + SpvOpGroupNonUniformShuffleXor = 346, + SpvOpGroupNonUniformShuffleUp = 347, + SpvOpGroupNonUniformShuffleDown = 348, + SpvOpGroupNonUniformIAdd = 349, + SpvOpGroupNonUniformFAdd = 350, + SpvOpGroupNonUniformIMul = 351, + SpvOpGroupNonUniformFMul = 352, + SpvOpGroupNonUniformSMin = 353, + SpvOpGroupNonUniformUMin = 354, + SpvOpGroupNonUniformFMin = 355, + SpvOpGroupNonUniformSMax = 356, + SpvOpGroupNonUniformUMax = 357, + SpvOpGroupNonUniformFMax = 358, + SpvOpGroupNonUniformBitwiseAnd = 359, + SpvOpGroupNonUniformBitwiseOr = 360, + SpvOpGroupNonUniformBitwiseXor = 361, + SpvOpGroupNonUniformLogicalAnd = 362, + SpvOpGroupNonUniformLogicalOr = 363, + SpvOpGroupNonUniformLogicalXor = 364, + SpvOpGroupNonUniformQuadBroadcast = 365, + SpvOpGroupNonUniformQuadSwap = 366, +}; +static const int kKnownOpsCount = SpvOpGroupNonUniformQuadSwap+1; + + +static const char* kSpirvOpNames[] = +{ + "Nop", + "Undef", + "SourceContinued", + "Source", + "SourceExtension", + "Name", + "MemberName", + "String", + "Line", + "#9", + "Extension", + "ExtInstImport", + "ExtInst", + "VectorShuffleCompact", + "MemoryModel", + "EntryPoint", + "ExecutionMode", + "Capability", + "#18", + "TypeVoid", + "TypeBool", + "TypeInt", + "TypeFloat", + "TypeVector", + "TypeMatrix", + "TypeImage", + "TypeSampler", + "TypeSampledImage", + "TypeArray", + "TypeRuntimeArray", + "TypeStruct", + "TypeOpaque", + "TypePointer", + "TypeFunction", + "TypeEvent", + "TypeDeviceEvent", + "TypeReserveId", + "TypeQueue", + "TypePipe", + "TypeForwardPointer", + "#40", + "ConstantTrue", + "ConstantFalse", + "Constant", + "ConstantComposite", + "ConstantSampler", + "ConstantNull", + "#47", + "SpecConstantTrue", + "SpecConstantFalse", + "SpecConstant", + "SpecConstantComposite", + "SpecConstantOp", + "#53", + "Function", + "FunctionParameter", + "FunctionEnd", + "FunctionCall", + 
"#58", + "Variable", + "ImageTexelPointer", + "Load", + "Store", + "CopyMemory", + "CopyMemorySized", + "AccessChain", + "InBoundsAccessChain", + "PtrAccessChain", + "ArrayLength", + "GenericPtrMemSemantics", + "InBoundsPtrAccessChain", + "Decorate", + "MemberDecorate", + "DecorationGroup", + "GroupDecorate", + "GroupMemberDecorate", + "#76", + "VectorExtractDynamic", + "VectorInsertDynamic", + "VectorShuffle", + "CompositeConstruct", + "CompositeExtract", + "CompositeInsert", + "CopyObject", + "Transpose", + "#85", + "SampledImage", + "ImageSampleImplicitLod", + "ImageSampleExplicitLod", + "ImageSampleDrefImplicitLod", + "ImageSampleDrefExplicitLod", + "ImageSampleProjImplicitLod", + "ImageSampleProjExplicitLod", + "ImageSampleProjDrefImplicitLod", + "ImageSampleProjDrefExplicitLod", + "ImageFetch", + "ImageGather", + "ImageDrefGather", + "ImageRead", + "ImageWrite", + "Image", + "ImageQueryFormat", + "ImageQueryOrder", + "ImageQuerySizeLod", + "ImageQuerySize", + "ImageQueryLod", + "ImageQueryLevels", + "ImageQuerySamples", + "#108", + "ConvertFToU", + "ConvertFToS", + "ConvertSToF", + "ConvertUToF", + "UConvert", + "SConvert", + "FConvert", + "QuantizeToF16", + "ConvertPtrToU", + "SatConvertSToU", + "SatConvertUToS", + "ConvertUToPtr", + "PtrCastToGeneric", + "GenericCastToPtr", + "GenericCastToPtrExplicit", + "Bitcast", + "#125", + "SNegate", + "FNegate", + "IAdd", + "FAdd", + "ISub", + "FSub", + "IMul", + "FMul", + "UDiv", + "SDiv", + "FDiv", + "UMod", + "SRem", + "SMod", + "FRem", + "FMod", + "VectorTimesScalar", + "MatrixTimesScalar", + "VectorTimesMatrix", + "MatrixTimesVector", + "MatrixTimesMatrix", + "OuterProduct", + "Dot", + "IAddCarry", + "ISubBorrow", + "UMulExtended", + "SMulExtended", + "#153", + "Any", + "All", + "IsNan", + "IsInf", + "IsFinite", + "IsNormal", + "SignBitSet", + "LessOrGreater", + "Ordered", + "Unordered", + "LogicalEqual", + "LogicalNotEqual", + "LogicalOr", + "LogicalAnd", + "LogicalNot", + "Select", + "IEqual", + "INotEqual", + "UGreaterThan", + "SGreaterThan", + "UGreaterThanEqual", + "SGreaterThanEqual", + "ULessThan", + "SLessThan", + "ULessThanEqual", + "SLessThanEqual", + "FOrdEqual", + "FUnordEqual", + "FOrdNotEqual", + "FUnordNotEqual", + "FOrdLessThan", + "FUnordLessThan", + "FOrdGreaterThan", + "FUnordGreaterThan", + "FOrdLessThanEqual", + "FUnordLessThanEqual", + "FOrdGreaterThanEqual", + "FUnordGreaterThanEqual", + "#192", + "#193", + "ShiftRightLogical", + "ShiftRightArithmetic", + "ShiftLeftLogical", + "BitwiseOr", + "BitwiseXor", + "BitwiseAnd", + "Not", + "BitFieldInsert", + "BitFieldSExtract", + "BitFieldUExtract", + "BitReverse", + "BitCount", + "#206", + "DPdx", + "DPdy", + "Fwidth", + "DPdxFine", + "DPdyFine", + "FwidthFine", + "DPdxCoarse", + "DPdyCoarse", + "FwidthCoarse", + "#216", + "#217", + "EmitVertex", + "EndPrimitive", + "EmitStreamVertex", + "EndStreamPrimitive", + "#222", + "#223", + "ControlBarrier", + "MemoryBarrier", + "#226", + "AtomicLoad", + "AtomicStore", + "AtomicExchange", + "AtomicCompareExchange", + "AtomicCompareExchangeWeak", + "AtomicIIncrement", + "AtomicIDecrement", + "AtomicIAdd", + "AtomicISub", + "AtomicSMin", + "AtomicUMin", + "AtomicSMax", + "AtomicUMax", + "AtomicAnd", + "AtomicOr", + "AtomicXor", + "#243", + "#244", + "Phi", + "LoopMerge", + "SelectionMerge", + "Label", + "Branch", + "BranchConditional", + "Switch", + "Kill", + "Return", + "ReturnValue", + "Unreachable", + "LifetimeStart", + "LifetimeStop", + "#258", + "GroupAsyncCopy", + "GroupWaitEvents", + "GroupAll", + "GroupAny", + 
"GroupBroadcast", + "GroupIAdd", + "GroupFAdd", + "GroupFMin", + "GroupUMin", + "GroupSMin", + "GroupFMax", + "GroupUMax", + "GroupSMax", + "#272", + "#273", + "ReadPipe", + "WritePipe", + "ReservedReadPipe", + "ReservedWritePipe", + "ReserveReadPipePackets", + "ReserveWritePipePackets", + "CommitReadPipe", + "CommitWritePipe", + "IsValidReserveId", + "GetNumPipePackets", + "GetMaxPipePackets", + "GroupReserveReadPipePackets", + "GroupReserveWritePipePackets", + "GroupCommitReadPipe", + "GroupCommitWritePipe", + "#289", + "#290", + "EnqueueMarker", + "EnqueueKernel", + "GetKernelNDrangeSubGroupCount", + "GetKernelNDrangeMaxSubGroupSize", + "GetKernelWorkGroupSize", + "GetKernelPreferredWorkGroupSizeMultiple", + "RetainEvent", + "ReleaseEvent", + "CreateUserEvent", + "IsValidEvent", + "SetUserEventStatus", + "CaptureEventProfilingInfo", + "GetDefaultQueue", + "BuildNDRange", + "ImageSparseSampleImplicitLod", + "ImageSparseSampleExplicitLod", + "ImageSparseSampleDrefImplicitLod", + "ImageSparseSampleDrefExplicitLod", + "ImageSparseSampleProjImplicitLod", + "ImageSparseSampleProjExplicitLod", + "ImageSparseSampleProjDrefImplicitLod", + "ImageSparseSampleProjDrefExplicitLod", + "ImageSparseFetch", + "ImageSparseGather", + "ImageSparseDrefGather", + "ImageSparseTexelsResident", + "NoLine", + "AtomicFlagTestAndSet", + "AtomicFlagClear", + "ImageSparseRead", + "SizeOf", + "TypePipeStorage", + "ConstantPipeStorage", + "CreatePipeFromPipeStorage", + "GetKernelLocalSizeForSubgroupCount", + "GetKernelMaxNumSubgroups", + "TypeNamedBarrier", + "NamedBarrierInitialize", + "MemoryNamedBarrier", + "ModuleProcessed", + "ExecutionModeId", + "DecorateId", + "GroupNonUniformElect", + "GroupNonUniformAll", + "GroupNonUniformAny", + "GroupNonUniformAllEqual", + "GroupNonUniformBroadcast", + "GroupNonUniformBroadcastFirst", + "GroupNonUniformBallot", + "GroupNonUniformInverseBallot", + "GroupNonUniformBallotBitExtract", + "GroupNonUniformBallotBitCount", + "GroupNonUniformBallotFindLSB", + "GroupNonUniformBallotFindMSB", + "GroupNonUniformShuffle", + "GroupNonUniformShuffleXor", + "GroupNonUniformShuffleUp", + "GroupNonUniformShuffleDown", + "GroupNonUniformIAdd", + "GroupNonUniformFAdd", + "GroupNonUniformIMul", + "GroupNonUniformFMul", + "GroupNonUniformSMin", + "GroupNonUniformUMin", + "GroupNonUniformFMin", + "GroupNonUniformSMax", + "GroupNonUniformUMax", + "GroupNonUniformFMax", + "GroupNonUniformBitwiseAnd", + "GroupNonUniformBitwiseOr", + "GroupNonUniformBitwiseXor", + "GroupNonUniformLogicalAnd", + "GroupNonUniformLogicalOr", + "GroupNonUniformLogicalXor", + "GroupNonUniformQuadBroadcast", + "GroupNonUniformQuadSwap", +}; +static_assert(_SMOLV_ARRAY_SIZE(kSpirvOpNames) == kKnownOpsCount, "kSpirvOpNames table mismatch with known SpvOps"); + + +struct OpData +{ + uint8_t hasResult; // does it have result ID? + uint8_t hasType; // does it have type ID? + uint8_t deltaFromResult; // How many words after (optional) type+result to write out as deltas from result? + uint8_t varrest; // should the rest of words be written in varint encoding? 
+}; +static const OpData kSpirvOpData[] = +{ + {0, 0, 0, 0}, // Nop + {1, 1, 0, 0}, // Undef + {0, 0, 0, 0}, // SourceContinued + {0, 0, 0, 1}, // Source + {0, 0, 0, 0}, // SourceExtension + {0, 0, 0, 0}, // Name + {0, 0, 0, 0}, // MemberName + {0, 0, 0, 0}, // String + {0, 0, 0, 1}, // Line + {1, 1, 0, 0}, // #9 + {0, 0, 0, 0}, // Extension + {1, 0, 0, 0}, // ExtInstImport + {1, 1, 0, 1}, // ExtInst + {1, 1, 2, 1}, // VectorShuffleCompact - new in SMOLV + {0, 0, 0, 1}, // MemoryModel + {0, 0, 0, 1}, // EntryPoint + {0, 0, 0, 1}, // ExecutionMode + {0, 0, 0, 1}, // Capability + {1, 1, 0, 0}, // #18 + {1, 0, 0, 1}, // TypeVoid + {1, 0, 0, 1}, // TypeBool + {1, 0, 0, 1}, // TypeInt + {1, 0, 0, 1}, // TypeFloat + {1, 0, 0, 1}, // TypeVector + {1, 0, 0, 1}, // TypeMatrix + {1, 0, 0, 1}, // TypeImage + {1, 0, 0, 1}, // TypeSampler + {1, 0, 0, 1}, // TypeSampledImage + {1, 0, 0, 1}, // TypeArray + {1, 0, 0, 1}, // TypeRuntimeArray + {1, 0, 0, 1}, // TypeStruct + {1, 0, 0, 1}, // TypeOpaque + {1, 0, 0, 1}, // TypePointer + {1, 0, 0, 1}, // TypeFunction + {1, 0, 0, 1}, // TypeEvent + {1, 0, 0, 1}, // TypeDeviceEvent + {1, 0, 0, 1}, // TypeReserveId + {1, 0, 0, 1}, // TypeQueue + {1, 0, 0, 1}, // TypePipe + {0, 0, 0, 1}, // TypeForwardPointer + {1, 1, 0, 0}, // #40 + {1, 1, 0, 0}, // ConstantTrue + {1, 1, 0, 0}, // ConstantFalse + {1, 1, 0, 0}, // Constant + {1, 1, 9, 0}, // ConstantComposite + {1, 1, 0, 1}, // ConstantSampler + {1, 1, 0, 0}, // ConstantNull + {1, 1, 0, 0}, // #47 + {1, 1, 0, 0}, // SpecConstantTrue + {1, 1, 0, 0}, // SpecConstantFalse + {1, 1, 0, 0}, // SpecConstant + {1, 1, 9, 0}, // SpecConstantComposite + {1, 1, 0, 0}, // SpecConstantOp + {1, 1, 0, 0}, // #53 + {1, 1, 0, 1}, // Function + {1, 1, 0, 0}, // FunctionParameter + {0, 0, 0, 0}, // FunctionEnd + {1, 1, 9, 0}, // FunctionCall + {1, 1, 0, 0}, // #58 + {1, 1, 0, 1}, // Variable + {1, 1, 0, 0}, // ImageTexelPointer + {1, 1, 1, 1}, // Load + {0, 0, 2, 1}, // Store + {0, 0, 0, 0}, // CopyMemory + {0, 0, 0, 0}, // CopyMemorySized + {1, 1, 0, 1}, // AccessChain + {1, 1, 0, 0}, // InBoundsAccessChain + {1, 1, 0, 0}, // PtrAccessChain + {1, 1, 0, 0}, // ArrayLength + {1, 1, 0, 0}, // GenericPtrMemSemantics + {1, 1, 0, 0}, // InBoundsPtrAccessChain + {0, 0, 0, 1}, // Decorate + {0, 0, 0, 1}, // MemberDecorate + {1, 0, 0, 0}, // DecorationGroup + {0, 0, 0, 0}, // GroupDecorate + {0, 0, 0, 0}, // GroupMemberDecorate + {1, 1, 0, 0}, // #76 + {1, 1, 1, 1}, // VectorExtractDynamic + {1, 1, 2, 1}, // VectorInsertDynamic + {1, 1, 2, 1}, // VectorShuffle + {1, 1, 9, 0}, // CompositeConstruct + {1, 1, 1, 1}, // CompositeExtract + {1, 1, 2, 1}, // CompositeInsert + {1, 1, 1, 0}, // CopyObject + {1, 1, 0, 0}, // Transpose + {1, 1, 0, 0}, // #85 + {1, 1, 0, 0}, // SampledImage + {1, 1, 2, 1}, // ImageSampleImplicitLod + {1, 1, 2, 1}, // ImageSampleExplicitLod + {1, 1, 3, 1}, // ImageSampleDrefImplicitLod + {1, 1, 3, 1}, // ImageSampleDrefExplicitLod + {1, 1, 2, 1}, // ImageSampleProjImplicitLod + {1, 1, 2, 1}, // ImageSampleProjExplicitLod + {1, 1, 3, 1}, // ImageSampleProjDrefImplicitLod + {1, 1, 3, 1}, // ImageSampleProjDrefExplicitLod + {1, 1, 2, 1}, // ImageFetch + {1, 1, 3, 1}, // ImageGather + {1, 1, 3, 1}, // ImageDrefGather + {1, 1, 2, 1}, // ImageRead + {0, 0, 3, 1}, // ImageWrite + {1, 1, 1, 0}, // Image + {1, 1, 1, 0}, // ImageQueryFormat + {1, 1, 1, 0}, // ImageQueryOrder + {1, 1, 2, 0}, // ImageQuerySizeLod + {1, 1, 1, 0}, // ImageQuerySize + {1, 1, 2, 0}, // ImageQueryLod + {1, 1, 1, 0}, // ImageQueryLevels + {1, 1, 1, 0}, // 
ImageQuerySamples + {1, 1, 0, 0}, // #108 + {1, 1, 1, 0}, // ConvertFToU + {1, 1, 1, 0}, // ConvertFToS + {1, 1, 1, 0}, // ConvertSToF + {1, 1, 1, 0}, // ConvertUToF + {1, 1, 1, 0}, // UConvert + {1, 1, 1, 0}, // SConvert + {1, 1, 1, 0}, // FConvert + {1, 1, 1, 0}, // QuantizeToF16 + {1, 1, 1, 0}, // ConvertPtrToU + {1, 1, 1, 0}, // SatConvertSToU + {1, 1, 1, 0}, // SatConvertUToS + {1, 1, 1, 0}, // ConvertUToPtr + {1, 1, 1, 0}, // PtrCastToGeneric + {1, 1, 1, 0}, // GenericCastToPtr + {1, 1, 1, 1}, // GenericCastToPtrExplicit + {1, 1, 1, 0}, // Bitcast + {1, 1, 0, 0}, // #125 + {1, 1, 1, 0}, // SNegate + {1, 1, 1, 0}, // FNegate + {1, 1, 2, 0}, // IAdd + {1, 1, 2, 0}, // FAdd + {1, 1, 2, 0}, // ISub + {1, 1, 2, 0}, // FSub + {1, 1, 2, 0}, // IMul + {1, 1, 2, 0}, // FMul + {1, 1, 2, 0}, // UDiv + {1, 1, 2, 0}, // SDiv + {1, 1, 2, 0}, // FDiv + {1, 1, 2, 0}, // UMod + {1, 1, 2, 0}, // SRem + {1, 1, 2, 0}, // SMod + {1, 1, 2, 0}, // FRem + {1, 1, 2, 0}, // FMod + {1, 1, 2, 0}, // VectorTimesScalar + {1, 1, 2, 0}, // MatrixTimesScalar + {1, 1, 2, 0}, // VectorTimesMatrix + {1, 1, 2, 0}, // MatrixTimesVector + {1, 1, 2, 0}, // MatrixTimesMatrix + {1, 1, 2, 0}, // OuterProduct + {1, 1, 2, 0}, // Dot + {1, 1, 2, 0}, // IAddCarry + {1, 1, 2, 0}, // ISubBorrow + {1, 1, 2, 0}, // UMulExtended + {1, 1, 2, 0}, // SMulExtended + {1, 1, 0, 0}, // #153 + {1, 1, 1, 0}, // Any + {1, 1, 1, 0}, // All + {1, 1, 1, 0}, // IsNan + {1, 1, 1, 0}, // IsInf + {1, 1, 1, 0}, // IsFinite + {1, 1, 1, 0}, // IsNormal + {1, 1, 1, 0}, // SignBitSet + {1, 1, 2, 0}, // LessOrGreater + {1, 1, 2, 0}, // Ordered + {1, 1, 2, 0}, // Unordered + {1, 1, 2, 0}, // LogicalEqual + {1, 1, 2, 0}, // LogicalNotEqual + {1, 1, 2, 0}, // LogicalOr + {1, 1, 2, 0}, // LogicalAnd + {1, 1, 1, 0}, // LogicalNot + {1, 1, 3, 0}, // Select + {1, 1, 2, 0}, // IEqual + {1, 1, 2, 0}, // INotEqual + {1, 1, 2, 0}, // UGreaterThan + {1, 1, 2, 0}, // SGreaterThan + {1, 1, 2, 0}, // UGreaterThanEqual + {1, 1, 2, 0}, // SGreaterThanEqual + {1, 1, 2, 0}, // ULessThan + {1, 1, 2, 0}, // SLessThan + {1, 1, 2, 0}, // ULessThanEqual + {1, 1, 2, 0}, // SLessThanEqual + {1, 1, 2, 0}, // FOrdEqual + {1, 1, 2, 0}, // FUnordEqual + {1, 1, 2, 0}, // FOrdNotEqual + {1, 1, 2, 0}, // FUnordNotEqual + {1, 1, 2, 0}, // FOrdLessThan + {1, 1, 2, 0}, // FUnordLessThan + {1, 1, 2, 0}, // FOrdGreaterThan + {1, 1, 2, 0}, // FUnordGreaterThan + {1, 1, 2, 0}, // FOrdLessThanEqual + {1, 1, 2, 0}, // FUnordLessThanEqual + {1, 1, 2, 0}, // FOrdGreaterThanEqual + {1, 1, 2, 0}, // FUnordGreaterThanEqual + {1, 1, 0, 0}, // #192 + {1, 1, 0, 0}, // #193 + {1, 1, 2, 0}, // ShiftRightLogical + {1, 1, 2, 0}, // ShiftRightArithmetic + {1, 1, 2, 0}, // ShiftLeftLogical + {1, 1, 2, 0}, // BitwiseOr + {1, 1, 2, 0}, // BitwiseXor + {1, 1, 2, 0}, // BitwiseAnd + {1, 1, 1, 0}, // Not + {1, 1, 4, 0}, // BitFieldInsert + {1, 1, 3, 0}, // BitFieldSExtract + {1, 1, 3, 0}, // BitFieldUExtract + {1, 1, 1, 0}, // BitReverse + {1, 1, 1, 0}, // BitCount + {1, 1, 0, 0}, // #206 + {1, 1, 0, 0}, // DPdx + {1, 1, 0, 0}, // DPdy + {1, 1, 0, 0}, // Fwidth + {1, 1, 0, 0}, // DPdxFine + {1, 1, 0, 0}, // DPdyFine + {1, 1, 0, 0}, // FwidthFine + {1, 1, 0, 0}, // DPdxCoarse + {1, 1, 0, 0}, // DPdyCoarse + {1, 1, 0, 0}, // FwidthCoarse + {1, 1, 0, 0}, // #216 + {1, 1, 0, 0}, // #217 + {0, 0, 0, 0}, // EmitVertex + {0, 0, 0, 0}, // EndPrimitive + {0, 0, 0, 0}, // EmitStreamVertex + {0, 0, 0, 0}, // EndStreamPrimitive + {1, 1, 0, 0}, // #222 + {1, 1, 0, 0}, // #223 + {0, 0, 3, 0}, // ControlBarrier + {0, 0, 2, 0}, // 
MemoryBarrier + {1, 1, 0, 0}, // #226 + {1, 1, 0, 0}, // AtomicLoad + {0, 0, 0, 0}, // AtomicStore + {1, 1, 0, 0}, // AtomicExchange + {1, 1, 0, 0}, // AtomicCompareExchange + {1, 1, 0, 0}, // AtomicCompareExchangeWeak + {1, 1, 0, 0}, // AtomicIIncrement + {1, 1, 0, 0}, // AtomicIDecrement + {1, 1, 0, 0}, // AtomicIAdd + {1, 1, 0, 0}, // AtomicISub + {1, 1, 0, 0}, // AtomicSMin + {1, 1, 0, 0}, // AtomicUMin + {1, 1, 0, 0}, // AtomicSMax + {1, 1, 0, 0}, // AtomicUMax + {1, 1, 0, 0}, // AtomicAnd + {1, 1, 0, 0}, // AtomicOr + {1, 1, 0, 0}, // AtomicXor + {1, 1, 0, 0}, // #243 + {1, 1, 0, 0}, // #244 + {1, 1, 0, 0}, // Phi + {0, 0, 2, 1}, // LoopMerge + {0, 0, 1, 1}, // SelectionMerge + {1, 0, 0, 0}, // Label + {0, 0, 1, 0}, // Branch + {0, 0, 3, 1}, // BranchConditional + {0, 0, 0, 0}, // Switch + {0, 0, 0, 0}, // Kill + {0, 0, 0, 0}, // Return + {0, 0, 0, 0}, // ReturnValue + {0, 0, 0, 0}, // Unreachable + {0, 0, 0, 0}, // LifetimeStart + {0, 0, 0, 0}, // LifetimeStop + {1, 1, 0, 0}, // #258 + {1, 1, 0, 0}, // GroupAsyncCopy + {0, 0, 0, 0}, // GroupWaitEvents + {1, 1, 0, 0}, // GroupAll + {1, 1, 0, 0}, // GroupAny + {1, 1, 0, 0}, // GroupBroadcast + {1, 1, 0, 0}, // GroupIAdd + {1, 1, 0, 0}, // GroupFAdd + {1, 1, 0, 0}, // GroupFMin + {1, 1, 0, 0}, // GroupUMin + {1, 1, 0, 0}, // GroupSMin + {1, 1, 0, 0}, // GroupFMax + {1, 1, 0, 0}, // GroupUMax + {1, 1, 0, 0}, // GroupSMax + {1, 1, 0, 0}, // #272 + {1, 1, 0, 0}, // #273 + {1, 1, 0, 0}, // ReadPipe + {1, 1, 0, 0}, // WritePipe + {1, 1, 0, 0}, // ReservedReadPipe + {1, 1, 0, 0}, // ReservedWritePipe + {1, 1, 0, 0}, // ReserveReadPipePackets + {1, 1, 0, 0}, // ReserveWritePipePackets + {0, 0, 0, 0}, // CommitReadPipe + {0, 0, 0, 0}, // CommitWritePipe + {1, 1, 0, 0}, // IsValidReserveId + {1, 1, 0, 0}, // GetNumPipePackets + {1, 1, 0, 0}, // GetMaxPipePackets + {1, 1, 0, 0}, // GroupReserveReadPipePackets + {1, 1, 0, 0}, // GroupReserveWritePipePackets + {0, 0, 0, 0}, // GroupCommitReadPipe + {0, 0, 0, 0}, // GroupCommitWritePipe + {1, 1, 0, 0}, // #289 + {1, 1, 0, 0}, // #290 + {1, 1, 0, 0}, // EnqueueMarker + {1, 1, 0, 0}, // EnqueueKernel + {1, 1, 0, 0}, // GetKernelNDrangeSubGroupCount + {1, 1, 0, 0}, // GetKernelNDrangeMaxSubGroupSize + {1, 1, 0, 0}, // GetKernelWorkGroupSize + {1, 1, 0, 0}, // GetKernelPreferredWorkGroupSizeMultiple + {0, 0, 0, 0}, // RetainEvent + {0, 0, 0, 0}, // ReleaseEvent + {1, 1, 0, 0}, // CreateUserEvent + {1, 1, 0, 0}, // IsValidEvent + {0, 0, 0, 0}, // SetUserEventStatus + {0, 0, 0, 0}, // CaptureEventProfilingInfo + {1, 1, 0, 0}, // GetDefaultQueue + {1, 1, 0, 0}, // BuildNDRange + {1, 1, 2, 1}, // ImageSparseSampleImplicitLod + {1, 1, 2, 1}, // ImageSparseSampleExplicitLod + {1, 1, 3, 1}, // ImageSparseSampleDrefImplicitLod + {1, 1, 3, 1}, // ImageSparseSampleDrefExplicitLod + {1, 1, 2, 1}, // ImageSparseSampleProjImplicitLod + {1, 1, 2, 1}, // ImageSparseSampleProjExplicitLod + {1, 1, 3, 1}, // ImageSparseSampleProjDrefImplicitLod + {1, 1, 3, 1}, // ImageSparseSampleProjDrefExplicitLod + {1, 1, 2, 1}, // ImageSparseFetch + {1, 1, 3, 1}, // ImageSparseGather + {1, 1, 3, 1}, // ImageSparseDrefGather + {1, 1, 1, 0}, // ImageSparseTexelsResident + {0, 0, 0, 0}, // NoLine + {1, 1, 0, 0}, // AtomicFlagTestAndSet + {0, 0, 0, 0}, // AtomicFlagClear + {1, 1, 0, 0}, // ImageSparseRead + {1, 1, 0, 0}, // SizeOf + {1, 1, 0, 0}, // TypePipeStorage + {1, 1, 0, 0}, // ConstantPipeStorage + {1, 1, 0, 0}, // CreatePipeFromPipeStorage + {1, 1, 0, 0}, // GetKernelLocalSizeForSubgroupCount + {1, 1, 0, 0}, // 
GetKernelMaxNumSubgroups + {1, 1, 0, 0}, // TypeNamedBarrier + {1, 1, 0, 1}, // NamedBarrierInitialize + {0, 0, 2, 1}, // MemoryNamedBarrier + {1, 1, 0, 0}, // ModuleProcessed + {0, 0, 0, 1}, // ExecutionModeId + {0, 0, 0, 1}, // DecorateId + {1, 1, 1, 1}, // GroupNonUniformElect + {1, 1, 1, 1}, // GroupNonUniformAll + {1, 1, 1, 1}, // GroupNonUniformAny + {1, 1, 1, 1}, // GroupNonUniformAllEqual + {1, 1, 1, 1}, // GroupNonUniformBroadcast + {1, 1, 1, 1}, // GroupNonUniformBroadcastFirst + {1, 1, 1, 1}, // GroupNonUniformBallot + {1, 1, 1, 1}, // GroupNonUniformInverseBallot + {1, 1, 1, 1}, // GroupNonUniformBallotBitExtract + {1, 1, 1, 1}, // GroupNonUniformBallotBitCount + {1, 1, 1, 1}, // GroupNonUniformBallotFindLSB + {1, 1, 1, 1}, // GroupNonUniformBallotFindMSB + {1, 1, 1, 1}, // GroupNonUniformShuffle + {1, 1, 1, 1}, // GroupNonUniformShuffleXor + {1, 1, 1, 1}, // GroupNonUniformShuffleUp + {1, 1, 1, 1}, // GroupNonUniformShuffleDown + {1, 1, 1, 1}, // GroupNonUniformIAdd + {1, 1, 1, 1}, // GroupNonUniformFAdd + {1, 1, 1, 1}, // GroupNonUniformIMul + {1, 1, 1, 1}, // GroupNonUniformFMul + {1, 1, 1, 1}, // GroupNonUniformSMin + {1, 1, 1, 1}, // GroupNonUniformUMin + {1, 1, 1, 1}, // GroupNonUniformFMin + {1, 1, 1, 1}, // GroupNonUniformSMax + {1, 1, 1, 1}, // GroupNonUniformUMax + {1, 1, 1, 1}, // GroupNonUniformFMax + {1, 1, 1, 1}, // GroupNonUniformBitwiseAnd + {1, 1, 1, 1}, // GroupNonUniformBitwiseOr + {1, 1, 1, 1}, // GroupNonUniformBitwiseXor + {1, 1, 1, 1}, // GroupNonUniformLogicalAnd + {1, 1, 1, 1}, // GroupNonUniformLogicalOr + {1, 1, 1, 1}, // GroupNonUniformLogicalXor + {1, 1, 1, 1}, // GroupNonUniformQuadBroadcast + {1, 1, 1, 1}, // GroupNonUniformQuadSwap +}; +static_assert(_SMOLV_ARRAY_SIZE(kSpirvOpData) == kKnownOpsCount, "kSpirvOpData table mismatch with known SpvOps"); + +// Instruction encoding depends on the table that describes the various SPIR-V opcodes. +// Whenever we change or expand the table, we need to bump up the SMOL-V version, and make +// sure that we can still decode files encoded by an older version. 
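// --------------------------------------------------------------------------------------------
// [Editor's note: illustrative worked example, not part of the upstream smol-v sources or of
// this patch.] Reading the kSpirvOpData entry for Load above, {1, 1, 1, 1}, a typical four-word
// OpLoad (type ID, result ID, pointer ID) is written out as:
//
//   hasType = 1          -> the type ID is written as a plain varint
//   hasResult = 1        -> the result ID is written as varint(zigzag(result - prevResult))
//   deltaFromResult = 1  -> the pointer ID is written as varint(zigzag(prevResult - pointer))
//   varrest = 1          -> any remaining words (e.g. an optional MemoryAccess literal) are
//                           written as plain varints
//
// The decoder walks the same table in the same order, which is why the number of known ops is
// tied to the encoding version selected just below.
// --------------------------------------------------------------------------------------------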
+static int smolv_GetKnownOpsCount(int version) +{ + if (version == 0) + return SpvOpModuleProcessed+1; + if (version == 1) // 2020 February, version 1 added ExecutionModeId..GroupNonUniformQuadSwap + return SpvOpGroupNonUniformQuadSwap+1; + return 0; +} + +static bool smolv_OpHasResult(SpvOp op, int opsCount) +{ + if (op < 0 || op >= opsCount) + return false; + return kSpirvOpData[op].hasResult != 0; +} + +static bool smolv_OpHasType(SpvOp op, int opsCount) +{ + if (op < 0 || op >= opsCount) + return false; + return kSpirvOpData[op].hasType != 0; +} + +static int smolv_OpDeltaFromResult(SpvOp op, int opsCount) +{ + if (op < 0 || op >= opsCount) + return 0; + return kSpirvOpData[op].deltaFromResult; +} + +static bool smolv_OpVarRest(SpvOp op, int opsCount) +{ + if (op < 0 || op >= opsCount) + return false; + return kSpirvOpData[op].varrest != 0; +} + +static bool smolv_OpDebugInfo(SpvOp op, int opsCount) +{ + return + op == SpvOpSourceContinued || + op == SpvOpSource || + op == SpvOpSourceExtension || + op == SpvOpName || + op == SpvOpMemberName || + op == SpvOpString || + op == SpvOpLine || + op == SpvOpNoLine || + op == SpvOpModuleProcessed; +} + + +static int smolv_DecorationExtraOps(int dec) +{ + if (dec == 0 || (dec >= 2 && dec <= 5)) // RelaxedPrecision, Block..ColMajor + return 0; + if (dec >= 29 && dec <= 37) // Stream..XfbStride + return 1; + return -1; // unknown, encode length +} + + +// -------------------------------------------------------------------------------------------- + + +static bool smolv_CheckGenericHeader(const uint32_t* words, size_t wordCount, uint32_t expectedMagic, uint32_t versionMask) +{ + if (!words) + return false; + if (wordCount < 5) + return false; + + uint32_t headerMagic = words[0]; + if (headerMagic != expectedMagic) + return false; + uint32_t headerVersion = words[1] & versionMask; + if (headerVersion < 0x00010000 || headerVersion > 0x00010500) + return false; // only support 1.0 through 1.5 + + return true; +} + +static const int kSpirVHeaderMagic = 0x07230203; +static const int kSmolHeaderMagic = 0x534D4F4C; // "SMOL" + +static const int kSmolCurrEncodingVersion = 1; + +static bool smolv_CheckSpirVHeader(const uint32_t* words, size_t wordCount) +{ + //@TODO: if SPIR-V header magic was reversed, that means the file got written + // in a "big endian" order. Need to byteswap all words then. 
+ return smolv_CheckGenericHeader(words, wordCount, kSpirVHeaderMagic, 0xFFFFFFFF); +} +static bool smolv_CheckSmolHeader(const uint8_t* bytes, size_t byteCount) +{ + if (!smolv_CheckGenericHeader((const uint32_t*)bytes, byteCount/4, kSmolHeaderMagic, 0x00FFFFFF)) + return false; + if (byteCount < 24) // one more word past header to store decoded length + return false; + // SMOL-V version + int smolVersion = ((const uint32_t*)bytes)[1] >> 24; + if (smolVersion < 0 || smolVersion > kSmolCurrEncodingVersion) + return false; + return true; +} + + +static void smolv_Write4(smolv::ByteArray& arr, uint32_t v) +{ + arr.push_back(v & 0xFF); + arr.push_back((v >> 8) & 0xFF); + arr.push_back((v >> 16) & 0xFF); + arr.push_back(v >> 24); +} + +static void smolv_Write4(uint8_t*& buf, uint32_t v) +{ + memcpy(buf, &v, 4); + buf += 4; +} + + +static bool smolv_Read4(const uint8_t*& data, const uint8_t* dataEnd, uint32_t& outv) +{ + if (data + 4 > dataEnd) + return false; + outv = (data[0]) | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); + data += 4; + return true; +} + + +// -------------------------------------------------------------------------------------------- + +// Variable-length integer encoding for unsigned integers. In each byte: +// - highest bit set if more bytes follow, cleared if this is last byte. +// - other 7 bits are the actual value payload. +// Takes 1-5 bytes to encode an integer (values between 0 and 127 take one byte, etc.). + +static void smolv_WriteVarint(smolv::ByteArray& arr, uint32_t v) +{ + while (v > 127) + { + arr.push_back((v & 127) | 128); + v >>= 7; + } + arr.push_back(v & 127); +} + +static bool smolv_ReadVarint(const uint8_t*& data, const uint8_t* dataEnd, uint32_t& outVal) +{ + uint32_t v = 0; + uint32_t shift = 0; + while (data < dataEnd) + { + uint8_t b = *data; + v |= (b & 127) << shift; + shift += 7; + data++; + if (!(b & 128)) + break; + } + outVal = v; + return true; //@TODO: report failures +} + +static uint32_t smolv_ZigEncode(int32_t i) +{ + return (uint32_t(i) << 1) ^ (i >> 31); +} + +static int32_t smolv_ZigDecode(uint32_t u) +{ + return (u & 1) ? ((u >> 1) ^ ~0) : (u >> 1); +} + + +// Remap most common Op codes (Load, Store, Decorate, VectorShuffle etc.) to be in < 16 range, for +// more compact varint encoding. This basically swaps rarely used op values that are < 16 with the +// ones that are common. 
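// --------------------------------------------------------------------------------------------
// [Editor's note: standalone sketch, not part of the upstream smol-v sources or of this patch.]
// The two primitives described above, re-implemented under hypothetical example_* names so the
// byte layout can be checked in isolation (only <stdint.h> and <vector>, both included above,
// are needed):

static void example_WriteVarint(std::vector<uint8_t>& out, uint32_t v)
{
	// Low 7 bits per byte; the high bit is set while more bytes follow.
	while (v > 127)
	{
		out.push_back(uint8_t((v & 127) | 128));
		v >>= 7;
	}
	out.push_back(uint8_t(v & 127));
}

static uint32_t example_ZigEncode(int32_t i)
{
	// Interleave signs so small magnitudes stay small: 0,-1,1,-2,2 -> 0,1,2,3,4.
	return (uint32_t(i) << 1) ^ uint32_t(i >> 31);
}

static void example_VarintDemo()
{
	std::vector<uint8_t> bytes;
	example_WriteVarint(bytes, 300);                   // two bytes: 0xAC 0x02
	example_WriteVarint(bytes, example_ZigEncode(-1)); // a delta of -1 zigzags to 1: one byte, 0x01
}

// Together with the op remapping described above and the length/opcode bit shuffle further
// below, this is how e.g. a four-word OpLoad collapses: its adjusted length becomes 0, Load is
// swapped to value 1, and the shuffled length+op word is 1, i.e. a single varint byte.
// --------------------------------------------------------------------------------------------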
+ +static SpvOp smolv_RemapOp(SpvOp op) +{ +# define _SMOLV_SWAP_OP(op1,op2) if (op==op1) return op2; if (op==op2) return op1 + _SMOLV_SWAP_OP(SpvOpDecorate,SpvOpNop); // 0: 24% + _SMOLV_SWAP_OP(SpvOpLoad,SpvOpUndef); // 1: 17% + _SMOLV_SWAP_OP(SpvOpStore,SpvOpSourceContinued); // 2: 9% + _SMOLV_SWAP_OP(SpvOpAccessChain,SpvOpSource); // 3: 7.2% + _SMOLV_SWAP_OP(SpvOpVectorShuffle,SpvOpSourceExtension); // 4: 5.0% + // Name - already small enum value - 5: 4.4% + // MemberName - already small enum value - 6: 2.9% + _SMOLV_SWAP_OP(SpvOpMemberDecorate,SpvOpString); // 7: 4.0% + _SMOLV_SWAP_OP(SpvOpLabel,SpvOpLine); // 8: 0.9% + _SMOLV_SWAP_OP(SpvOpVariable,(SpvOp)9); // 9: 3.9% + _SMOLV_SWAP_OP(SpvOpFMul,SpvOpExtension); // 10: 3.9% + _SMOLV_SWAP_OP(SpvOpFAdd,SpvOpExtInstImport); // 11: 2.5% + // ExtInst - already small enum value - 12: 1.2% + // VectorShuffleCompact - already small enum value - used for compact shuffle encoding + _SMOLV_SWAP_OP(SpvOpTypePointer,SpvOpMemoryModel); // 14: 2.2% + _SMOLV_SWAP_OP(SpvOpFNegate,SpvOpEntryPoint); // 15: 1.1% +# undef _SMOLV_SWAP_OP + return op; +} + + +// For most compact varint encoding of common instructions, the instruction length should come out +// into 3 bits (be <8). SPIR-V instruction lengths are always at least 1, and for some other +// instructions they are guaranteed to be some other minimum length. Adjust the length before encoding, +// and after decoding accordingly. + +static uint32_t smolv_EncodeLen(SpvOp op, uint32_t len) +{ + len--; + if (op == SpvOpVectorShuffle) len -= 4; + if (op == SpvOpVectorShuffleCompact) len -= 4; + if (op == SpvOpDecorate) len -= 2; + if (op == SpvOpLoad) len -= 3; + if (op == SpvOpAccessChain) len -= 3; + return len; +} + +static uint32_t smolv_DecodeLen(SpvOp op, uint32_t len) +{ + len++; + if (op == SpvOpVectorShuffle) len += 4; + if (op == SpvOpVectorShuffleCompact) len += 4; + if (op == SpvOpDecorate) len += 2; + if (op == SpvOpLoad) len += 3; + if (op == SpvOpAccessChain) len += 3; + return len; +} + + +// Shuffling bits of length + opcode to be more compact in varint encoding in typical cases: +// 0x LLLL OOOO is how SPIR-V encodes it (L=length, O=op), we shuffle into: +// 0x LLLO OOLO, so that common case (op<16, len<8) is encoded into one byte. + +static bool smolv_WriteLengthOp(smolv::ByteArray& arr, uint32_t len, SpvOp op) +{ + len = smolv_EncodeLen(op, len); + // SPIR-V length field is 16 bits; if we get a larger value that means something + // was wrong, e.g. 
a vector shuffle instruction with less than 4 words (and our + // adjustment to common lengths in smolv_EncodeLen wrapped around) + if (len > 0xFFFF) + return false; + op = smolv_RemapOp(op); + uint32_t oplen = ((len >> 4) << 20) | ((op >> 4) << 8) | ((len & 0xF) << 4) | (op & 0xF); + smolv_WriteVarint(arr, oplen); + return true; +} + +static bool smolv_ReadLengthOp(const uint8_t*& data, const uint8_t* dataEnd, uint32_t& outLen, SpvOp& outOp) +{ + uint32_t val; + if (!smolv_ReadVarint(data, dataEnd, val)) + return false; + outLen = ((val >> 20) << 4) | ((val >> 4) & 0xF); + outOp = (SpvOp)(((val >> 4) & 0xFFF0) | (val & 0xF)); + + outOp = smolv_RemapOp(outOp); + outLen = smolv_DecodeLen(outOp, outLen); + return true; +} + + + +#define _SMOLV_READ_OP(len, words, op) \ + uint32_t len = words[0] >> 16; \ + if (len < 1) return false; /* malformed instruction, length needs to be at least 1 */ \ + if (words + len > wordsEnd) return false; /* malformed instruction, goes past end of data */ \ + SpvOp op = (SpvOp)(words[0] & 0xFFFF) + + +bool smolv::Encode(const void* spirvData, size_t spirvSize, ByteArray& outSmolv, uint32_t flags, StripOpNameFilterFunc stripFilter) +{ + const size_t wordCount = spirvSize / 4; + if (wordCount * 4 != spirvSize) + return false; + const uint32_t* words = (const uint32_t*)spirvData; + const uint32_t* wordsEnd = words + wordCount; + if (!smolv_CheckSpirVHeader(words, wordCount)) + return false; + + // reserve space in output (typical compression is to about 30%; reserve half of input space) + outSmolv.reserve(outSmolv.size() + spirvSize/2); + + // header (matches SPIR-V one, except different magic) + smolv_Write4(outSmolv, kSmolHeaderMagic); + smolv_Write4(outSmolv, (words[1] & 0x00FFFFFF) + (kSmolCurrEncodingVersion<<24)); // SPIR-V version (_XXX) + SMOL-V version (X___) + smolv_Write4(outSmolv, words[2]); // generator + smolv_Write4(outSmolv, words[3]); // bound + smolv_Write4(outSmolv, words[4]); // schema + + const size_t headerSpirvSizeOffset = outSmolv.size(); // size field may get updated later if stripping is enabled + smolv_Write4(outSmolv, (uint32_t)spirvSize); // space needed to decode (i.e. original SPIR-V size) + + size_t strippedSpirvWordCount = wordCount; + uint32_t prevResult = 0; + uint32_t prevDecorate = 0; + + const int knownOpsCount = smolv_GetKnownOpsCount(kSmolCurrEncodingVersion); + + words += 5; + while (words < wordsEnd) + { + _SMOLV_READ_OP(instrLen, words, op); + + if ((flags & kEncodeFlagStripDebugInfo) && smolv_OpDebugInfo(op, knownOpsCount)) + { + if (!stripFilter || op != SpvOpName || !stripFilter(reinterpret_cast<const char*>(&words[2]))) + { + strippedSpirvWordCount -= instrLen; + words += instrLen; + continue; + } + } + + // A usual case of vector shuffle, with less than 4 components, each with a value + // in [0..3] range: encode it in a more compact form, with the swizzle pattern in one byte. + // Turn this into a VectorShuffleCompact instruction, that takes up unused slot in Ops. + uint32_t swizzle = 0; + if (op == SpvOpVectorShuffle && instrLen <= 9) + { + uint32_t swz0 = instrLen > 5 ? words[5] : 0; + uint32_t swz1 = instrLen > 6 ? words[6] : 0; + uint32_t swz2 = instrLen > 7 ? words[7] : 0; + uint32_t swz3 = instrLen > 8 ? 
words[8] : 0; + if (swz0 < 4 && swz1 < 4 && swz2 < 4 && swz3 < 4) + { + op = SpvOpVectorShuffleCompact; + swizzle = (swz0 << 6) | (swz1 << 4) | (swz2 << 2) | (swz3); + } + } + + // length + opcode + if (!smolv_WriteLengthOp(outSmolv, instrLen, op)) + return false; + + size_t ioffs = 1; + // write type as varint, if we have it + if (smolv_OpHasType(op, knownOpsCount)) + { + if (ioffs >= instrLen) + return false; + smolv_WriteVarint(outSmolv, words[ioffs]); + ioffs++; + } + // write result as delta+zig+varint, if we have it + if (smolv_OpHasResult(op, knownOpsCount)) + { + if (ioffs >= instrLen) + return false; + uint32_t v = words[ioffs]; + smolv_WriteVarint(outSmolv, smolv_ZigEncode(v - prevResult)); // some deltas are negative, use zig + prevResult = v; + ioffs++; + } + + // Decorate & MemberDecorate: IDs relative to previous decorate + if (op == SpvOpDecorate || op == SpvOpMemberDecorate) + { + if (ioffs >= instrLen) + return false; + uint32_t v = words[ioffs]; + smolv_WriteVarint(outSmolv, smolv_ZigEncode(v - prevDecorate)); // spirv-remapped deltas often negative, use zig + prevDecorate = v; + ioffs++; + } + + // MemberDecorate special encoding: whole row of MemberDecorate instructions is often referring + // to the same type and linearly increasing member indices. Scan ahead to see how many we have, + // and encode whole bunch as one. + if (op == SpvOpMemberDecorate) + { + // scan ahead until we reach end, non-member-decoration or different type + const uint32_t decorationType = words[ioffs-1]; + const uint32_t* memberWords = words; + uint32_t prevIndex = 0; + uint32_t prevOffset = 0; + // write a byte on how many we have encoded as a bunch + size_t countLocation = outSmolv.size(); + outSmolv.push_back(0); + int count = 0; + while (memberWords < wordsEnd && count < 255) + { + _SMOLV_READ_OP(memberLen, memberWords, memberOp); + if (memberOp != SpvOpMemberDecorate) + break; + if (memberLen < 4) + return false; // invalid input + if (memberWords[1] != decorationType) + break; + + // write member index as delta from previous + uint32_t memberIndex = memberWords[2]; + smolv_WriteVarint(outSmolv, memberIndex - prevIndex); + prevIndex = memberIndex; + + // decoration (and length if not common/known) + uint32_t memberDec = memberWords[3]; + smolv_WriteVarint(outSmolv, memberDec); + const int knownExtraOps = smolv_DecorationExtraOps(memberDec); + if (knownExtraOps == -1) + smolv_WriteVarint(outSmolv, memberLen-4); + else if (unsigned(knownExtraOps) + 4 != memberLen) + return false; // invalid input + + // Offset decorations are most often linearly increasing, so encode as deltas + if (memberDec == 35) // Offset + { + if (memberLen != 5) + return false; + smolv_WriteVarint(outSmolv, memberWords[4]-prevOffset); + prevOffset = memberWords[4]; + } + else + { + // write rest of decorations as varint + for (uint32_t i = 4; i < memberLen; ++i) + smolv_WriteVarint(outSmolv, memberWords[i]); + } + + memberWords += memberLen; + ++count; + } + outSmolv[countLocation] = uint8_t(count); + words = memberWords; + continue; + } + + // Write out this many IDs, encoding them relative+zigzag to result ID + int relativeCount = smolv_OpDeltaFromResult(op, knownOpsCount); + for (int i = 0; i < relativeCount && ioffs < instrLen; ++i, ++ioffs) + { + if (ioffs >= instrLen) + return false; + uint32_t delta = prevResult - words[ioffs]; + // some deltas are negative (often on branches, or if program was processed by spirv-remap), + // so use zig encoding + smolv_WriteVarint(outSmolv, smolv_ZigEncode(delta)); + } + + if 
(op == SpvOpVectorShuffleCompact) + { + // compact vector shuffle, just write out single swizzle byte + outSmolv.push_back(uint8_t(swizzle)); + ioffs = instrLen; + } + else if (smolv_OpVarRest(op, knownOpsCount)) + { + // write out rest of words with variable encoding (expected to be small integers) + for (; ioffs < instrLen; ++ioffs) + smolv_WriteVarint(outSmolv, words[ioffs]); + } + else + { + // write out rest of words without any encoding + for (; ioffs < instrLen; ++ioffs) + smolv_Write4(outSmolv, words[ioffs]); + } + + words += instrLen; + } + + if (strippedSpirvWordCount != wordCount) + { + uint8_t* headerSpirvSize = &outSmolv[headerSpirvSizeOffset]; + smolv_Write4(headerSpirvSize, (uint32_t)strippedSpirvWordCount * 4); + } + + return true; +} + + +size_t smolv::GetDecodedBufferSize(const void* smolvData, size_t smolvSize) +{ + if (!smolv_CheckSmolHeader((const uint8_t*)smolvData, smolvSize)) + return 0; + const uint32_t* words = (const uint32_t*)smolvData; + return words[5]; +} + + +bool smolv::Decode(const void* smolvData, size_t smolvSize, void* spirvOutputBuffer, size_t spirvOutputBufferSize, uint32_t flags) +{ + // check header, and whether we have enough output buffer space + const size_t neededBufferSize = GetDecodedBufferSize(smolvData, smolvSize); + if (neededBufferSize == 0) + return false; // invalid SMOL-V + if (spirvOutputBufferSize < neededBufferSize) + return false; // not enough space in output buffer + if (spirvOutputBuffer == NULL) + return false; // output buffer is null + + const uint8_t* bytes = (const uint8_t*)smolvData; + const uint8_t* bytesEnd = bytes + smolvSize; + + uint8_t* outSpirv = (uint8_t*)spirvOutputBuffer; + + uint32_t val; + int smolVersion = 0; + + // header + smolv_Write4(outSpirv, kSpirVHeaderMagic); bytes += 4; + smolv_Read4(bytes, bytesEnd, val); smolVersion = val >> 24; val &= 0x00FFFFFF; smolv_Write4(outSpirv, val); // version + smolv_Read4(bytes, bytesEnd, val); smolv_Write4(outSpirv, val); // generator + smolv_Read4(bytes, bytesEnd, val); smolv_Write4(outSpirv, val); // bound + smolv_Read4(bytes, bytesEnd, val); smolv_Write4(outSpirv, val); // schema + bytes += 4; // decode buffer size + + // there are two SMOL-V encoding versions, both not indicating anything in their header version field: + // one that is called "before zero" here (2016-08-31 code). Support decoding that one only by presence + // of this special flag. 
+ const bool beforeZeroVersion = smolVersion == 0 && (flags & kDecodeFlagUse20160831AsZeroVersion) != 0; + + const int knownOpsCount = smolv_GetKnownOpsCount(smolVersion); + + uint32_t prevResult = 0; + uint32_t prevDecorate = 0; + + while (bytes < bytesEnd) + { + // read length + opcode + uint32_t instrLen; + SpvOp op; + if (!smolv_ReadLengthOp(bytes, bytesEnd, instrLen, op)) + return false; + const bool wasSwizzle = (op == SpvOpVectorShuffleCompact); + if (wasSwizzle) + op = SpvOpVectorShuffle; + smolv_Write4(outSpirv, (instrLen << 16) | op); + + size_t ioffs = 1; + + // read type as varint, if we have it + if (smolv_OpHasType(op, knownOpsCount)) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + smolv_Write4(outSpirv, val); + ioffs++; + } + // read result as delta+varint, if we have it + if (smolv_OpHasResult(op, knownOpsCount)) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + val = prevResult + smolv_ZigDecode(val); + smolv_Write4(outSpirv, val); + prevResult = val; + ioffs++; + } + + // Decorate: IDs relative to previous decorate + if (op == SpvOpDecorate || op == SpvOpMemberDecorate) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + // "before zero" version did not use zig encoding for the value + val = prevDecorate + (beforeZeroVersion ? val : smolv_ZigDecode(val)); + smolv_Write4(outSpirv, val); + prevDecorate = val; + ioffs++; + } + + // MemberDecorate special decoding + if (op == SpvOpMemberDecorate && !beforeZeroVersion) + { + if (bytes >= bytesEnd) + return false; // broken input + int count = *bytes++; + int prevIndex = 0; + int prevOffset = 0; + for (int m = 0; m < count; ++m) + { + // read member index + uint32_t memberIndex; + if (!smolv_ReadVarint(bytes, bytesEnd, memberIndex)) return false; + memberIndex += prevIndex; + prevIndex = memberIndex; + + // decoration (and length if not common/known) + uint32_t memberDec; + if (!smolv_ReadVarint(bytes, bytesEnd, memberDec)) return false; + const int knownExtraOps = smolv_DecorationExtraOps(memberDec); + uint32_t memberLen; + if (knownExtraOps == -1) + { + if (!smolv_ReadVarint(bytes, bytesEnd, memberLen)) return false; + memberLen += 4; + } + else + memberLen = 4 + knownExtraOps; + + // write SPIR-V op+length (unless it's first member decoration, in which case it was written before) + if (m != 0) + { + smolv_Write4(outSpirv, (memberLen << 16) | op); + smolv_Write4(outSpirv, prevDecorate); + } + smolv_Write4(outSpirv, memberIndex); + smolv_Write4(outSpirv, memberDec); + // Special case for Offset decorations + if (memberDec == 35) // Offset + { + if (memberLen != 5) + return false; + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + val += prevOffset; + smolv_Write4(outSpirv, val); + prevOffset = val; + } + else + { + for (uint32_t i = 4; i < memberLen; ++i) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + smolv_Write4(outSpirv, val); + } + } + } + continue; + } + + // Read this many IDs, that are relative to result ID + int relativeCount = smolv_OpDeltaFromResult(op, knownOpsCount); + // "before zero" version only used zig encoding for IDs of several ops; after + // that ops got zig encoding for their IDs + bool zigDecodeVals = true; + if (beforeZeroVersion) + { + if (op != SpvOpControlBarrier && op != SpvOpMemoryBarrier && op != SpvOpLoopMerge && op != SpvOpSelectionMerge && op != SpvOpBranch && op != SpvOpBranchConditional && op != SpvOpMemoryNamedBarrier) + zigDecodeVals = false; + } + for (int i = 0; i < relativeCount && ioffs < instrLen; ++i, 
++ioffs) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + if (zigDecodeVals) + val = smolv_ZigDecode(val); + smolv_Write4(outSpirv, prevResult - val); + } + + if (wasSwizzle && instrLen <= 9) + { + uint32_t swizzle = *bytes++; + if (instrLen > 5) smolv_Write4(outSpirv, (swizzle >> 6) & 3); + if (instrLen > 6) smolv_Write4(outSpirv, (swizzle >> 4) & 3); + if (instrLen > 7) smolv_Write4(outSpirv, (swizzle >> 2) & 3); + if (instrLen > 8) smolv_Write4(outSpirv, swizzle & 3); + } + else if (smolv_OpVarRest(op, knownOpsCount)) + { + // read rest of words with variable encoding + for (; ioffs < instrLen; ++ioffs) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + smolv_Write4(outSpirv, val); + } + } + else + { + // read rest of words without any encoding + for (; ioffs < instrLen; ++ioffs) + { + if (!smolv_Read4(bytes, bytesEnd, val)) return false; + smolv_Write4(outSpirv, val); + } + } + } + + if ((uint8_t*)spirvOutputBuffer + neededBufferSize != outSpirv) + return false; // something went wrong during decoding? we should have decoded to exact output size + + return true; +} + + + +// -------------------------------------------------------------------------------------------- +// Calculating instruction count / space stats on SPIR-V and SMOL-V + + +struct smolv::Stats +{ + Stats() { memset(this, 0, sizeof(*this)); } + size_t opCounts[kKnownOpsCount]; + size_t opSizes[kKnownOpsCount]; + size_t smolOpSizes[kKnownOpsCount]; + size_t varintCountsOp[6]; + size_t varintCountsType[6]; + size_t varintCountsRes[6]; + size_t varintCountsOther[6]; + size_t totalOps; + size_t totalSize; + size_t totalSizeSmol; + size_t inputCount; +}; + + +smolv::Stats* smolv::StatsCreate() +{ + return new Stats(); +} + +void smolv::StatsDelete(smolv::Stats *s) +{ + delete s; +} + + +bool smolv::StatsCalculate(smolv::Stats* stats, const void* spirvData, size_t spirvSize) +{ + if (!stats) + return false; + + const size_t wordCount = spirvSize / 4; + if (wordCount * 4 != spirvSize) + return false; + const uint32_t* words = (const uint32_t*)spirvData; + const uint32_t* wordsEnd = words + wordCount; + if (!smolv_CheckSpirVHeader(words, wordCount)) + return false; + words += 5; + + stats->inputCount++; + stats->totalSize += wordCount; + + while (words < wordsEnd) + { + _SMOLV_READ_OP(instrLen, words, op); + + if (op < kKnownOpsCount) + { + stats->opCounts[op]++; + stats->opSizes[op] += instrLen; + } + words += instrLen; + stats->totalOps++; + } + + return true; +} + + +bool smolv::StatsCalculateSmol(smolv::Stats* stats, const void* smolvData, size_t smolvSize) +{ + if (!stats) + return false; + + // debugging helper to dump all encoded bytes to stdout, keep at "if 0" +# if 0 +# define _SMOLV_DEBUG_PRINT_ENCODED_BYTES() { \ + printf("Op %-22s ", op < kKnownOpsCount ? 
kSpirvOpNames[op] : "???"); \ + for (const uint8_t* b = instrBegin; b < bytes; ++b) \ + printf("%02x ", *b); \ + printf("\n"); \ + } +# else +# define _SMOLV_DEBUG_PRINT_ENCODED_BYTES() {} +# endif + + const uint8_t* bytes = (const uint8_t*)smolvData; + const uint8_t* bytesEnd = bytes + smolvSize; + if (!smolv_CheckSmolHeader(bytes, smolvSize)) + return false; + + uint32_t val; + int smolVersion; + bytes += 4; + smolv_Read4(bytes, bytesEnd, val); smolVersion = val >> 24; + const int knownOpsCount = smolv_GetKnownOpsCount(smolVersion); + bytes += 16; + + stats->totalSizeSmol += smolvSize; + + while (bytes < bytesEnd) + { + const uint8_t* instrBegin = bytes; + const uint8_t* varBegin; + + // read length + opcode + uint32_t instrLen; + SpvOp op; + varBegin = bytes; + if (!smolv_ReadLengthOp(bytes, bytesEnd, instrLen, op)) + return false; + const bool wasSwizzle = (op == SpvOpVectorShuffleCompact); + if (wasSwizzle) + op = SpvOpVectorShuffle; + stats->varintCountsOp[bytes-varBegin]++; + + size_t ioffs = 1; + if (smolv_OpHasType(op, knownOpsCount)) + { + varBegin = bytes; + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + stats->varintCountsType[bytes-varBegin]++; + ioffs++; + } + if (smolv_OpHasResult(op, knownOpsCount)) + { + varBegin = bytes; + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + stats->varintCountsRes[bytes-varBegin]++; + ioffs++; + } + + if (op == SpvOpDecorate || op == SpvOpMemberDecorate) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + ioffs++; + } + // MemberDecorate special decoding + if (op == SpvOpMemberDecorate) + { + if (bytes >= bytesEnd) + return false; // broken input + int count = *bytes++; + for (int m = 0; m < count; ++m) + { + uint32_t memberIndex; + if (!smolv_ReadVarint(bytes, bytesEnd, memberIndex)) return false; + uint32_t memberDec; + if (!smolv_ReadVarint(bytes, bytesEnd, memberDec)) return false; + const int knownExtraOps = smolv_DecorationExtraOps(memberDec); + uint32_t memberLen; + if (knownExtraOps == -1) + { + if (!smolv_ReadVarint(bytes, bytesEnd, memberLen)) return false; + memberLen += 4; + } + else + memberLen = 4 + knownExtraOps; + for (uint32_t i = 4; i < memberLen; ++i) + { + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + } + } + stats->smolOpSizes[op] += bytes - instrBegin; + _SMOLV_DEBUG_PRINT_ENCODED_BYTES(); + continue; + } + + int relativeCount = smolv_OpDeltaFromResult(op, knownOpsCount); + for (int i = 0; i < relativeCount && ioffs < instrLen; ++i, ++ioffs) + { + varBegin = bytes; + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + stats->varintCountsRes[bytes-varBegin]++; + } + + if (wasSwizzle && instrLen <= 9) + { + bytes++; + } + else if (smolv_OpVarRest(op, knownOpsCount)) + { + for (; ioffs < instrLen; ++ioffs) + { + varBegin = bytes; + if (!smolv_ReadVarint(bytes, bytesEnd, val)) return false; + stats->varintCountsOther[bytes-varBegin]++; + } + } + else + { + for (; ioffs < instrLen; ++ioffs) + { + if (!smolv_Read4(bytes, bytesEnd, val)) return false; + } + } + + if (op < kKnownOpsCount) + { + stats->smolOpSizes[op] += bytes - instrBegin; + } + _SMOLV_DEBUG_PRINT_ENCODED_BYTES(); + } + + return true; +} + +static bool CompareOpCounters (std::pair<SpvOp,size_t> a, std::pair<SpvOp,size_t> b) +{ + return a.second > b.second; +} + +void smolv::StatsPrint(const Stats* stats) +{ + if (!stats) + return; + + typedef std::pair<SpvOp,size_t> OpCounter; + OpCounter counts[kKnownOpsCount]; + OpCounter sizes[kKnownOpsCount]; + OpCounter sizesSmol[kKnownOpsCount]; + for (int i 
= 0; i < kKnownOpsCount; ++i) + { + counts[i].first = (SpvOp)i; + counts[i].second = stats->opCounts[i]; + sizes[i].first = (SpvOp)i; + sizes[i].second = stats->opSizes[i]; + sizesSmol[i].first = (SpvOp)i; + sizesSmol[i].second = stats->smolOpSizes[i]; + } + std::sort(counts, counts + kKnownOpsCount, CompareOpCounters); + std::sort(sizes, sizes + kKnownOpsCount, CompareOpCounters); + std::sort(sizesSmol, sizesSmol + kKnownOpsCount, CompareOpCounters); + + printf("Stats for %i SPIR-V inputs, total size %i words (%.1fKB):\n", (int)stats->inputCount, (int)stats->totalSize, stats->totalSize * 4.0f / 1024.0f); + printf("Most occuring ops:\n"); + for (int i = 0; i < 30; ++i) + { + SpvOp op = counts[i].first; + printf(" #%2i: %4i %-20s %4i (%4.1f%%)\n", i, op, kSpirvOpNames[op], (int)counts[i].second, (float)counts[i].second / (float)stats->totalOps * 100.0f); + } + printf("Largest total size of ops:\n"); + for (int i = 0; i < 30; ++i) + { + SpvOp op = sizes[i].first; + printf(" #%2i: %-22s %6i (%4.1f%%) avg len %.1f\n", + i, + kSpirvOpNames[op], + (int)sizes[i].second*4, + (float)sizes[i].second / (float)stats->totalSize * 100.0f, + (float)sizes[i].second*4 / (float)stats->opCounts[op] + ); + } + printf("SMOL varint encoding counts per byte length:\n"); + printf(" B: %6s %6s %6s %6s\n", "Op", "Type", "Result", "Other"); + for (int i = 1; i < 6; ++i) + { + printf(" %i: %6i %6i %6i %6i\n", i, (int)stats->varintCountsOp[i], (int)stats->varintCountsType[i], (int)stats->varintCountsRes[i], (int)stats->varintCountsOther[i]); + } + printf("Largest total size of ops in SMOL:\n"); + for (int i = 0; i < 30; ++i) + { + SpvOp op = sizesSmol[i].first; + printf(" #%2i: %-22s %6i (%4.1f%%) avg len %.1f\n", + i, + kSpirvOpNames[op], + (int)sizesSmol[i].second, + (float)sizesSmol[i].second / (float)stats->totalSizeSmol * 100.0f, + (float)sizesSmol[i].second / (float)stats->opCounts[op] + ); + } +} + + +// ------------------------------------------------------------------------------ +// This software is available under 2 licenses -- choose whichever you prefer. +// ------------------------------------------------------------------------------ +// ALTERNATIVE A - MIT License +// Copyright (c) 2016-2020 Aras Pranckevicius +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// ------------------------------------------------------------------------------ +// ALTERNATIVE B - Public Domain (www.unlicense.org) +// This is free and unencumbered software released into the public domain. 
+// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +// software, either in source code form or as a compiled binary, for any purpose, +// commercial or non-commercial, and by any means. +// In jurisdictions that recognize copyright laws, the author or authors of this +// software dedicate any and all copyright interest in the software to the public +// domain. We make this dedication for the benefit of the public at large and to +// the detriment of our heirs and successors. We intend this dedication to be an +// overt act of relinquishment in perpetuity of all present and future rights to +// this software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// ------------------------------------------------------------------------------ diff --git a/thirdparty/misc/smolv.h b/thirdparty/misc/smolv.h new file mode 100644 index 0000000000..798ee4126f --- /dev/null +++ b/thirdparty/misc/smolv.h @@ -0,0 +1,169 @@ +// smol-v - public domain - https://github.com/aras-p/smol-v +// authored 2016-2020 by Aras Pranckevicius +// no warranty implied; use at your own risk +// See end of file for license information. +// +// +// ### OVERVIEW: +// +// SMOL-V encodes Vulkan/Khronos SPIR-V format programs into a form that is smaller, and is more +// compressible. Normally no changes to the programs are done; they decode +// into exactly same program as what was encoded. Optionally, debug information +// can be removed too. +// +// SPIR-V is a very verbose format, several times larger than same programs expressed in other +// shader formats (e.g. DX11 bytecode, GLSL, DX9 bytecode etc.). The SSA-form with ever increasing +// IDs is not very appreciated by regular data compressors either. SMOL-V does several things +// to improve this: +// - Many words, especially ones that most often have small values, are encoded using +// "varint" scheme (1-5 bytes per word, with just one byte for values in 0..127 range). +// See https://developers.google.com/protocol-buffers/docs/encoding +// - Some IDs used in the program are delta-encoded, relative to previously seen IDs (e.g. Result +// IDs). Often instructions reference things that were computed just before, so this results in +// small deltas. These values are also encoded using "varint" scheme. +// - Reordering instruction opcodes so that the most common ones are the smallest values, for smaller +// varint encoding. +// - Encoding several instructions in a more compact form, e.g. the "typical <=4 component swizzle" +// shape of a VectorShuffle instruction, or sequences of MemberDecorate instructions. +// +// A somewhat similar utility is spirv-remap from glslang, see +// https://github.com/KhronosGroup/glslang/blob/master/README-spirv-remap.txt +// +// +// ### USAGE: +// +// Add source/smolv.h and source/smolv.cpp to your C++ project build. +// Currently it might require C++11 or somesuch; I only tested with Visual Studio 2017/2019, Mac Xcode 11 and Gcc 5.4. +// +// smolv::Encode and smolv::Decode is the basic functionality. 
+// +// Other functions are for development/statistics purposes, to figure out frequencies and +// distributions of the instructions. +// +// There's a test + compression benchmarking suite in testing/testmain.cpp; using that needs adding +// other files under testing/external to the build too (3rd party code: glslang remapper, Zstd, LZ4). +// +// +// ### LIMITATIONS / TODO: +// +// - SPIR-V where the words got stored in big-endian layout is not supported yet. +// - The whole thing might not work on Big-Endian CPUs. It might, but I'm not 100% sure. +// - Not much prevention is done against malformed/corrupted inputs, TODO. +// - Out of memory cases are not handled. The code will either throw exception +// or crash, depending on your compilation flags. + +#pragma once + +#include <stdint.h> +#include <vector> +#include <cstddef> + +namespace smolv +{ + typedef std::vector<uint8_t> ByteArray; + + enum EncodeFlags + { + kEncodeFlagNone = 0, + kEncodeFlagStripDebugInfo = (1<<0), // Strip all optional SPIR-V instructions (debug names etc.) + }; + enum DecodeFlags + { + kDecodeFlagNone = 0, + kDecodeFlagUse20160831AsZeroVersion = (1 << 0), // For "version zero" of SMOL-V encoding, use 2016 08 31 code path (this is what happens to be used by Unity 2017-2020) + }; + + // Preserve *some* OpName debug names. + // Return true to preserve, false to strip. + // This is really only used to implement a workaround for problems with some Vulkan drivers. + typedef bool(*StripOpNameFilterFunc)(const char* name); + + // ------------------------------------------------------------------- + // Encoding / Decoding + + // Encode SPIR-V into SMOL-V. + // + // Resulting data is appended to outSmolv array (the array is not cleared). + // + // flags is bitset of EncodeFlags values. + // + // Returns false on malformed SPIR-V input; if that happens the output array might get + // partial/broken SMOL-V program. + bool Encode(const void* spirvData, size_t spirvSize, ByteArray& outSmolv, uint32_t flags = kEncodeFlagNone, StripOpNameFilterFunc stripFilter = 0); + + + // Decode SMOL-V into SPIR-V. + // + // Resulting data is written into the passed buffer. Get required buffer space with + // GetDecodeBufferSize; this is the size of decoded SPIR-V program. + // + // flags is bitset of DecodeFlags values. + + // Decoding does no memory allocations. + // + // Returns false on malformed input; if that happens the output buffer might be only partially + // written to. + bool Decode(const void* smolvData, size_t smolvSize, void* spirvOutputBuffer, size_t spirvOutputBufferSize, uint32_t flags = kDecodeFlagNone); + + + // Given a SMOL-V program, get size of the decoded SPIR-V program. + // This is the buffer size that Decode expects. + // + // Returns zero on malformed input (just checks the header, not the full input). + size_t GetDecodedBufferSize(const void* smolvData, size_t smolvSize); + + + // ------------------------------------------------------------------- + // Computing instruction statistics on SPIR-V/SMOL-V programs + + struct Stats; + + Stats* StatsCreate(); + void StatsDelete(Stats* s); + + bool StatsCalculate(Stats* stats, const void* spirvData, size_t spirvSize); + bool StatsCalculateSmol(Stats* stats, const void* smolvData, size_t smolvSize); + void StatsPrint(const Stats* stats); + +} // namespace smolv + + +// ------------------------------------------------------------------------------ +// This software is available under 2 licenses -- choose whichever you prefer. 
+// ------------------------------------------------------------------------------ +// ALTERNATIVE A - MIT License +// Copyright (c) 2016-2020 Aras Pranckevicius +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// ------------------------------------------------------------------------------ +// ALTERNATIVE B - Public Domain (www.unlicense.org) +// This is free and unencumbered software released into the public domain. +// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +// software, either in source code form or as a compiled binary, for any purpose, +// commercial or non-commercial, and by any means. +// In jurisdictions that recognize copyright laws, the author or authors of this +// software dedicate any and all copyright interest in the software to the public +// domain. We make this dedication for the benefit of the public at large and to +// the detriment of our heirs and successors. We intend this dedication to be an +// overt act of relinquishment in perpetuity of all present and future rights to +// this software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// ------------------------------------------------------------------------------ diff --git a/thirdparty/oidn/core/transfer_function.cpp b/thirdparty/oidn/core/transfer_function.cpp index 487f0a9f75..ce5deca56b 100644 --- a/thirdparty/oidn/core/transfer_function.cpp +++ b/thirdparty/oidn/core/transfer_function.cpp @@ -24,10 +24,6 @@ namespace oidn { float AutoexposureNode::autoexposure(const Image& color) { assert(color.format == Format::Float3); -// -- GODOT start -- -// We don't want to mess with TTB and we don't use autoexposure, so we disable this code -#if 0 -// -- GODOT end -- constexpr float key = 0.18f; constexpr float eps = 1e-8f; @@ -42,61 +38,66 @@ namespace oidn { // Compute the average log luminance of the downsampled image using Sum = std::pair<float, int>; - Sum sum = - tbb::parallel_reduce( - tbb::blocked_range2d<int>(0, HK, 0, WK), - Sum(0.f, 0), - [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum + // -- GODOT start -- + // Sum sum = + // tbb::parallel_reduce( + // tbb::blocked_range2d<int>(0, HK, 0, WK), + // Sum(0.f, 0), + // [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum + // { + // // Iterate over blocks + // for (int i = r.rows().begin(); i != r.rows().end(); ++i) + // { + // for (int j = r.cols().begin(); j != r.cols().end(); ++j) + // { + + Sum sum = Sum(0.0f, 0); + + for (int i = 0; i != HK; ++i) + { + for (int j = 0; j != WK; ++j) + { + // Compute the average luminance in the current block + const int beginH = int(ptrdiff_t(i) * H / HK); + const int beginW = int(ptrdiff_t(j) * W / WK); + const int endH = int(ptrdiff_t(i+1) * H / HK); + const int endW = int(ptrdiff_t(j+1) * W / WK); + + float L = 0.f; + + for (int h = beginH; h < endH; ++h) { - // Iterate over blocks - for (int i = r.rows().begin(); i != r.rows().end(); ++i) + for (int w = beginW; w < endW; ++w) { - for (int j = r.cols().begin(); j != r.cols().end(); ++j) - { - // Compute the average luminance in the current block - const int beginH = int(ptrdiff_t(i) * H / HK); - const int beginW = int(ptrdiff_t(j) * W / WK); - const int endH = int(ptrdiff_t(i+1) * H / HK); - const int endW = int(ptrdiff_t(j+1) * W / WK); - - float L = 0.f; - - for (int h = beginH; h < endH; ++h) - { - for (int w = beginW; w < endW; ++w) - { - const float* rgb = (const float*)color.get(h, w); - - const float r = maxSafe(rgb[0], 0.f); - const float g = maxSafe(rgb[1], 0.f); - const float b = maxSafe(rgb[2], 0.f); - - L += luminance(r, g, b); - } - } - - L /= (endH - beginH) * (endW - beginW); - - // Accumulate the log luminance - if (L > eps) - { - sum.first += log2(L); - sum.second++; - } - } + const float* rgb = (const float*)color.get(h, w); + + const float r = maxSafe(rgb[0], 0.f); + const float g = maxSafe(rgb[1], 0.f); + const float b = maxSafe(rgb[2], 0.f); + + L += luminance(r, g, b); } + } - return sum; - }, - [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); }, - tbb::static_partitioner() - ); + L /= (endH - beginH) * (endW - beginW); + + // Accumulate the log luminance + if (L > eps) + { + sum.first += log2(L); + sum.second++; + } + } + } + + // return sum; + // }, + // [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); }, + // tbb::static_partitioner() + // ); + // -- GODOT end -- return (sum.second > 0) ? 
(key / exp2(sum.first / float(sum.second))) : 1.f; -// -- GODOT start -- -#endif - return 1.0; -// -- GODOT end -- } } // namespace oidn diff --git a/thirdparty/oidn/patches/godot-changes-c58c5216.patch b/thirdparty/oidn/patches/godot-changes-c58c5216.patch index 6a54703064..c01f00187b 100644 --- a/thirdparty/oidn/patches/godot-changes-c58c5216.patch +++ b/thirdparty/oidn/patches/godot-changes-c58c5216.patch @@ -280,28 +280,58 @@ index 8c2de09..ed8328c 100644 namespace oidn { diff --git a/core/transfer_function.cpp b/core/transfer_function.cpp -index 601f814..487f0a9 100644 +index 601f814..ce5deca 100644 --- a/core/transfer_function.cpp +++ b/core/transfer_function.cpp -@@ -24,6 +24,10 @@ namespace oidn { - float AutoexposureNode::autoexposure(const Image& color) - { - assert(color.format == Format::Float3); -+// -- GODOT start -- -+// We don't want to mess with TTB and we don't use autoexposure, so we disable this code -+#if 0 -+// -- GODOT end -- +@@ -38,16 +38,24 @@ namespace oidn { + // Compute the average log luminance of the downsampled image + using Sum = std::pair<float, int>; + +- Sum sum = +- tbb::parallel_reduce( +- tbb::blocked_range2d<int>(0, HK, 0, WK), +- Sum(0.f, 0), +- [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum ++ // -- GODOT start -- ++ // Sum sum = ++ // tbb::parallel_reduce( ++ // tbb::blocked_range2d<int>(0, HK, 0, WK), ++ // Sum(0.f, 0), ++ // [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum ++ // { ++ // // Iterate over blocks ++ // for (int i = r.rows().begin(); i != r.rows().end(); ++i) ++ // { ++ // for (int j = r.cols().begin(); j != r.cols().end(); ++j) ++ // { ++ ++ Sum sum = Sum(0.0f, 0); ++ ++ for (int i = 0; i != HK; ++i) + { +- // Iterate over blocks +- for (int i = r.rows().begin(); i != r.rows().end(); ++i) +- { +- for (int j = r.cols().begin(); j != r.cols().end(); ++j) ++ for (int j = 0; j != WK; ++j) + { + // Compute the average luminance in the current block + const int beginH = int(ptrdiff_t(i) * H / HK); +@@ -82,11 +90,12 @@ namespace oidn { + } + } - constexpr float key = 0.18f; - constexpr float eps = 1e-8f; -@@ -89,6 +93,10 @@ namespace oidn { - ); +- return sum; +- }, +- [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); }, +- tbb::static_partitioner() +- ); ++ // return sum; ++ // }, ++ // [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); }, ++ // tbb::static_partitioner() ++ // ); ++ // -- GODOT end -- return (sum.second > 0) ? (key / exp2(sum.first / float(sum.second))) : 1.f; -+// -- GODOT start -- -+#endif -+ return 1.0; -+// -- GODOT end -- } - - } // namespace oidn |